In [None]:
import pandas as pd
import numpy as np

import time

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import xgboost as xgb

In [None]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [None]:
# Load the training table and index it by the `tconst` column.
df = pd.read_csv("completed_train_df1.csv").set_index("tconst")

# Gaps in the two regional gross columns are filled with that column's own
# 25th percentile.
df = df.assign(
    Domestic=lambda frame: frame["Domestic"].fillna(frame["Domestic"].quantile(0.25)),
    Foreign=lambda frame: frame["Foreign"].fillna(frame["Foreign"].quantile(0.25)),
)

# A missing worldwide gross becomes the sum of the (now complete) regional ones.
df = df.assign(
    Worldwide=lambda frame: frame["Worldwide"].fillna(frame["Domestic"] + frame["Foreign"])
)

# Everything still missing gets its column's median, where the median is taken
# over the data with NaNs counted as 0.
df = df.fillna(df.fillna(0).median())

In [None]:
class dataloader(Dataset):
    """Tensor-backed dataset built from a frame that has a ``label`` column.

    Rows are split by position: ``kind="train"`` keeps the first ``split``
    rows, ``kind="eval"`` keeps everything after them, and any other ``kind``
    keeps the whole frame.

    Args:
        df: DataFrame whose ``label`` column is the target. Every other column
            is used as a feature and is cast to float.
        kind: ``"train"``, ``"eval"``, or any other value for all rows.
        split: Row position separating the train rows from the eval rows.
            Defaults to 6000, the value that used to be hard-coded.

    Attributes are named ``x_train`` / ``y_train`` regardless of ``kind``.
    """

    def __init__(self, df, kind="train", split=6000):
        features = df.drop(["label"], axis=1).values.astype(float)
        labels = df["label"].values

        # Pick the row window once and apply it to features and labels alike.
        if kind == "train":
            rows = slice(None, split)
        elif kind == "eval":
            rows = slice(split, None)
        else:
            rows = slice(None)

        self.x_train = torch.tensor(features[rows], dtype=torch.float32)
        self.y_train = torch.tensor(labels[rows], dtype=torch.long)

    def __len__(self):
        """Number of rows kept for this split."""
        return len(self.y_train)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.x_train[idx], self.y_train[idx]

In [None]:
# Positional train / eval split of the same imputed frame.
data_train = dataloader(df, "train")
data_eval = dataloader(df, "eval")

# Reshuffle the training rows every epoch so the mini-batches differ between
# epochs; evaluation order does not affect the metrics, so it stays fixed.
train_loader = DataLoader(data_train, batch_size=256, shuffle=True)
val_loader = DataLoader(data_eval, batch_size=256)

In [None]:
def train_loop(trainloader, valloader, model, criterion, optimizer, num_epochs=500):
    """Train ``model`` and print accuracy on both loaders.

    After every epoch the model is scored on ``valloader``; every 100th epoch
    (starting with epoch 0) it is also scored on ``trainloader``. The final
    summary reports accuracies measured after the last epoch.

    Args:
        trainloader: Iterable of ``(features, label)`` batches used for fitting.
        valloader: Iterable of ``(features, label)`` batches used for evaluation.
        model: Module to train; it is moved to the module-level ``device``.
        criterion: Loss called as ``criterion(pred, label)``.
        optimizer: Optimizer already bound to the model's parameters.
        num_epochs: Number of passes over ``trainloader``. Defaults to 500,
            the value that used to be hard-coded.

    Returns:
        None. Progress and the final accuracies are printed.
    """
    # Train on GPU if available
    model = model.to(device)

    # Stay defined even when num_epochs == 0, so the summary print cannot fail.
    current_acc = None
    current_acc_t = None

    for epoch in range(num_epochs):
        # Re-enter training mode every epoch in case evaluation switched it off.
        model.train()

        for movie, label in trainloader:
            # Compute prediction and loss
            movie, label = movie.to(device), label.to(device)
            pred = model(movie)
            loss = criterion(pred, label)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Score on the evaluation data after every epoch.
        current_acc, avg_loss = test_loop(valloader, model, criterion, epoch, kind="eval")
        if epoch % 100 == 0:
            current_acc_t, avg_loss_t = test_loop(trainloader, model, criterion, epoch, kind="train")
            # Presumably a pause so the progress line (printed with "\r") stays
            # readable before the next epoch overwrites it.
            time.sleep(2.5)

    # The periodic train score above is only refreshed every 100 epochs; score
    # once more so the "final" train accuracy really belongs to the last epoch.
    last_epoch = num_epochs - 1
    if num_epochs > 0 and last_epoch % 100 != 0:
        current_acc_t, avg_loss_t = test_loop(trainloader, model, criterion, last_epoch, kind="train")

    print(f"\n\n Final accuracy (eval): {current_acc}\nFinal accuracy (train): {current_acc_t}\n")

def test_loop(dataloader, model, criterion, epoch, kind="eval"):
    """Score ``model`` on ``dataloader`` and print a one-line summary.

    The model is put into evaluation mode for the duration of the call (so
    layers such as dropout are inactive) and its previous mode is restored
    afterwards.

    Args:
        dataloader: Loader yielding ``(features, label)`` batches; must expose
            ``.dataset`` and ``len()``.
        model: Module to evaluate; batches are moved to the module-level
            ``device`` before the forward pass.
        criterion: Loss called as ``criterion(pred, label)``.
        epoch: Epoch number, used only in the printed line.
        kind: Tag for the printed line (e.g. ``"eval"`` or ``"train"``).

    Returns:
        ``(accuracy_percent, average_batch_loss)``. Both are 0 for an empty
        loader.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Remember the current mode so a training loop calling us is not disturbed.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for movie, label in dataloader:
                movie, label = movie.to(device), label.to(device)
                pred = model(movie)
                test_loss += criterion(pred, label).item()
                correct += (pred.argmax(1) == label).type(torch.float).sum().item()
    finally:
        model.train(was_training)

    # Guard the averages against an empty loader (no batches / no samples).
    test_loss /= max(num_batches, 1)
    correct /= max(size, 1)
    print(f"Epoch: {epoch}, {kind} Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f}", end="\r")
    return (100 * correct), test_loss

In [None]:
class imdbModel(nn.Module):
    """Fully connected binary classifier that outputs log-probabilities.

    Architecture: ``Linear(input_size, hidden_size)`` + ReLU, then the same
    hidden ``Linear(hidden_size, hidden_size)`` + ReLU applied ``num_layers``
    times, then ``Linear(hidden_size, 2)`` + ``LogSoftmax``.

    NOTE(review): ``fc_mid`` is a single layer reused on every pass of the
    loop, so all hidden "layers" share one set of weights. This is kept as-is
    to preserve the parameter layout; confirm whether independent layers were
    intended.

    Args:
        num_layers: How many times the hidden layer is applied.
        hidden_size: Width of the hidden representation.
        drop_out: When true, dropout (p=0.3) is applied before each hidden pass.
        input_size: Number of input features. Defaults to 171, the value that
            used to be hard-coded.
    """

    def __init__(self, num_layers, hidden_size, drop_out=False, input_size=171):
        super().__init__()

        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc_mid = nn.Linear(hidden_size, hidden_size)
        self.fc_final = nn.Linear(hidden_size, 2)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()
        self.num_layers = num_layers
        # Stored on the instance: forward() previously read an undefined
        # bare name `drop_out` instead of this flag.
        self.drop_out = drop_out
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Map a ``(batch, input_size)`` tensor to ``(batch, 2)`` log-probabilities."""
        x = self.fc1(x)
        x = self.relu(x)

        for _ in range(self.num_layers):
            if self.drop_out:
                x = self.dropout(x)
            x = self.fc_mid(x)
            x = self.relu(x)

        x = self.fc_final(x)

        return self.logsoftmax(x)

In [None]:
# for num_layers in [10]:
#     for hidden_size in [1000]:
#         for drop_out in [False, True]:
#             model = imdbModel(num_layers, hidden_size, drop_out=drop_out)

#             optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
#             criterion = torch.nn.CrossEntropyLoss()

#             train_loop(train_loader, val_loader, model, criterion, optimizer)

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Hold out 10% of the imputed frame, stratified on the label; the fixed seed
# keeps the split repeatable.
train_df, valid_df = train_test_split(
    df,
    train_size=0.9,
    stratify=df["label"],
    shuffle=True,
    random_state=42,
)

In [None]:
# Cast the hold-out labels to int on a fresh frame: assigning into the frame
# returned by the split can trigger pandas' SettingWithCopyWarning.
valid_df = valid_df.assign(label=valid_df["label"].astype("int"))

# The feature / target views are identical for every configuration, so build
# them once instead of on every iteration of the grid.
train_features = train_df.drop("label", axis=1)
train_labels = train_df["label"]
valid_features = valid_df.drop("label", axis=1)
valid_labels = valid_df["label"]

# NOTE(review): `num_iterations` and `n_estimators` are aliases of the same
# LightGBM setting and both are passed below, so presumably only one of them
# takes effect and the `n_estimators` axis of this grid may be a no-op.
# Confirm against LightGBM's warnings before trusting the comparison.
for num_leaves in [10, 30, 50, 100][::-1]:
    for n_estimators in [1, 10, 100, 200][::-1]:
        for max_depth in [-1, 1, 5, 10]:
            for boosting in ["gbdt", "dart", "goss"]:
                print(f"Current: {num_leaves} leaves, {n_estimators} estimators, {max_depth} max depth, {boosting} boosting", end=": ")

                model_lgbm = lgb.LGBMClassifier(objective='binary',
                                                boosting=boosting,
                                                learning_rate=0.001,
                                                num_iterations=5000,
                                                num_leaves=num_leaves,
                                                n_estimators=n_estimators,
                                                max_depth=max_depth,
                                                # feature_fraction=0.8,
                                                # verbosity=1,
                                                random_state=17,
                                                n_jobs=-1)

                model_lgbm.fit(train_features,
                               train_labels,
                               eval_metric='logloss')

                # Hold-out accuracy completes the "Current: ..." line printed above.
                val_preds = model_lgbm.predict(valid_features)
                print(accuracy_score(y_true=valid_labels, y_pred=val_preds))

In [None]:
# Optimal (from what I found so far) ~ 0.806
# `boosting` used to be read from the grid-search loop variable, so this cell
# only ran after that loop and silently depended on where it stopped. The loop
# ends on "goss", which is therefore pinned here.
# TODO(review): confirm "goss" is the configuration behind the ~0.806 figure.
# NOTE(review): `num_iterations` and `n_estimators` are aliases of the same
# LightGBM setting; presumably only one of the two values below is used.
model_lgbm = lgb.LGBMClassifier(objective='binary',
                                boosting="goss",
                                learning_rate=0.001,
                                num_iterations=5000,
                                num_leaves=50,
                                n_estimators=200,
                                max_depth=10,
                                # feature_fraction=0.8,
                                # verbosity=1,
                                random_state=17,
                                n_jobs=-1)

model_lgbm.fit(train_df.drop("label", axis=1),
               train_df['label'],
               eval_metric='logloss')

In [None]:
# Class predictions of the fitted LightGBM model on the held-out features.
val_preds = model_lgbm.predict(valid_df.drop(columns="label"))

In [None]:
# Hold-out accuracy of the LightGBM predictions (shown as the cell output).
accuracy_score(
    y_true=valid_df["label"].astype("int"),
    y_pred=val_preds,
)

In [None]:
# Wrap each split in an XGBoost DMatrix, passing features and labels separately.
dtrain = xgb.DMatrix(
    train_df.drop(columns="label"),
    label=train_df["label"],
)
dvalid = xgb.DMatrix(
    valid_df.drop(columns="label"),
    label=valid_df["label"],
)

In [None]:
# Booster settings: tree booster, depth-1 trees, step size (eta) of 1, and a
# binary logistic objective.
param = dict(
    booster="gbtree",
    max_depth=1,
    eta=1,
    objective="binary:logistic",
)

In [None]:
# Datasets handed to the trainer for evaluation, each paired with a display name.
evallist = [
    (dvalid, "eval"),
    (dtrain, "train"),
]

In [None]:
# Number of boosting rounds to run.
num_round = 10

# Fit the booster on the training matrix, evaluating on the sets in `evallist`.
bst = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=evallist,
)

In [None]:
# Round the booster's outputs on the hold-out matrix to whole numbers so they
# can be compared with the integer labels.
pred = bst.predict(dvalid).round(decimals=0)

In [None]:
# Hold-out accuracy of the rounded XGBoost predictions (shown as the cell output).
accuracy_score(
    y_true=valid_df["label"].astype("int"),
    y_pred=pred,
)

In [None]:
# Load the evaluation table and index it by the `tconst` column.
df_validation = pd.read_csv("completed_eval_df1.csv").set_index("tconst")

# Regional gross gaps are filled with this table's own 25th percentile.
df_validation = df_validation.assign(
    Domestic=lambda frame: frame["Domestic"].fillna(frame["Domestic"].quantile(0.25)),
    Foreign=lambda frame: frame["Foreign"].fillna(frame["Foreign"].quantile(0.25)),
)

# A missing worldwide gross becomes the sum of the two regional figures.
df_validation = df_validation.assign(
    Worldwide=lambda frame: frame["Worldwide"].fillna(frame["Domestic"] + frame["Foreign"])
)

# Remaining gaps use the column medians of the training frame `df` (not of this
# table); the `label` column is dropped afterwards.
train_medians_for_validation = df.fillna(0).median()
df_validation = df_validation.fillna(train_medians_for_validation).drop(columns=["label"])

In [None]:
# Set these columns to 0 for every evaluation row.
# NOTE(review): presumably these are columns the training features have but
# the evaluation file lacks; confirm, and check the resulting column order.
df_validation[
    [
        'Adult',
        'Biography',
        'Film-Noir',
        'Foreign_genre',
        'Musical',
        'News',
        'Sci-Fi',
        'Sport',
    ]
] = 0

In [None]:
# Load the test table and index it by the `tconst` column.
df_test = pd.read_csv("completed_test_df1.csv").set_index("tconst")

# Regional gross gaps are filled with this table's own 25th percentile.
df_test = df_test.assign(
    Domestic=lambda frame: frame["Domestic"].fillna(frame["Domestic"].quantile(0.25)),
    Foreign=lambda frame: frame["Foreign"].fillna(frame["Foreign"].quantile(0.25)),
)

# A missing worldwide gross becomes the sum of the two regional figures.
df_test = df_test.assign(
    Worldwide=lambda frame: frame["Worldwide"].fillna(frame["Domestic"] + frame["Foreign"])
)

# Remaining gaps use the column medians of the training frame `df` (not of this
# table); the `label` column is dropped afterwards.
train_medians_for_test = df.fillna(0).median()
df_test = df_test.fillna(train_medians_for_test).drop(columns=["label"])

In [None]:
# Set these columns to 0 for every test row.
# NOTE(review): presumably these are columns the training features have but
# the test file lacks; confirm, and check the resulting column order.
df_test[
    [
        'Adult',
        'Biography',
        'Film-Noir',
        'Musical',
        'News',
        'Sci-Fi',
        'Sport',
    ]
] = 0

In [None]:
# Give the model exactly the columns it was fitted on, in the same order.
# Columns assigned to this frame after loading may sit in different positions
# than in the training frame, and the model is handed the frame as-is.
val_feature_columns = train_df.drop(columns="label").columns
val_preds_lgbm = model_lgbm.predict(df_validation[val_feature_columns])

# One prediction per line.
with open('../val_preds_lgbm.txt', 'w+') as f:
    f.writelines(f"{val}\n" for val in val_preds_lgbm)

In [None]:
# Give the model exactly the columns it was fitted on, in the same order.
# Columns assigned to this frame after loading may sit in different positions
# than in the training frame, and the model is handed the frame as-is.
test_feature_columns = train_df.drop(columns="label").columns
test_preds_lgbm = model_lgbm.predict(df_test[test_feature_columns])

# One prediction per line. This writes the *test* predictions; the loop used to
# iterate over `val_preds_lgbm`, putting validation output into the test file.
with open('../test_preds_lgbm.txt', 'w+') as f:
    f.writelines(f"{val}\n" for val in test_preds_lgbm)

In [None]:
# Mean of the predictions on the evaluation and test sets, shown as a pair.
(
    val_preds_lgbm.mean(),
    test_preds_lgbm.mean(),
)