In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from tensorflow.keras import layers
from tensorflow import keras
from keras.constraints import maxnorm
from xgboost.sklearn import XGBClassifier
import xgboost as xgb

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation,  Flatten, Input
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras import metrics
from keras.metrics import AUC
from keras.metrics import Precision
from keras.metrics import Recall

from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import sklearn
from sklearn.ensemble import StackingClassifier
import pickle

# Data Preprocessing

In [None]:
# Read the train / validation / test feature tables, in that order.
X_train_trf1, X_val_trf1, X_test_trf1 = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/Colab Notebooks/X_train_trf1_v3.csv',
        '/content/drive/MyDrive/Colab Notebooks/X_val_trf1_v3.csv',
        '/content/drive/MyDrive/Colab Notebooks/X_test_trf1_v3.csv',
    ),
)

# Columns slated for removal from the feature tables (per the notebook notes):
# rating_month, rating_year, primaryTitle, userId_ori, movieId_ori.
# Keep the title and original user id of every test row first, because they
# are needed later to report predictions per user.
X_test_complete = X_test_trf1.loc[:, ['primaryTitle', 'userId_ori']]

def dropRatingDate(df):
  """Return a new frame without the columns at a fixed set of positions.

  The positions removed are 0-2, 25-36 and 42 (zero-based). The columns are
  looked up by position and then dropped by label; ``df`` itself is not
  modified.
  """
  positions = [0, 1, 2, *range(25, 37), 42]
  unwanted = df.columns[positions]
  return df.drop(columns=unwanted)

# Remove the non-feature columns from every split.
X_train_trf1 = dropRatingDate(X_train_trf1)
X_test_trf1 = dropRatingDate(X_test_trf1)
X_val_trf1 = dropRatingDate(X_val_trf1)

# Remember the surviving feature names before the frames become bare arrays.
columns_list = X_train_trf1.columns.tolist()

X_train_trf1 = X_train_trf1.to_numpy()
X_val_trf1 = X_val_trf1.to_numpy()
X_test_trf1 = X_test_trf1.to_numpy()

# Load the labels as flat 1-D integer vectors. Without .ravel() they are
# (n, 1) column vectors, which makes scikit-learn emit a column_or_1d
# conversion warning each time they are passed as y.
# NOTE(review): assumes each label CSV holds exactly one column - confirm.
y_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/y_train_xg_v3.csv').to_numpy(dtype = 'int').ravel()
y_val= pd.read_csv('/content/drive/MyDrive/Colab Notebooks/y_val_xg_v3.csv').to_numpy(dtype = 'int').ravel()
y_test= pd.read_csv('/content/drive/MyDrive/Colab Notebooks/y_test_xg_v3.csv').to_numpy(dtype = 'int').ravel()

In [None]:
# Convert the 9-bin labels to 2 bins: values below 7 become 0, all others become 1
def nineToTwoBins(y_dataset):
    """Collapse the labels into two classes, in place.

    Every entry strictly below 7 becomes 0; every other entry becomes 1.

    Parameters
    ----------
    y_dataset : numpy.ndarray
        Label array of any shape. It is overwritten in place.

    Returns
    -------
    The same ``y_dataset`` object, now holding only 0s and 1s.

    Notes
    -----
    Calling this a second time on the same array sets every entry to 0,
    because both 0 and 1 are below the threshold.
    """
    # Compute the whole result first, then write it back through a slice so
    # the caller's array object is updated rather than replaced.
    binary = np.where(np.asarray(y_dataset) < 7, 0, 1)
    y_dataset[:] = binary
    return y_dataset

In [None]:
# nineToTwoBins overwrites each array in place, so y_train / y_val / y_test
# hold 0/1 labels after this cell. Run it once per kernel session: a second
# run would see only values below 7 and set every label to 0.
nineToTwoBins(y_train)
nineToTwoBins(y_val)
nineToTwoBins(y_test)

array([[0],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [None]:
# Bundle the features and labels of every split in a single dictionary.
training_data = dict(
    X_train=X_train_trf1, Y_train=y_train,
    X_val=X_val_trf1, Y_val=y_val,
    X_test=X_test_trf1, Y_test=y_test,
)

# Stacking and Loading Optimized Models

**Build Model Function based on Bayesian Optimized Model**

In [None]:
def build_model(input_dim=27):
  """Build and compile the fully connected binary classifier.

  The layer sizes are fixed here; per the notebook heading they come from a
  Bayesian-optimised search.

  Parameters
  ----------
  input_dim : int, default 27
      Number of input features. It must equal the number of columns in the
      feature matrix passed to ``fit``.

  Returns
  -------
  A compiled ``Sequential`` model: 13 ReLU hidden layers, one dropout layer
  and a single sigmoid output unit, trained with binary cross-entropy and
  the Adam optimiser, reporting AUC, precision and recall.
  """
  model = Sequential()

  # Input layer plus first hidden layer.
  model.add(Dense(970, input_dim=input_dim, activation='relu'))

  # Hidden layers 2 to 13, named Hidden2 ... Hidden13.
  hidden_units = [970, 970, 20, 970, 970,
                  970, 970, 970, 970, 970,
                  970, 970]
  for i, neuron in enumerate(hidden_units, start=2):
    model.add(Dense(units=neuron,
                    activation='relu',
                    name=f'Hidden{i}'))
    # Dropout goes directly after Hidden4 (the 20-unit layer), i.e. between
    # the fourth and fifth hidden layers.
    if i == 4:
      model.add(Dropout(rate=0.25))

  # Single sigmoid unit: output is the probability of class 1.
  model.add(Dense(1, activation='sigmoid'))

  model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=[AUC(), Precision(), Recall()])
  return model

**XGBClassifier Based on GridSearch Tuned Model**

In [None]:
# Gradient-boosted trees with the hyper-parameters fixed in this notebook.
# The labels are binary, and the stack below consumes predict_proba output, so
# the objective must yield class probabilities: 'binary:logistic' does, while
# 'multi:softmax' returns hard class labels (and newer XGBoost releases refuse
# predict_proba for it).
XG_clf = XGBClassifier(learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=9,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.9,
                    colsample_bytree=0.6,
                    reg_alpha = 0.01,
                    tree_method = "gpu_hist",  # needs a GPU runtime
                    objective='binary:logistic',
                    seed=27)

XG_clf._estimator_type = "classifier"

# Neural network wrapped for scikit-learn; build_model is called with its
# defaults each time the wrapper is fitted.
NN_clf = KerasClassifier(build_fn=build_model,
                          batch_size = 500,
                          epochs = 80)
# Presumably set so scikit-learn's stacking treats the Keras wrapper as a
# classifier - verify against the installed scikit-learn version.
NN_clf._estimator_type = "classifier"

# Level-0 models feed their predict_proba output to a logistic-regression
# meta-learner.
estimators= [('XG', XG_clf), ('NN', NN_clf) ]
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), stack_method='predict_proba')

clf.fit(X_train_trf1, y_train)
print("Stacking model score: %.3f" % clf.score(X_test_trf1, y_test))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch



Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch



Stacking model score: 0.799


In [None]:
# Class predictions from the fitted stacking model for every test row;
# the bare name on the last line displays the array.
predictions = clf.predict(X_test_trf1)
predictions



array([0, 1, 1, ..., 1, 1, 1])

In [None]:
predictions = clf.predict(X_test_trf1)

# Attach the predictions by row label instead of by position. X_test_complete
# keeps the default 0..n-1 index from read_csv, so a Series with the same
# default index lines up row for row. Because this cell then overwrites
# X_test_complete with a filtered subset, a positional assignment would fail
# with a length mismatch if the cell were run again; label alignment keeps the
# cell re-runnable.
X_test_complete = X_test_complete.assign(prediction=pd.Series(predictions))

# Keep only the rows whose predicted class is not 0.
X_test_complete = X_test_complete[X_test_complete.prediction != 0]



In [None]:
# Display the test rows kept after filtering out predictions equal to 0
# (columns: primaryTitle, userId_ori, prediction).
X_test_complete

Unnamed: 0,primaryTitle,userId_ori,prediction
1,True Grit,1920,1
2,Secretary,159816,1
3,Gravity,30643,1
6,Toy Story,74429,1
13,Out of Africa,2177,1
...,...,...,...
68426,Superman,2177,1
68427,500 Days of Summer,39905,1
68430,Cape Fear,103611,1
68431,Whiplash,94154,1


In [None]:
def recommend(userid, ratings=None):
  """List the titles recommended for one user.

  Parameters
  ----------
  userid
      Value to match against the ``userId_ori`` column.
  ratings : pandas.DataFrame, optional
      Frame with ``userId_ori`` and ``primaryTitle`` columns. Defaults to the
      notebook-level ``X_test_complete``.

  Returns
  -------
  pandas.DataFrame
      One column, ``Recommended_Movies``, holding the matching titles in
      their original row order with a fresh 0..n-1 index. Empty when the
      user has no matching rows.
  """
  if ratings is None:
    ratings = X_test_complete
  # A boolean mask checks every row, including the last one, which the former
  # range(0, n - 1) loop skipped.
  titles = ratings.loc[ratings['userId_ori'] == userid, 'primaryTitle']
  return pd.DataFrame(data=titles.tolist(), columns=['Recommended_Movies'])

In [None]:
# Example: list the recommended movies for the user whose userId_ori is 1920
recommend(1920)

Unnamed: 0,Recommended_Movies
0,True Grit
1,Contact
2,Roman Holiday
3,Mary Shelley's Frankenstein
4,Pinocchio
5,Blade Runner 2049
6,Peter Pan
7,Mary Poppins
8,Black Panther
9,Vertigo
