In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from tensorflow.keras import layers
from tensorflow import keras
from keras.constraints import maxnorm
from xgboost.sklearn import XGBClassifier
import xgboost as xgb

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation,  Flatten, Input
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras import metrics
from keras.metrics import AUC
from keras.metrics import Precision
from keras.metrics import Recall

from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import sklearn
from sklearn.ensemble import StackingClassifier
import pickle

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Preprocessing

In [4]:
X_train_trf1 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/X_train_trf1_v3.csv')
X_val_trf1 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/X_val_trf1_v3.csv')
X_test_trf1 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/X_test_trf1_v3.csv')

#Drop rating_month and rating_year columns
#Drop primaryTitle, userId_ori, movieId_ori

extracted_cols = X_test_trf1.loc[:,['primaryTitle', 'movieId_ori','userId_ori']]

def dropRatingDate(df):
  cols = [0, 1, 2, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 42]
  return df.drop(df.columns[cols], axis=1)

X_train_trf1 = dropRatingDate(X_train_trf1)
X_test_trf1 = dropRatingDate(X_test_trf1)
X_val_trf1 = dropRatingDate(X_val_trf1)

columns_list = X_train_trf1.columns.tolist()

X_train_trf1 = X_train_trf1.to_numpy()
X_val_trf1 = X_val_trf1.to_numpy()
X_test_trf1 = X_test_trf1.to_numpy() 

y_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/y_train_xg_v3.csv').to_numpy(dtype = 'int')
y_val= pd.read_csv('/content/drive/MyDrive/Colab Notebooks/y_val_xg_v3.csv').to_numpy(dtype = 'int')
y_test= pd.read_csv('/content/drive/MyDrive/Colab Notebooks/y_test_xg_v3.csv').to_numpy(dtype = 'int')
y_test_ori = y_test

In [5]:
#Convert 9bins to 2bins
def nineToTwoBins(y_dataset):
    for i in range(0, len(y_dataset)): 
        if y_dataset[i] < 6:
            y_dataset[i] = 0
        else:
            y_dataset[i] = 1
    return y_dataset

In [6]:
nineToTwoBins(y_train)
nineToTwoBins(y_val)
nineToTwoBins(y_test)

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [7]:
training_data = {'X_train':X_train_trf1,'Y_train':y_train,
                'X_val': X_val_trf1,'Y_val':y_val,
                'X_test': X_test_trf1 ,'Y_test':y_test}

# Stacking and Loading Optimized Models

**Build Model Function based on Bayesian Optimized Model**

In [8]:
def build_model():
  # create model
  model = Sequential()

  # Define input layer, and first hidden layer
  # neurons for first hidden layer
  model.add(Dense(970, input_dim=27, activation='relu')
  )

  # Define 2nd hidden (Dense) layers onward
  # Tune layers, neurons
  for i, neuron in enumerate([970, 970, 20, 970, 970,
                              970, 970, 970, 970, 970,
                              970, 970], start = 2):
    model.add(Dense(units=neuron,
                    activation='relu', 
                    name=f'Hidden{i}'))
    # Add drop out layers to 5th hidden layer
    if i == 4:
      model.add(Dropout(rate=0.25)
      )

  # Define output layer
  model.add(Dense(1, activation='sigmoid')
  )

  # Compile model
  # Tune learning rate at optimizer
  model.compile(loss='binary_crossentropy', 
                  optimizer='adam', 
                  metrics=[AUC(),Precision(), Recall()]
                  )
  return model

**XGBClassifier Based on GridSearch Tuned Model**

In [9]:
XG_clf = XGBClassifier(learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=9,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.9,
                    colsample_bytree=0.6,
                    reg_alpha = 0.01,
                    tree_method = "gpu_hist",
                    objective='multi:softprob',
                    num_class=2,
                    seed=27)

XG_clf._estimator_type = "classifier"

NN_clf = KerasClassifier(build_fn=build_model, 
                          batch_size = 500,
                          epochs = 80)
NN_clf._estimator_type = "classifier"

estimators= [('XG', XG_clf), ('NN', NN_clf) ]
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), stack_method='predict_proba')

clf.fit(X_train_trf1, y_train)
print("Stacking model score: %.3f" % clf.score(X_test_trf1, y_test))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch



Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch



Stacking model score: 0.768


In [10]:
predictions = clf.predict_proba(X_test_trf1)
predictions



array([[0.50729922, 0.49270078],
       [0.1136179 , 0.8863821 ],
       [0.10245796, 0.89754204],
       ...,
       [0.1093007 , 0.8906993 ],
       [0.15384866, 0.84615134],
       [0.29662749, 0.70337251]])

# Metric

**Hit Ratio**

In [11]:
def hit_ratio(k, extracted_cols, y_test_ori, predictions):

  X_test_complete = extracted_cols
  X_test_complete['prediction'] = predictions
  d = dict(tuple(X_test_complete.groupby(['userId_ori'])))

  Y_test_complete = extracted_cols
  Y_test_complete['actual_rating'] = y_test_ori
  d_true = dict(tuple(Y_test_complete.groupby(['userId_ori'])))

  ratio = []
  for userId in d:
    topk_True = d_true[userId].sort_values(['actual_rating'], ascending = False)[:k]['movieId_ori'].values.tolist()
    topk_pred = d[userId].sort_values(['prediction'], ascending = False)[:k]['movieId_ori'].values.tolist()
    ratio.append(len([x for x in topk_pred if x in topk_True])/k)
  
  #return mean hit ratio
  return pd.Series(ratio).mean()

In [12]:
#input values of k
k = [1, 5, 10, 20, 50, 100]

for i in k:
  print('Hit Ratio @'+ str(i) +' is ' + str(hit_ratio(i, extracted_cols, y_test_ori, clf.predict_proba(training_data['X_test'])[:, 1:])))



Hit Ratio @1 is 0.0




Hit Ratio @5 is 0.03723404255319151




Hit Ratio @10 is 0.07765957446808502




Hit Ratio @20 is 0.13962765957446815




Hit Ratio @50 is 0.3032978723404254




Hit Ratio @100 is 0.5021276595744679


**NDCG**

In [13]:
def discountedCumulativeGain(result):
  dcg = []
  for idx, val in enumerate(result): 
      numerator = val
      # add 2 because python 0-index
      denominator =  np.log2(idx + 2) 
      score = numerator/denominator
      dcg.append(score)
  return sum(dcg)

def normalizedDiscountedCumulativeGain(k, result): 
  sorted_result = []
  for i in range(0, k):
    sorted_result.append(1)
  dcg = discountedCumulativeGain(result)
  idcg = discountedCumulativeGain(sorted_result)
  ndcg = dcg / idcg
  return ndcg

def overallNDCG(k, extracted_cols, y_test_ori, predictions):
  X_test_complete = extracted_cols
  X_test_complete['prediction'] = predictions
  d = dict(tuple(X_test_complete.groupby(['userId_ori'])))

  Y_test_complete = extracted_cols
  Y_test_complete['actual_rating'] = y_test_ori
  d_true = dict(tuple(Y_test_complete.groupby(['userId_ori']))) 

  ndcg_lst = []
  for userId in d:
    topk_True = d_true[userId].sort_values(['actual_rating'], ascending = False)[:k]['movieId_ori'].values.tolist()
    topk_pred = d[userId].sort_values(['prediction'], ascending = False)[:k]['movieId_ori'].values.tolist()
    result = []
    for i in range(0, len(topk_pred)):
      if topk_pred[i] in topk_True:
        result.append(1)
      else:
        result.append(0)
    
    ndcg_lst.append(normalizedDiscountedCumulativeGain(k, result))

  return pd.Series(ndcg_lst).mean()

In [14]:
#input values of k
k = [1, 5, 10, 20, 50, 100]

for i in k:
  print('NDCG @'+ str(i) +' is ' + str(overallNDCG(i, extracted_cols, y_test_ori, clf.predict_proba(training_data['X_test'])[:, 1:])))



NDCG @1 is 0.0




NDCG @5 is 0.03661422627619494




NDCG @10 is 0.07803478339236052




NDCG @20 is 0.1431118432930474




NDCG @50 is 0.31651288737920746




NDCG @100 is 0.5225383477414228


# Output

In [16]:
output = extracted_cols
predictions = clf.predict_proba(training_data['X_test'])[:, 1:]
output['prediction'] = predictions
d = dict(tuple(output.groupby(['userId_ori'])))

def recommendTop10(userId):
  return d[userId].sort_values(['prediction'], ascending = False)['primaryTitle'][:10].reset_index(drop=True)



In [17]:
#Top 10 recommendations for user 1920
recommendTop10(1920)

0                         Braveheart
1                       Forrest Gump
2                   Some Like It Hot
3                          Pinocchio
4    Monty Python and the Holy Grail
5                              Split
6                           Sideways
7                            Amadeus
8                           Superman
9                          Gladiator
Name: primaryTitle, dtype: object