# Latent Factor Collaborative Filtering using Deep Learning

In [166]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
#Load the packages 
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from keras.layers import Embedding, Reshape,dot,Input,Dense
from keras.models import Sequential,Model
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.utils.vis_utils import plot_model
import matplotlib.pyplot as plt

**Define the Required Functions**

In [172]:
data_dir = Path('/home/santanu/ML_DS_Catalog-/Collaborating Filtering/ml-100k/')
outdir = Path('/home/santanu/ML_DS_Catalog-/Collaborating Filtering/ml-100k/')

#Function to read data 
def create_data(rating,header_cols):
    data = pd.read_csv(rating,header=None,sep='\t')
    #print(data)
    data.columns = header_cols
    return data
 
#Movie id to movie name dict 
def create_movie_dict(movie_file):
    print(movie_file)
    df = pd.read_csv(movie_file,sep='|', encoding='latin-1',header=None)
    movie_dict = {}
    movie_ids = list(df[0].values)
    movie_name = list(df[1].values)
    for k,v in zip(movie_ids,movie_name):
        movie_dict[k] = v 
    return movie_dict
# Function to create training validation and test data
def train_val(df,val_frac=None):
    X,y = df[['userID','movieID']].values,df['rating'].values
    #Offset the ids by 1 for the ids to start from zero
    X = X - 1 
    if val_frac != None:
        X_train, X_test, y_train, y_val = train_test_split(X, y, test_size=val_frac,random_state=0)
        return X_train, X_val, y_train, y_val
    else:
        return X,y

#  Define Model
# Converts the userId and the MovieId to embedding sizes of a given latent dimension and then takes the dot product 
# of those embeddings to get the rating score. 
def model(max_users,max_movies,latent_factors):
    user_ID = Input(shape=(1,))
    movie_ID = Input(shape=(1,))
    x = Embedding(max_users,latent_factors, input_length=1)(user_ID)
    print(x.shape)
    y = Embedding(max_movies,latent_factors, input_length=1)(movie_ID)
    print(y.shape)
    out = dot([x,y],axes=2, normalize=False)
    out= Reshape((1,))(out)
    model = Model(inputs=[user_ID,movie_ID],outputs=out)
    print(model.summary())
    return model
    
    

**Data Processing and Model Training**

In [173]:
#Data Processing and Model Training 

train_ratings_df = create_data(f'{data_dir}/u1.base',['userID','movieID','rating','timestamp']) 
test_ratings_df = create_data(f'{data_dir}/u1.test',['userID','movieID','rating','timestamp']) 
X_train, X_val,y_train, y_val = train_val(train_ratings_df,val_frac=0.2)
movie_dict = create_movie_dict(f'{data_dir}/u.item')
num_users = len(train_ratings_df['userID'].unique())
num_movies = len(train_ratings_df['movieID'].unique())

print(f'Number of users {num_users}')
print(f'Number of movies {num_movies}')
model = model(num_users,num_movies,40)
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
model.compile(loss='mse',optimizer='adam')
callbacks = [EarlyStopping('val_loss', patience=2), 
             ModelCheckpoint(f'{outdir}/nn_factor_model.h5', save_best_only=True)]
model.fit([X_train[:,0],X_train[:,1]], y_train, nb_epoch=30, validation_data=([X_val[:,0],X_val[:,1]], y_val), verbose=2, callbacks=callbacks)








/home/santanu/ML_DS_Catalog-/Collaborating Filtering/ml-100k/u.item
Number of users 943
Number of movies 1650
(?, 1, 40)
(?, 1, 40)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_61 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_62 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_61 (Embedding)        (None, 1, 40)        37720       input_61[0][0]                   
__________________________________________________________________________________________________
embedding_62 (Embedding)        (None, 1, 40)        66000       input_62[0]



Train on 64000 samples, validate on 16000 samples
Epoch 1/30
 - 4s - loss: 8.8970 - val_loss: 2.0422
Epoch 2/30
 - 3s - loss: 1.3345 - val_loss: 1.0734
Epoch 3/30
 - 3s - loss: 0.9656 - val_loss: 0.9704
Epoch 4/30
 - 3s - loss: 0.8921 - val_loss: 0.9317
Epoch 5/30
 - 3s - loss: 0.8452 - val_loss: 0.9097
Epoch 6/30
 - 3s - loss: 0.8076 - val_loss: 0.8987
Epoch 7/30
 - 3s - loss: 0.7686 - val_loss: 0.8872
Epoch 8/30
 - 3s - loss: 0.7260 - val_loss: 0.8920
Epoch 9/30
 - 3s - loss: 0.6842 - val_loss: 0.8959


<keras.callbacks.History at 0x7f8b3fbe06a0>

**Evaluate on the test dataset**

In [176]:
#Evaluate on the test dataset 
from keras.models import load_model
model = load_model(f'{outdir}/nn_factor_model.h5')
X_test,y_test = train_val(test_ratings_df,val_frac=None)
pred = model.predict([X_test[:,0],X_test[:,1]])[:,0]
print('Hold out test set RMSE:',(np.mean((pred - y_test)**2)**0.5))
pred = np.round(pred)
test_ratings_df['predictions'] = pred
test_ratings_df['movie_name'] = test_ratings_df['movieID'].apply(lambda x:movie_dict[x])


Hold out test set RMSE: 0.9545832684051578


**Check Evaluation results for the UserID = 1  **

In [177]:
#Check evaluation results for the UserID = 1 
test_ratings_df[test_ratings_df['userID'] == 1].sort_values(['rating','predictions'],ascending=False)

Unnamed: 0,userID,movieID,rating,timestamp,predictions,movie_name
2,1,12,5,878542960,5.0,"Usual Suspects, The (1995)"
20,1,60,5,875072370,5.0,Three Colors: Blue (1993)
23,1,64,5,875072404,5.0,"Shawshank Redemption, The (1994)"
45,1,100,5,878543541,5.0,Fargo (1996)
53,1,114,5,875072173,5.0,Wallace & Gromit: The Best of Aardman Animatio...
78,1,170,5,876892856,5.0,Cinema Paradiso (1988)
79,1,171,5,889751711,5.0,Delicatessen (1991)
80,1,174,5,875073198,5.0,Raiders of the Lost Ark (1981)
84,1,183,5,875072262,5.0,Alien (1979)
90,1,190,5,875072125,5.0,Henry V (1989)


**Check UserID = 1 movie ratings in train dataset**

In [144]:
#Check UserID = 1 movie ratings in train dataset
train_ratings_df['movie_name'] = train_ratings_df['movieID'].apply(lambda x:movie_dict[x])
train_ratings_df[train_ratings_df['userID'] == 1].sort_values(['rating'],ascending=False)

Unnamed: 0,userID,movieID,rating,timestamp,movie_name
0,1,1,5,874965758,Toy Story (1995)
62,1,119,5,876893098,Maya Lin: A Strong Clear Vision (1994)
89,1,168,5,874965478,Monty Python and the Holy Grail (1974)
87,1,166,5,874965677,Manon of the Spring (Manon des sources) (1986)
86,1,165,5,874965518,Jean de Florette (1986)
81,1,152,5,878542589,Sleeper (1973)
72,1,137,5,875071541,Big Night (1996)
65,1,124,5,875071484,Lone Star (1996)
60,1,115,5,878541637,"Haunted World of Edward D. Wood Jr., The (1995)"
91,1,172,5,874965478,"Empire Strikes Back, The (1980)"
