<a href="https://colab.research.google.com/github/Pari-singh/AmazonBeautyProduct_RecommenderSystem/blob/master/Recommender_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Collaborative Filtering based recommendation

Estimated time taken : 3 hrs approx (includes model running and rerunning time on GPU K80, personal study about the topic)

In [0]:
from google.colab import drive
drive.mount('/content/drive')
cd drive/My\ Drive/Colab\ Notebooks

### Importing required files and Libraries 

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Input, Dense, Flatten, dot, concatenate, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.callbacks import ModelCheckpoint

import pandas as pd
import gzip
import nltk, re
import string
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

### Accessing the data

In [0]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

ratings = getDF('reviews_Beauty_5.json.gz')
ratings.rename(columns={'reviewerID': 'user_id', 
                        'asin': 'item_id', 
                        'reviewerName': 'user_name', 
                        'reviewText': 'review_text',
                        'summary': 'review_summary',
                        'overall': 'score'},
               inplace=True)

ratings['user_emb_id'] = ratings.user_id.astype('category').cat.codes.values
ratings['item_emb_id'] = ratings.item_id.astype('category').cat.codes.values

In [58]:
ratings.sample(5)

Unnamed: 0,user_id,item_id,user_name,helpful,review_text,score,review_summary,unixReviewTime,reviewTime,user_emb_id,item_emb_id
19669,A12FDPTQO9FVOZ,B000FH4T3M,Leelee Sheed,"[0, 1]",The Neutrogena cleared up my acne tremendously...,5.0,Neutrogena treatment pads cleared up my acne,1380499200,"09 30, 2013",403,1281
62994,A321S4J213I7E4,B001OVQSJG,Shanena,"[0, 2]","I had issues with my scalp, it was actually &#...",4.0,summary,1353888000,"11 26, 2012",12185,4276
79124,A2QAO2M9U9OH9,B002UEF3OM,awilda sotomayor,"[0, 0]",Lite Minty Green pastel color. First coat (str...,4.0,Essie Mint Candy Apple,1391385600,"02 3, 2014",10268,5314
73382,AX51LQ35FS86N,B002L6M068,Priscilla Mendez,"[0, 0]",Love it! Maybelline is amazing and all its pro...,5.0,Show me the Bronzers!!,1388620800,"01 2, 2014",21908,5028
185327,A2S4TW2PR2FTWO,B00FEMQUKS,CCReviews,"[0, 0]","I can't say that I &#34;love it&#34; yet, beca...",4.0,"No tingling, nice minty smell.",1404777600,"07 8, 2014",10563,11601


In [12]:
len(ratings['user_emb_id']), len(ratings['user_emb_id'].unique()), len(ratings['item_emb_id']), len(ratings['item_emb_id'].unique())

(198502, 22363, 198502, 12101)

### Splitting and sorting the data

In [0]:
ratings_train, ratings_test = train_test_split(ratings, test_size=0.2, random_state=0)

user_id_train = ratings_train['user_emb_id']
item_id_train = ratings_train['item_emb_id']
score_train = ratings_train['score']

user_id_test = ratings_test['user_emb_id']
item_id_test = ratings_test['item_emb_id']
score_test = ratings_test['score']


max_user_id = ratings['user_emb_id'].max()
max_item_id = ratings['item_emb_id'].max()

user_id_input = Input(shape=[1], name='user')
item_id_input = Input(shape=[1], name='item')

### The first basic model for Collaborative Filtering

In [0]:
user_embedding = Embedding(output_dim=100, input_dim= max_user_id + 1,
                           input_length=1, name='user_embedding')(user_id_input)
item_embedding = Embedding(output_dim=100, input_dim=max_item_id + 1,
                           input_length=1, name='item_embedding')(item_id_input)


user_vecs = Flatten()(user_embedding)
item_vecs = Flatten()(item_embedding)

y = dot([user_vecs, item_vecs], axes=1)

model = Model(inputs=[user_id_input, item_id_input], outputs=[y])
model.compile(optimizer='adam', metrics=['accuracy'], loss = 'mae')

In [18]:
model.fit([user_id_train, item_id_train], score_train, batch_size=64, epochs=10, validation_split=0.2)

Instructions for updating:
Use tf.cast instead.
Train on 127040 samples, validate on 31761 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc794320ac8>

### Testing the error on test data

In [20]:
from sklearn.metrics import mean_absolute_error
predicted = model.predict([user_id_test, item_id_test]).squeeze() 
mae = mean_absolute_error(predicted, score_test)
print("Final test MAE: %0.3f" % mae)

Final test MAE: 2.185


### Modified Model : Adding Dense Layer with dropout

In [0]:
def CF(Emb_size):
  user_id_input = Input(shape=[1], name='user')
  item_id_input = Input(shape=[1], name='item')
  
  user_embedding = Embedding(output_dim=10, input_dim= max_user_id + 1,
                             input_length=1, name='user_embedding')(user_id_input)
  item_embedding = Embedding(output_dim=10, input_dim=max_item_id + 1,
                             input_length=1, name='item_embedding')(item_id_input)
  
  user_vecs = Flatten()(user_embedding)
  item_vecs = Flatten()(item_embedding)
  
  
  input_vecs = concatenate([user_vecs, item_vecs])
  input_vecs = Dropout(0.5)(input_vecs)
  
  x = Dense(64, activation='relu')(input_vecs)
  
  y = Dense(1)(x)
  
  return Model(inputs=[user_id_input, item_id_input], outputs=[y])



## For Embedding size = 10
model2 = CF(10)

## For Embedding size = 50
model3 = CF(50)

## For Embedding size = 100
model4 = CF(100)


### For Embedding size = 10, MAE = 0.761

In [53]:
model2.fit([user_id_train, item_id_train], score_train, batch_size=64, epochs=5, validation_split=0.2)
from sklearn.metrics import mean_absolute_error
predicted = model2.predict([user_id_test, item_id_test]).squeeze() 
mae = mean_absolute_error(predicted, score_test)
print("Final test MAE: %0.3f" % mae)

## Validation accuracy was decreasing with higher epochs, seeing increase in training accuracy. This clearly implies overfitting due to retrainingon same sample data

Train on 127040 samples, validate on 31761 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Final test MAE: 0.761


### For Embedding size = 50, MAE = 0.789

In [55]:
model3.fit([user_id_train, item_id_train], score_train, batch_size=64, epochs=5, validation_split=0.2)
predicted = model3.predict([user_id_test, item_id_test]).squeeze() 
mae = mean_absolute_error(predicted, score_test)
print("Final test MAE: %0.3f" % mae)

Train on 127040 samples, validate on 31761 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Final test MAE: 0.789


### For Embedding size = 100, MAE= 0.795

In [57]:
model4.fit([user_id_train, item_id_train], score_train, batch_size=64, epochs=5, validation_split=0.2)
predicted = model4.predict([user_id_test, item_id_test]).squeeze() 
mae = mean_absolute_error(predicted, score_test)
print("Final test MAE: %0.3f" % mae)

Train on 127040 samples, validate on 31761 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Final test MAE: 0.795


 For our dataset and the above created model, the best Embedding Layer size seem to be 10, having least MAE (Mean Absolute Score) and highest validation accuracy. Thought the perfect way of measuring any model would be calculating its F1 score, which gives the complete background of what is happening, but for comparing the hyperparameter with such minute difference, accuracy could also be used

## Neural Based Collaborative Filtering

Reference   WWW 2017 paper entitled: Neural Collaborative Filtering

In [0]:
import keras
n_latent_factors_user = 8
n_latent_factors_item = 10
n_latent_factors_mf = 3
n_users, n_items = len(ratings['user_emb_id'].unique()), len(ratings['item_emb_id'].unique())

item_input = keras.layers.Input(shape=[1],name='Item')
item_embedding_mlp = keras.layers.Embedding(n_items + 1, n_latent_factors_item, name='Item-Embedding-MLP')(item_input)
item_vec_mlp = keras.layers.Flatten(name='FlattenItem-MLP')(item_embedding_mlp)
item_vec_mlp = keras.layers.Dropout(0.2)(item_vec_mlp)

item_embedding_mf = keras.layers.Embedding(n_items + 1, n_latent_factors_mf, name='Item-Embedding-MF')(item_input)
item_vec_mf = keras.layers.Flatten(name='FlattenItem-MF')(item_embedding_mf)
item_vec_mf = keras.layers.Dropout(0.2)(item_vec_mf)


user_input = keras.layers.Input(shape=[1],name='User')
user_vec_mlp = keras.layers.Flatten(name='FlattenUsers-MLP')(keras.layers.Embedding(n_users + 1, n_latent_factors_user,name='User-Embedding-MLP')(user_input))
user_vec_mlp = keras.layers.Dropout(0.2)(user_vec_mlp)

user_vec_mf = keras.layers.Flatten(name='FlattenUsers-MF')(keras.layers.Embedding(n_users + 1, n_latent_factors_mf,name='User-Embedding-MF')(user_input))
user_vec_mf = keras.layers.Dropout(0.2)(user_vec_mf)


In [0]:
concat = keras.layers.merge.concatenate([item_vec_mlp, user_vec_mlp])
concat_dropout = keras.layers.Dropout(0.2)(concat)
dense = keras.layers.Dense(200,name='FullyConnected')(concat_dropout)
dense_batch = keras.layers.BatchNormalization(name='Batch')(dense)
dropout_1 = keras.layers.Dropout(0.2,name='Dropout-1')(dense_batch)
dense_2 = keras.layers.Dense(100,name='FullyConnected-1')(dropout_1)
dense_batch_2 = keras.layers.BatchNormalization(name='Batch-2')(dense_2)


dropout_2 = keras.layers.Dropout(0.2,name='Dropout-2')(dense_batch_2)
dense_3 = keras.layers.Dense(50,name='FullyConnected-2')(dropout_2)
dense_4 = keras.layers.Dense(20,name='FullyConnected-3', activation='relu')(dense_3)

pred_mf = keras.layers.dot([item_vec_mf, user_vec_mf],axes=1, normalize=False)


pred_mlp = keras.layers.Dense(1, activation='relu',name='Activation')(dense_4)

combine_mlp_mf = keras.layers.merge.concatenate([pred_mf, pred_mlp])
result_combine = keras.layers.Dense(100,name='Combine-MF-MLP')(combine_mlp_mf)
deep_combine = keras.layers.Dense(100,name='FullyConnected-4')(result_combine)


result = keras.layers.Dense(1,name='Prediction')(deep_combine)

In [38]:
model = keras.Model([user_input, item_input], result)
opt = keras.optimizers.Adam(lr =0.01)
model.compile(optimizer='adam',loss= 'mean_absolute_error', metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Item (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
User (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
Item-Embedding-MLP (Embedding)  (None, 1, 10)        121020      Item[0][0]                       
__________________________________________________________________________________________________
User-Embedding-MLP (Embedding)  (None, 1, 8)         178912      User[0][0]                       
__________________________________________________________________________________________________
FlattenIte

In [39]:
model.fit([user_id_train, item_id_train], score_train, epochs=5, verbose=0, validation_split=0.2)

<keras.callbacks.History at 0x7fc79bcb85c0>

In [47]:
from sklearn.metrics import mean_absolute_error
y_hat_2 = np.round(model.predict([user_id_train, item_id_train]),0)
print(mean_absolute_error(score_train, y_hat_2))

print(mean_absolute_error(score_test, model.predict([user_id_test, item_id_test])))

0.5342472654454317
0.9073115254027403


The MAE score has infact increased in this case. Little tuning might be required to generate better result for Neural Network. Nonetheless, this gives us the baseline and shows that our simple model does not perform too bad. 