<a href="https://colab.research.google.com/github/VanHoann/Yelp_Dataset_Challenges/blob/main/Recommendation_Prediction/Final_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Concatenate, Dense, Dot, Dropout, Embedding, Input, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import Callback, ModelCheckpoint
from sklearn.metrics import mean_squared_error
import random

# Setting random seeds to replicate results easily.
# One constant feeds every generator so the seed is changed in a single place.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Root Mean Squared Error (RMSE)

We need a reliable way to evaluate the performance of recommendation algorithms. RMSE is one of the most popular metrics for estimating how good a recommendation algorithm is. Since RMSE measures prediction error, the smaller the error a model achieves, the better its performance, and vice versa.

$$RMSE=\sqrt{\frac{1}{N}\sum_{i=1}^{N}(\hat{y}_i - y_i)^2}$$

$\hat{y}_i$: The predicted answer of sample $i$

$y_i$: The ground truth answer of sample $i$

$N$: The number of samples

Data analysis: Plot the number of each that give review

In [None]:
def rmse(pred, actual):
    '''
    Root mean squared error over the entries whose ground truth rating is non-zero.

    params:
        pred <np.array>: an array containing all predicted ratings; any shape is
            accepted as long as it holds as many elements as `actual`
        actual <np.array>: an array containing all ground truth ratings; entries
            equal to zero are excluded from the error

    return:
        a scalar whose value is the rmse

    raises:
        ValueError: if the inputs differ in size, or no ground truth rating is non-zero
    '''
    # Flatten both inputs first so that e.g. an (N, 1) prediction column and an
    # (N,) rating vector line up element by element, whichever of the two is 2-D.
    pred = np.asarray(pred, dtype=float).ravel()
    actual = np.asarray(actual, dtype=float).ravel()
    if pred.size != actual.size:
        raise ValueError(
            "pred and actual must have the same number of elements, got "
            + str(pred.size) + " and " + str(actual.size))

    # Ignore ratings with value zero.
    rated = actual != 0
    if not rated.any():
        raise ValueError("rmse needs at least one non-zero ground truth rating")

    return np.sqrt(np.mean((pred[rated] - actual[rated]) ** 2))

# Neural Collaborative Filtering (NCF) Model Implementation

Here we implement two instantiations of the NCF model.

The first instantiation computes the recommendation score (e.g., rating) for a user-item pair as the dot product of their embeddings, which is equivalent to a matrix factorization model for recommendation.

The second instantiation concatenates the user's and item's embeddings, then feeds the concatenated vector into an MLP to calculate the recommendation score. Adopting an MLP gives the model high flexibility and non-linearity, allowing it to effectively learn the interaction between user and item latent features.

In [None]:
def build_ncf_model(n_users, n_items, embed_size, output_layer='dot', params=None):
    '''
    Build an NCF model that scores a (user, item) pair from their embeddings.

    params:
        n_users <int>: The number of user embedding vectors
        n_items <int>: The number of item embedding vectors
        embed_size <int>: The dimension of each embedding vector
        output_layer <str>: Indicates the instantiation of NCF to use:
            'dot' - dot product of the user and item embeddings
            'mlp' - fixed MLP head (128 and 64 ReLU units, dropout 0.15, 1 output unit)
            'cus' - MLP head with the same layout, sized by `params`
        params <sequence>: Only read when output_layer == 'cus'; its first five
            entries are [units_1, dropout_1, units_2, dropout_2, output_units]

    return:
        a keras Model object for the constructed ncf model

    raises:
        NotImplementedError: if output_layer is not 'dot', 'mlp' or 'cus'
        ValueError: if output_layer is 'cus' and params has fewer than five entries
    '''
    # Validate the arguments before any layer is created so that a bad call
    # fails immediately and with a message that names the problem.
    if output_layer not in ('dot', 'mlp', 'cus'):
        raise NotImplementedError(
            "output_layer must be 'dot', 'mlp' or 'cus', got " + repr(output_layer))
    if output_layer == 'cus':
        try:
            n_params = len(params)
        except TypeError:
            # params was left at its default (or is not a sequence at all)
            n_params = 0
        if n_params < 5:
            raise ValueError(
                "output_layer='cus' requires "
                "params=[units_1, dropout_1, units_2, dropout_2, output_units]")

    # Get the users and items input
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    # Get the embeddings of users and items.
    # `input_length` is not passed: the sequence length is already fixed to 1 by
    # the Input shape, and newer Keras releases flag the argument as deprecated.
    user_emb = Embedding(output_dim=embed_size, input_dim=n_users)(user_input)
    user_emb = Reshape((embed_size,))(user_emb)
    item_emb = Embedding(output_dim=embed_size, input_dim=n_items)(item_input)
    item_emb = Reshape((embed_size,))(item_emb)

    if output_layer == 'dot':
        # Compute the dot product of users' and items' embeddings as the model output
        model_output = Dot(axes=1)([user_emb, item_emb])
    else:
        # Concatenate the users' and items' embeddings as the input of MLP
        mlp_input = Concatenate()([user_emb, item_emb])
        if output_layer == 'mlp':
            model_output = _mlp_head(mlp_input, 128, 0.15, 64, 0.15, 1)
        else:
            model_output = _mlp_head(mlp_input, *params[:5])

    model = Model(inputs=[user_input, item_input], outputs=model_output)
    return model


def _mlp_head(features, units_1, dropout_1, units_2, dropout_2, output_units):
    '''Two ReLU Dense layers, each followed by Dropout, then a linear Dense output layer.'''
    # First fully-connected layer
    hidden = Dense(units_1, activation='relu')(features)
    hidden = Dropout(dropout_1)(hidden)

    # Second fully-connected layer
    hidden = Dense(units_2, activation='relu')(hidden)
    hidden = Dropout(dropout_2)(hidden)

    # Final fully-connected layer to compute model output
    return Dense(output_units)(hidden)

# Ratings Prediction

### Loading training and validation rating table

In [None]:
# Base URL of the folder that hosts the CSV splits
data = "https://raw.githubusercontent.com/VanHoann/Yelp_Dataset_Challenges/main/Recommendation_Prediction/data"

# Read the train / validation / test splits, in that order
tr_df, val_df, test_df = (
    pd.read_csv(f"{data}/{split}.csv") for split in ("train", "valid", "test")
)

### Building two dictionaries to map original user ids and item ids into corresponding indices in respective embedding matrices

In [None]:
def _build_vocab(ids):
    '''Map every id in `ids` to an index starting at 1; index 0 is kept for 'unk'.'''
    # Sorting pins the id -> index assignment. Iterating the set directly gives
    # an order that can change between interpreter runs (string hashing is
    # randomised), which would make the embedding rows differ run to run even
    # with fixed random seeds. Assumes the ids are all strings - TODO confirm.
    vocab = {key: index for index, key in enumerate(sorted(ids), start=1)}
    # Reserve the first row of the embedding matrix for ids outside this vocabulary
    vocab['unk'] = 0
    return vocab


# Get the unique set of all user ids and of all business ids in the train and test sets
user_set = set(tr_df.user_id.unique()).union(set(test_df.user_id.unique()))
business_set = set(tr_df.business_id.unique()).union(set(test_df.business_id.unique()))

# Build user vocabulary
user_vocab = _build_vocab(user_set)
n_users = len(user_vocab)

# Build business vocabulary
business_vocab = _build_vocab(business_set)
n_items = len(business_vocab)

In [None]:
# Show the first two (id, index) pairs of each vocabulary
for vocab in (user_vocab, business_vocab):
    print(list(vocab.items())[:2])

[('963c74473df680c5293cf505d2044e09', 1), ('56855fadbba2aebf832cc2dc3674c9d0', 2)]
[('6ab42de44090919368e967d5cad01e47', 1), ('5d756dbba7c6902ea27b838e087621e6', 2)]


### Replacing the original user and item ids in the train, validation and test sets with indices in the embedding matrices

In [None]:
def _encode_ids(ids, vocab):
    '''Translate a Series of raw ids into vocabulary indices; unknown ids become vocab['unk'].'''
    # Series.map with a dict is a vectorised lookup that yields NaN for ids
    # missing from the dict; those are replaced by the reserved 'unk' index.
    return ids.map(vocab).fillna(vocab['unk']).astype('int64').values


# Transforming user_id into a number by the user_vocab dictionary, and
# transforming business_id into a number by the business_vocab dictionary.
# All three splits go through the same lookup, so any id missing from a
# vocabulary is sent to the 'unk' index.
tr_users = _encode_ids(tr_df.user_id, user_vocab)
tr_items = _encode_ids(tr_df.business_id, business_vocab)
val_users = _encode_ids(val_df.user_id, user_vocab)
val_items = _encode_ids(val_df.business_id, business_vocab)
test_users = _encode_ids(test_df.user_id, user_vocab)
test_items = _encode_ids(test_df.business_id, business_vocab)

### Retrieving ratings in the training and validation set

In [None]:
# Star ratings of the train and validation splits as numpy arrays
tr_ratings, val_ratings = tr_df["stars"].to_numpy(), val_df["stars"].to_numpy()

### Building the NCF model defined above

In [None]:
# MLP instantiation of the NCF model with 50-dimensional embeddings
model = build_ncf_model(
    n_users=n_users,
    n_items=n_items,
    embed_size=50,
    output_layer='mlp',
)

### Training the model using Adam optimizer and mean squared error loss

In [None]:
import os

# exist_ok=True keeps this cell re-runnable: os.mkdir raises FileExistsError
# when the "models" directory is already present.
os.makedirs("models", exist_ok=True)

In [None]:
# Callback that writes the model to disk during training
checkpoint_cb = ModelCheckpoint('models/model.h5')

# Adam optimizer with mean squared error as the loss
model.compile(loss='mse', optimizer='adam')

# Single pass over the training pairs
history = model.fit(
    x=[tr_users, tr_items],
    y=tr_ratings,
    epochs=1,
    verbose=1,
    callbacks=[checkpoint_cb],
)



### Tuning the model

In [None]:
# Reload the saved model and report RMSE on the train and validation splits
model = tf.keras.models.load_model('models/model.h5')

# The validation split is evaluated last, so y_pred ends up holding the
# validation-set predictions once this cell has run.
for label, users, items, ratings in (
        ("Train set RMSE: ", tr_users, tr_items, tr_ratings),
        ("Validation set RMSE: ", val_users, val_items, val_ratings)):
    y_pred = model.predict([users, items])
    print(label, rmse(y_pred, ratings))

Train set RMSE:  0.955914160752847
Validation set RMSE:  1.0640883301074038


In [None]:
# Starting value for the search: RMSE of the current y_pred against the
# validation ratings (y_pred must hold validation-set predictions here).
best = rmse(pred=y_pred, actual=val_ratings)

In [None]:
# Candidate settings, enumerated in the same order as four nested loops would:
# i1 / i3 are the exponents (base 2) of the two hidden-layer widths,
# i2 / i4 select the two dropout rates as 0.1 + 0.01 * i.
param_grid = [
    (i1, i2, i3, i4)
    for i1 in [5, 6]
    for i2 in range(12, 15)
    for i3 in [7, 8]
    for i4 in range(10, 12)
]

# Grid indices of the best configuration found so far (None until `best` is beaten)
best_params = None

for i1, i2, i3, i4 in param_grid:
    model = build_ncf_model(
        n_users, n_items, embed_size=50, output_layer='cus',
        params=[2**i1, 0.1+0.01*i2, 2**i3, 0.1+0.01*i4, 1])
    model.compile(optimizer='adam', loss='mse')

    # The freshly trained in-memory model is evaluated directly; writing a
    # checkpoint that the next trial overwrites and reading it straight back
    # would only add disk I/O.
    history = model.fit(
        [tr_users, tr_items],
        tr_ratings,
        epochs=1,
        verbose=0)

    # Only the validation score drives the selection, so the (costly) train-set
    # prediction is skipped and the RMSE is computed once per trial.
    y_pred = model.predict([val_users, val_items])
    val_rmse = rmse(y_pred, val_ratings)
    if val_rmse < best:
        best = val_rmse
        best_params = (i1, i2, i3, i4)
        print("Update current best with value", val_rmse)
        print("Best params is "+str(i1)+" "+str(i2)+" "+str(i3)+" "+str(i4))

Update current best with value 1.0628255374216928
Best params is 5 12 8 10
Update current best with value 1.0623724321992232
Best params is 5 12 8 11
Update current best with value 1.0619134493239775
Best params is 5 13 8 10
Update current best with value 1.0615142402393105
Best params is 5 14 8 10
Update current best with value 1.0605953145581708
Best params is 6 13 8 10


# Prediction

evaluate.py

In [None]:
# Hyper-parameters of the 'cus' head:
# [units_1, dropout_1, units_2, dropout_2, output_units]
# NOTE(review): these values equal grid indices "5 14 8 10", while the last
# "Best params" line recorded by the search is "6 13 8 10"
# (i.e. [64, 0.23, 256, 0.2, 1]) - confirm which setting is intended.
final_params = [32, 0.24, 256, 0.2, 1]

model = build_ncf_model(
    n_users, n_items, embed_size=50, output_layer='cus', params=final_params)
model.compile(loss='mse', optimizer='adam')

# Train for one epoch, writing the model to disk along the way
checkpoint_cb = ModelCheckpoint('models/model.h5')
history = model.fit(
    x=[tr_users, tr_items],
    y=tr_ratings,
    epochs=1,
    verbose=0,
    callbacks=[checkpoint_cb],
)

# Score the saved copy of the model on every split
model = tf.keras.models.load_model('models/model.h5')

tr_pred = model.predict([tr_users, tr_items])
print("Train set RMSE: ", rmse(tr_pred, tr_ratings))

val_pred = model.predict([val_users, val_items])
print("Validation set RMSE: ", rmse(val_pred, val_ratings))

test_pred = model.predict([test_users, test_items])

Train set RMSE:  0.9761562416079695
Validation set RMSE:  1.0596320973598592


In [None]:
# Work on a copy: `val = val_df` would only alias the frame, so writing the
# predictions into it would overwrite the ground truth stars held in val_df.
val = val_df.copy()
# val_pred comes from model.predict (presumably shaped (N, 1) - TODO confirm);
# flatten it so the column receives one value per row.
val["stars"] = np.ravel(val_pred)
val.to_csv("val_pred.csv", index=False)

In [None]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def rmse(pred, actual):
    """Root mean squared error over the entries whose ground truth is non-zero.

    pred and actual are numpy arrays holding the same number of elements; both
    are flattened before being compared, so their shapes need not match.
    Raises ValueError if the sizes differ or no ground truth entry is non-zero.
    Returns a Python float.
    """
    pred = pred.reshape(-1)
    actual = actual.reshape(-1)
    if pred.size != actual.size:
        raise ValueError("pred and actual must have the same number of elements")

    # Ignore zero terms: a ground truth value of 0 is left out of the error.
    rated = actual != 0
    if not rated.any():
        raise ValueError("rmse needs at least one non-zero ground truth rating")

    return sqrt(((pred[rated] - actual[rated]) ** 2).mean())

# Ground truth of the validation split, read fresh from the source CSV
val_df = pd.read_csv(f"{data}/valid.csv")

# Predictions previously written to disk
pred_df = pd.read_csv('val_pred.csv')

# Left join on the (user, business) pair keeps every ground truth row;
# the overlapping "stars" column becomes stars_x (left) and stars_y (right).
key_cols = ['user_id', 'business_id']
df = pd.merge(val_df, pred_df, how="left", on=key_cols)

# Rows without a matching prediction get 0 in place of the missing values
df = df.fillna(0)

print("VALIDATION RMSE: ", rmse(df['stars_y'].values, df['stars_x'].values))

VALIDATION RMSE:  1.0596320974266384


In [None]:
# Number of training rows per business id, largest count first
tr_df.business_id.value_counts()

a872871042c5c46838a1ba924dab1f6c    106
fd37fd10d66b673df363b646a26c9ab3     94
a3e891f8e82805513255e0e0fb1dafa8     86
918191ea9d67f9f664a111dbce971237     77
f441407637e366ac9dbadc249f6241bb     77
                                   ... 
95c1bd5fd7e26428382a82c2a547d852      1
c657ef76ba2c26d47409221ae13f8b49      1
ddf68eee71d5772a0840911c4b3eb083      1
e8e838c80af3f889a789b20745531851      1
493dfe7c8119d73bd1ecadd361094682      1
Name: business_id, Length: 5938, dtype: int64