# NCF Recommendation System

Modified from https://github.com/mdipietro09/DataScience_ArtificialIntelligence_Utils/blob/master/machine_learning/example_recommendation.ipynb

### 0 - Setup

###### Import packages

In [None]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics, preprocessing
from tensorflow.keras import metrics, models, layers, utils  #(2.6.0)

###### Import data

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the cleaned New York reviews table; the first CSV column becomes the row index.
NY_reviews = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/DS4A Team4/New_York_reviews_cleaned.csv',
    index_col=0,
)

###### Partitioning

In [None]:
# Collapse repeated reviews so that every (author, restaurant) pair appears once,
# keeping the number of reviews, their mean rating and the maximum `date` value.
grouped = NY_reviews.groupby(['author_id', 'restaurant_name'])

review_counts = grouped.size().rename('review_count').reset_index()
review_avg = grouped['rating_review'].mean().rename('review_avg').reset_index()
review_date = grouped['date'].max().rename('review_date').reset_index()

# All three frames come from the same groupby, so their rows line up one-to-one.
review_counts['avg_rating'] = review_avg['review_avg']
review_counts['date'] = review_date['review_date']

# Order the pairs by descending `date`.
multi_review_list = review_counts.sort_values('date', ascending=False)

In [None]:
# Author x restaurant matrix of average ratings. The labels are saved before the
# frame is turned into a plain array in which missing pairs are stored as 0.
dtf_users = multi_review_list.pivot(
    index='author_id',
    columns='restaurant_name',
    values='avg_rating',
)
user_names = dtf_users.index
restaurant_names = dtf_users.columns
dtf_users = dtf_users.fillna(0).to_numpy()

In [None]:
MIN_USER_RATINGS = 3

def train_test_split(ratings, reviews=None, users=None, restaurants=None):
  """Move up to two ratings per sufficiently active user into a validation matrix.

  A user (row) takes part when the row holds at least MIN_USER_RATINGS non-zero
  entries. For such a user the restaurants named in the first two rows that
  `reviews` holds for that author (in the table's current row order) are zeroed
  in the training copy and copied into the validation matrix.

  Parameters
  ----------
  ratings : 2-D numpy array, one row per user and one column per restaurant,
      with 0 meaning "no rating". It is not modified.
  reviews : DataFrame with 'author_id' and 'restaurant_name' columns.
      Defaults to the notebook-level `multi_review_list`.
  users : sequence mapping row position -> author id.
      Defaults to the notebook-level `user_names`.
  restaurants : sequence mapping column position -> restaurant name.
      Defaults to the notebook-level `restaurant_names`.

  Returns
  -------
  (train, validation) : two arrays with the same shape as `ratings`.

  Raises
  ------
  KeyError
      If an eligible user has no row in `reviews`, or a held-out restaurant
      name is not present in `restaurants`.
  """
  if reviews is None:
    reviews = multi_review_list
  if users is None:
    users = user_names
  if restaurants is None:
    restaurants = restaurant_names

  validation = np.zeros(ratings.shape)
  train = ratings.copy()

  # Rows with enough non-zero entries to give two of them away.
  eligible_rows = np.flatnonzero(np.count_nonzero(ratings, axis=1) >= MIN_USER_RATINGS)
  if eligible_rows.size == 0:
    return train, validation

  # Build the author -> first-two-restaurants lookup in one pass over the
  # review table instead of filtering the whole table once per user.
  first_two = reviews.groupby('author_id', sort=False).head(2)
  held_out = first_two.groupby('author_id', sort=False)['restaurant_name'].agg(list).to_dict()

  # Restaurant name -> column position; the first occurrence wins, which is what
  # np.where(restaurants == name)[0][0] would return.
  column_of = {}
  for position, name in enumerate(restaurants):
    column_of.setdefault(name, position)

  for row in eligible_rows:
    val_ratings = [column_of[name] for name in held_out[users[row]]]
    train[row, val_ratings] = 0
    validation[row, val_ratings] = ratings[row, val_ratings]
  return train, validation

In [None]:
# Split the rating matrix into a training matrix and a validation matrix of the same shape.
dtf_train, dtf_test = train_test_split(dtf_users)

In [None]:
# Wrap the training array in a DataFrame so the pandas reshaping tools used later apply.
dtf_train = pd.DataFrame(dtf_train)

In [None]:
# Wrap the validation array in a DataFrame as well.
dtf_test = pd.DataFrame(dtf_test)

In [None]:
# Label rows with consecutive integers starting at 0.
dtf_test.index = range(len(dtf_test))
dtf_train.index = range(len(dtf_train))

In [None]:
# Label columns with consecutive integers starting at 0.
dtf_train.columns = range(dtf_train.shape[1])
dtf_test.columns = range(dtf_test.shape[1])

In [None]:
# Wrap the full rating array in a DataFrame (the name is reused for the new object).
dtf_users = pd.DataFrame(dtf_users)

In [None]:
# Turn the 0 placeholders into NaN.
dtf_users = dtf_users.replace(0, np.nan)

In [None]:
# Turn the 0 placeholders into NaN.
dtf_train = dtf_train.replace(0, np.nan)

In [None]:
# Turn the 0 placeholders into NaN.
dtf_test = dtf_test.replace(0, np.nan)

###### Model Design & Testing

###### Data

In [None]:
# Width, in columns, of the chunks in which dtf_train is stacked into long format.
split = 361

In [None]:
# Long format (row label, column label, rating) for the first column chunk.
# The slice ends at `split` (exclusive) so that column 360 is not skipped
# between this chunk and the one starting at `split`.
train1 = dtf_train.iloc[:, :split].stack(dropna=True).reset_index().rename(columns={0:"y"})

In [None]:
# Long format for the second column chunk. The slice ends at 2*split (exclusive)
# so that the column just before the third chunk's start is not dropped.
train2 = dtf_train.iloc[:, split:2*split].stack(dropna=True).reset_index().rename(columns={0:"y"})

In [None]:
# Long format for the third column chunk. The slice ends at 3*split (exclusive)
# so that the column just before the last chunk's start is not dropped.
train3 = dtf_train.iloc[:, 2*split:3*split].stack(dropna=True).reset_index().rename(columns={0:"y"})

In [None]:
# Long format for every remaining column from position 3*split onwards.
train4 = dtf_train.iloc[:,split*3:].stack(dropna=True).reset_index().rename(columns={0:"y"})

In [None]:
# Stitch the column chunks back into one long table and name its three columns.
chunks = (train1, train2, train3, train4)
train = pd.concat(chunks)
train.columns = ["user", "product", "y"]
train.head()

Unnamed: 0,user,product,y
0,2,162,4.0
1,8,247,4.0
2,9,25,4.0
3,10,25,5.0
4,11,25,4.0


In [None]:
# Long format of the held-out ratings: one row per non-NaN cell of dtf_test.
test = dtf_test.stack(dropna=True).reset_index()
test.columns = ["user", "product", "y"]
test.head()

Unnamed: 0,user,product,y
0,2,482,5.0
1,2,688,5.0
2,8,25,4.0
3,8,1415,5.0
4,11,244,3.0


###### Model Design

In [None]:
# Build a two-branch network that predicts a rating from a (user, product) index pair:
# branch A multiplies user and product embeddings (matrix factorization), branch B feeds
# separate embeddings through a dense layer, and a final linear unit merges both.
embeddings_size = 50
# Embedding vocabulary sizes: number of rows (users) and columns (products) of dtf_users.
usr, prd = dtf_users.shape[0], dtf_users.shape[1]

# Input layer: one integer index per sample for the user and for the product
xusers_in = layers.Input(name="xusers_in", shape=(1,))
xproducts_in = layers.Input(name="xproducts_in", shape=(1,))

# A) Matrix Factorization
## user embedding, reshaped to a flat vector of length embeddings_size
cf_xusers_emb = layers.Embedding(name="cf_xusers_emb", input_dim=usr, output_dim=embeddings_size)(xusers_in)
cf_xusers = layers.Reshape(name='cf_xusers', target_shape=(embeddings_size,))(cf_xusers_emb)
## product embedding, reshaped the same way
cf_xproducts_emb = layers.Embedding(name="cf_xproducts_emb", input_dim=prd, output_dim=embeddings_size)(xproducts_in)
cf_xproducts = layers.Reshape(name='cf_xproducts', target_shape=(embeddings_size,))(cf_xproducts_emb)
## dot product of the user and product vectors (not normalized)
cf_xx = layers.Dot(name='cf_xx', normalize=False, axes=1)([cf_xusers, cf_xproducts])

# B) Neural Network (uses its own embedding layers, separate from branch A)
## user embedding, reshaped to a flat vector
nn_xusers_emb = layers.Embedding(name="nn_xusers_emb", input_dim=usr, output_dim=embeddings_size)(xusers_in)
nn_xusers = layers.Reshape(name='nn_xusers', target_shape=(embeddings_size,))(nn_xusers_emb)
## product embedding, reshaped to a flat vector
nn_xproducts_emb = layers.Embedding(name="nn_xproducts_emb", input_dim=prd, output_dim=embeddings_size)(xproducts_in)
nn_xproducts = layers.Reshape(name='nn_xproducts', target_shape=(embeddings_size,))(nn_xproducts_emb)
## concatenate both vectors, then a ReLU dense layer with embeddings_size/2 units
nn_xx = layers.Concatenate()([nn_xusers, nn_xproducts])
nn_xx = layers.Dense(name="nn_xx", units=int(embeddings_size/2), activation='relu')(nn_xx)

# Merge A & B: concatenate the dot product with the dense output, then one linear output unit
y_out = layers.Concatenate()([cf_xx, nn_xx])
y_out = layers.Dense(name="y_out", units=1, activation='linear')(y_out)

# Compile
# NOTE: `metrics` here is tensorflow.keras.metrics, because that import comes after
# `from sklearn import metrics` and rebinds the name.
model = models.Model(inputs=[xusers_in,xproducts_in], outputs=y_out, name="Neural_CollaborativeFiltering")
model.compile(optimizer='adam', loss='mean_squared_error', metrics=[metrics.MeanSquaredError()])
model.summary()

Model: "Neural_CollaborativeFiltering"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 xusers_in (InputLayer)         [(None, 1)]          0           []                               
                                                                                                  
 xproducts_in (InputLayer)      [(None, 1)]          0           []                               
                                                                                                  
 nn_xusers_emb (Embedding)      (None, 1, 50)        12495050    ['xusers_in[0][0]']              
                                                                                                  
 nn_xproducts_emb (Embedding)   (None, 1, 50)        90250       ['xproducts_in[0][0]']           
                                                                      

###### Train / Test

In [None]:
def utils_plot_keras_training(training):
    '''
    Plot the training and validation curves of a keras training run.

    :parameter
        :param training: object whose `history` mapping holds, for every tracked
                         metric name, a series under that name and another under
                         'val_' + name
    '''
    tracked = ['mean_squared_error']
    fig, (train_ax, val_ax) = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(15,3))

    ## training: left panel
    train_ax.set(title="Training")
    train_ax.set_xlabel('Epochs')
    for name in tracked:
        train_ax.plot(training.history[name], label=name)
    train_ax.legend()

    ## validation: right panel, drawn on a twin y-axis
    val_ax.set(title="Validation")
    val_twin = val_ax.twinx()
    val_ax.set_xlabel('Epochs')
    for name in tracked:
        val_twin.plot(training.history['val_'+name], label=name)
    val_twin.legend()
    plt.show()

In [None]:
# train
# Keras takes the `validation_split` fraction from the END of the arrays, before
# any shuffling. `train` is a concatenation of column chunks, so its tail comes
# from the last chunk(s) of products only. Shuffle the rows first so the 30%
# validation slice is a random sample of (user, product) pairs.
train_shuffled = train.sample(frac=1, random_state=42)
training = model.fit(x=[train_shuffled["user"], train_shuffled["product"]], y=train_shuffled["y"],
                     epochs=40, batch_size=128, shuffle=True, verbose=0, validation_split=0.3)
model = training.model
utils_plot_keras_training(training)

In [None]:
# Predict a rating for every held-out (user, product) row and store it next to the true rating `y`.
test["yhat"] = model.predict([test["user"], test["product"]])
# test_result is built from `test`, so it carries the new `yhat` column too.
test_result = pd.DataFrame(test)



In [None]:
# Preview the first held-out rows: true rating `y` next to the prediction `yhat`.
test_result.head()

Unnamed: 0,user,product,y,yhat
0,2,482,5.0,4.363447
1,2,688,5.0,4.386676
2,8,25,4.0,4.238999
3,8,1415,5.0,4.301455
4,11,244,3.0,3.527523


###### Evaluate

In [None]:
# Top-k hit count: for each pair of consecutive test_result rows, score every
# product for that user and count how many of the two held-out products appear
# among the k highest predictions.
# Assumes rows 2*i and 2*i+1 of test_result belong to the same user - TODO confirm.
top_k = 6
n_products = dtf_users.shape[1]   # number of product columns (was hard-coded as 1805)
all_products = np.arange(n_products)

positive_count = 0
total_count = 0
for i in range(int(test_result.shape[0]/2)):
  total_count += 2
  user = test_result['user'].iloc[i*2]
  test_case = pd.DataFrame({'user': np.full(n_products, user), 'product': all_products})
  test_case["yhat"] = model.predict([test_case["user"], test_case["product"]])
  test_top6 = test_case.sort_values(by=['yhat'], ascending=False).iloc[:top_k]
  # Read the ids from the `product` column of test_top6 (the original referenced
  # an undefined name `top6` here).
  restaurant_index = test_top6['product'].tolist()
  rest_1 = test_result['product'].iloc[i*2]
  rest_2 = test_result['product'].iloc[i*2+1]
  if rest_1 in restaurant_index:
    positive_count += 1
  if rest_2 in restaurant_index:
    positive_count += 1
print('postive')
print(positive_count)
print('total')
print(total_count)

In [None]:
# Pairwise ordering check: treat consecutive test_result rows (2*i, 2*i+1) as a pair
# and count how often the predictions order the pair the same way as the true ratings.
# Assumes both rows of a pair belong to the same user - TODO confirm.
n_pairs = test_result.shape[0] // 2
y_pairs = test_result['y'].to_numpy()[:2*n_pairs].reshape(n_pairs, 2)
yhat_pairs = test_result['yhat'].to_numpy()[:2*n_pairs].reshape(n_pairs, 2)

rating_1, rating_2 = y_pairs[:, 0], y_pairs[:, 1]
model_result_1, model_result_2 = yhat_pairs[:, 0], yhat_pairs[:, 1]

first_higher = rating_1 > rating_2
second_higher = rating_1 < rating_2

# Pairs with identical true ratings carry no ordering information.
same_rating_count = int((rating_1 == rating_2).sum())
# A tie in the predictions counts as agreement, as in the original comparisons.
positive_count = int((first_higher & (model_result_1 >= model_result_2)).sum()
                     + (second_higher & (model_result_1 <= model_result_2)).sum())
negative_count = int((first_higher & (model_result_1 < model_result_2)).sum()
                     + (second_higher & (model_result_1 > model_result_2)).sum())
# The original never updated total_count, so it always printed 0; report the number of pairs.
total_count = n_pairs

print('postive')
print(positive_count)
print('negative')
print(negative_count)
print('same ratings')
print(same_rating_count)
print('total')
print(total_count)

postive
14013
negative
10694
same ratings
18733
total
0


In [None]:
# overall
y_test = test_result["y"]
predicted = test_result["yhat"]
# `metrics` was rebound to tensorflow.keras.metrics by the later import, so
# metrics.mean_squared_error is not the scikit-learn function. Compute the mean
# squared error directly so a plain float is printed.
print(float(np.mean(np.square(y_test - predicted))))