# Recommendation system with NCF Model

# Table of contents

- [1 - Packages](#1)
- [2 - Load and Preprocess data](#2)
- [3 - Define model](#3)
- [4 - Result](#4)


<a name='1'></a>

# 1 - Packages

In [65]:
import csv
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout, Multiply
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split

<a name='2'></a>

# 2 - Load and preprocess data

### Read data from csv 

In [66]:
df = pd.read_csv("./Dataset/all_catagories/Processed_all_datas.csv")

In [67]:
df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_date
0,US,51632887,R3B581VNKYCP16,B00HFPOXM4,114966677,Garmin Vivofit Fitness Band,Wireless,5.0,5854.0,6063.0,N,Y,2014-03-12
1,US,49452274,RT0KPKVSQD0HI,B00A17IAO0,979081054,UP by Jawbone Wristband,Wireless,5.0,4856.0,5029.0,N,Y,2012-11-27
2,US,44086587,RYBUAAD9JZ1VW,B001S2RCWI,835787812,Garmin Portable Friction Dashboard Mount,Wireless,5.0,4017.0,4048.0,N,Y,2009-06-14
3,US,49452274,R2Z0F95XGL71C6,B00GOGV314,190508754,"UP24 by Jawbone Wristband, Retail Packaging",Wireless,5.0,3914.0,4022.0,N,N,2013-12-08
4,US,18464808,RZ0J3PVMPU4CJ,B00DGEGJ02,212863722,"Wemo Wi-Fi enabled, Works with Amazon Alexa",Wireless,1.0,3667.0,3894.0,N,Y,2013-11-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8595,US,49235154,RZBO5U8IYTG1V,0393315703,194483165,The Blind Watchmaker: Why the Evidence of Evol...,Books,5.0,857.0,983.0,N,N,2001-09-11
8596,US,48890006,RPP8P15M3U2JL,0879838167,472643259,Nutrition and Physical Degeneration,Books,5.0,961.0,980.0,N,Y,2004-01-21
8597,US,20953285,R3C1BTS75WHVUQ,088404632X,156656124,Dianetics: The Modern Science of Mental Health,Books,5.0,36.0,980.0,N,N,2004-02-16
8598,US,50823784,R2MXU0D7H4MPX8,0465016901,241968441,The Drama of the Gifted Child: The Search for ...,Books,4.0,950.0,980.0,N,N,2000-11-07


In [68]:
df["product_id"].sort_values().unique()

array(['0029146739', '0060188707', '0060194480', ..., 'BT00DDC7BK',
       'BT00DDC7CE', 'BT00DDVMVQ'], dtype=object)

In [69]:
df["customer_id"].sort_values().unique()


array([   30680,    69547,    85415, ..., 53095900, 53095958, 53096553])

### Convert item IDs and user IDs to numerical IDs

In [70]:
#get original id from mapped id
def get_original_id(id, mapping_dict):
  for original_id, mapped_id in mapping_dict.items():
      if mapped_id == id:
          return original_id

In [71]:
#get df with original customer id and product id
def get_original_df(df):
  new_df = df.copy()
  new_df['customer_id'] = new_df['customer_id'].apply(get_original_id, args=(customer_dict,))
  new_df['product_id'] = new_df['product_id'].apply(get_original_id, args=(item_dict,))
  return new_df

In [72]:
# Convert word character item IDs to numerical IDs
item_dict = {}
count = 0
new_df = df.copy()
for item in df["product_id"].sort_values().unique():
    item_dict[item] = count
    count += 1
new_df["product_id"] = new_df["product_id"].apply(lambda x: item_dict[x])

In [73]:
new_df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_date
0,US,51632887,R3B581VNKYCP16,5231,114966677,Garmin Vivofit Fitness Band,Wireless,5.0,5854.0,6063.0,N,Y,2014-03-12
1,US,49452274,RT0KPKVSQD0HI,4530,979081054,UP by Jawbone Wristband,Wireless,5.0,4856.0,5029.0,N,Y,2012-11-27
2,US,44086587,RYBUAAD9JZ1VW,2186,835787812,Garmin Portable Friction Dashboard Mount,Wireless,5.0,4017.0,4048.0,N,Y,2009-06-14
3,US,49452274,R2Z0F95XGL71C6,5194,190508754,"UP24 by Jawbone Wristband, Retail Packaging",Wireless,5.0,3914.0,4022.0,N,N,2013-12-08
4,US,18464808,RZ0J3PVMPU4CJ,4930,212863722,"Wemo Wi-Fi enabled, Works with Amazon Alexa",Wireless,1.0,3667.0,3894.0,N,Y,2013-11-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8595,US,49235154,RZBO5U8IYTG1V,52,194483165,The Blind Watchmaker: Why the Evidence of Evol...,Books,5.0,857.0,983.0,N,N,2001-09-11
8596,US,48890006,RPP8P15M3U2JL,146,472643259,Nutrition and Physical Degeneration,Books,5.0,961.0,980.0,N,Y,2004-01-21
8597,US,20953285,R3C1BTS75WHVUQ,147,156656124,Dianetics: The Modern Science of Mental Health,Books,5.0,36.0,980.0,N,N,2004-02-16
8598,US,50823784,R2MXU0D7H4MPX8,71,241968441,The Drama of the Gifted Child: The Search for ...,Books,4.0,950.0,980.0,N,N,2000-11-07


In [74]:
# Convert customer IDs to another range of value
customer_dict = {}
count = 0
for customer in df["customer_id"].sort_values().unique():
    customer_dict[customer] = count
    count += 1
new_df["customer_id"] = new_df["customer_id"].apply(lambda x: customer_dict[x])


In [75]:
new_df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_date
0,US,6912,R3B581VNKYCP16,5231,114966677,Garmin Vivofit Fitness Band,Wireless,5.0,5854.0,6063.0,N,Y,2014-03-12
1,US,6027,RT0KPKVSQD0HI,4530,979081054,UP by Jawbone Wristband,Wireless,5.0,4856.0,5029.0,N,Y,2012-11-27
2,US,5022,RYBUAAD9JZ1VW,2186,835787812,Garmin Portable Friction Dashboard Mount,Wireless,5.0,4017.0,4048.0,N,Y,2009-06-14
3,US,6027,R2Z0F95XGL71C6,5194,190508754,"UP24 by Jawbone Wristband, Retail Packaging",Wireless,5.0,3914.0,4022.0,N,N,2013-12-08
4,US,1849,RZ0J3PVMPU4CJ,4930,212863722,"Wemo Wi-Fi enabled, Works with Amazon Alexa",Wireless,1.0,3667.0,3894.0,N,Y,2013-11-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8595,US,5970,RZBO5U8IYTG1V,52,194483165,The Blind Watchmaker: Why the Evidence of Evol...,Books,5.0,857.0,983.0,N,N,2001-09-11
8596,US,5885,RPP8P15M3U2JL,146,472643259,Nutrition and Physical Degeneration,Books,5.0,961.0,980.0,N,Y,2004-01-21
8597,US,2116,R3C1BTS75WHVUQ,147,156656124,Dianetics: The Modern Science of Mental Health,Books,5.0,36.0,980.0,N,N,2004-02-16
8598,US,6538,R2MXU0D7H4MPX8,71,241968441,The Drama of the Gifted Child: The Search for ...,Books,4.0,950.0,980.0,N,N,2000-11-07


### Split train,test data and generate negative example

In [76]:
test_ratio = 0.2
train_data, test_data = train_test_split(new_df, test_size=test_ratio,shuffle= True,random_state=42)
train_data=train_data[["customer_id","product_id","star_rating","verified_purchase"]]
test_data=test_data[["customer_id","product_id","star_rating","verified_purchase"]]
train_data["verified_purchase"] = train_data["verified_purchase"].apply(lambda x: 1 if x == "Y" else 0)
test_data["verified_purchase"] = test_data["verified_purchase"].apply(lambda x: 1 if x == "Y" else 0)

In [77]:
# Create a function to generate negative samples
def generate_negative_samples(user, num_samples, all_items, rated_items):
    unrated_items = list(all_items - rated_items)
    items = np.random.choice(unrated_items, size=num_samples, replace=False)
    return pd.DataFrame({
        "customer_id": [user] * num_samples,
        "product_id": items,
        "star_rating": [0] * num_samples,
        "verified_purchase": [0] * num_samples
    })
    
num_negative_samples = 4
all_items = set(range(len(item_dict)))
train_neg_data = []
for user in train_data["customer_id"].unique():
    user_data = train_data[train_data["customer_id"] == user]
    rated_items = set(user_data["product_id"].values)
    neg_data = generate_negative_samples(user, num_negative_samples, all_items, rated_items)
    train_neg_data.append(neg_data)
train_neg_data = pd.concat(train_neg_data)
train_data = pd.concat([train_data, train_neg_data])
train_data = train_data.sample(frac=1).reset_index(drop=True)

In [78]:
train_data


Unnamed: 0,customer_id,product_id,star_rating,verified_purchase
0,5854,1615,0.0,0
1,1719,4195,0.0,0
2,5245,1739,4.0,1
3,751,2743,0.0,0
4,1110,828,0.0,0
...,...,...,...,...
32631,6323,475,0.0,0
32632,5017,5237,0.0,0
32633,566,2880,0.0,0
32634,6168,5043,5.0,1


### Export train and test data to csv

In [79]:
train_data.to_csv("./Dataset/NeuMF_data/neumf_train_data.csv", index=False)
test_data.to_csv("./Dataset/NeuMF_data/neumf_test_data.csv", index=False)

<a name='3'></a>

# 3 - Define model

In [80]:
# Load and preprocess data
train_data = pd.read_csv('./Dataset/NeuMF_data/neumf_train_data.csv')
test_data = pd.read_csv('./Dataset/NeuMF_data/neumf_test_data.csv')

#remove data with 0s rating
train_data_rating = train_data[train_data['star_rating'] != 0]
test_data_rating = test_data[test_data['star_rating'] != 0]

concat_df = pd.concat([train_data, test_data], axis=0)
num_users, num_items = len(concat_df.customer_id.unique()), len(concat_df.product_id.unique())

In [81]:
def get_model(num_users, num_items, latent_dim=8, dense_layers=[64, 32, 16, 8],
              reg_layers=[0, 0, 0, 0], reg_mf=0, model_type='predict_rating'):

    # input layer
    input_user = Input(shape=(1,), dtype='int32', name='user_input')
    input_item = Input(shape=(1,), dtype='int32', name='item_input')
    
    # embedding layer
    mf_user_embedding = Embedding(input_dim=num_users, output_dim=latent_dim,
                        name='mf_user_embedding',
                        embeddings_initializer='RandomNormal',
                        embeddings_regularizer=l2(reg_mf), input_length=1)
    mf_item_embedding = Embedding(input_dim=num_items, output_dim=latent_dim,
                        name='mf_item_embedding',
                        embeddings_initializer='RandomNormal',
                        embeddings_regularizer=l2(reg_mf), input_length=1)
    mlp_user_embedding = Embedding(input_dim=num_users, output_dim=int(dense_layers[0]/2),
                         name='mlp_user_embedding',
                         embeddings_initializer='RandomNormal',
                         embeddings_regularizer=l2(reg_layers[0]), 
                         input_length=1)
    mlp_item_embedding = Embedding(input_dim=num_items, output_dim=int(dense_layers[0]/2),
                         name='mlp_item_embedding',
                         embeddings_initializer='RandomNormal',
                         embeddings_regularizer=l2(reg_layers[0]), 
                         input_length=1)

    # MF latent vector
    mf_user_latent = Flatten()(mf_user_embedding(input_user))
    mf_item_latent = Flatten()(mf_item_embedding(input_item))
    mf_cat_latent = Multiply()([mf_user_latent, mf_item_latent])


    # MLP latent vector
    mlp_user_latent = Flatten()(mlp_user_embedding(input_user))
    mlp_item_latent = Flatten()(mlp_item_embedding(input_item))
    mlp_cat_latent = Concatenate()([mlp_user_latent, mlp_item_latent])
    
    mlp_vector = mlp_cat_latent
    
    # build dense layer for model
    for i in range(1,len(dense_layers)):
        layer = Dense(dense_layers[i],
                      activity_regularizer=l2(reg_layers[i]),
                      activation='relu',
                      name='layer%d' % i)
        mlp_vector = layer(mlp_vector)

    predict_layer = Concatenate()([mf_cat_latent, mlp_vector])

    activation = 'sigmoid'
    output_units = 1

    if model_type == 'predict_rating': 
      activation = 'softmax'
      output_units = 5

    result = Dense(units=output_units, activation=activation,  # softmax for rating, sigmoid for verified_purchase
                   kernel_initializer='lecun_uniform',name='result')

    model = Model(inputs=[input_user,input_item], outputs=result(predict_layer))
    return model

In [82]:
# Define configuration
epochs = 5
batch_size = 64
latent_dim = 8
dense_layers = [64, 32, 16, 8]
reg_layers = [0.1, 0.1, 0, 0]
reg_mf = 0
learning_rate = 0.01
learner = 'adam'

In [83]:
# create model for predicting rating
#model_rating = get_model(num_users, num_items, latent_dim, dense_layers, reg_layers, reg_mf, model_type='predict_rating')
model_rating = tf.keras.models.load_model('./Model_weight/model_rating.h5',compile=False)
model_rating.compile(optimizer=Adam(learning_rate=learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])
print(model_rating.summary())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 item_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 mlp_user_embedding (Embedding)  (None, 1, 32)       254976      ['user_input[0][0]']             
                                                                                                  
 mlp_item_embedding (Embedding)  (None, 1, 32)       190272      ['item_input[0][0]']             
                                                                                              

In [84]:
# create model for predicting purchase rate
reg_layers = [0,0,0,0]
#model_vp = get_model(num_users, num_items, latent_dim, dense_layers, reg_layers, reg_mf, model_type='predict_vp')

#Load the trained models
model_vp = tf.keras.models.load_model('./Model_weight/model_vp.h5',compile=False)
model_vp.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])

print(model_rating.summary())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 item_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 mlp_user_embedding (Embedding)  (None, 1, 32)       254976      ['user_input[0][0]']             
                                                                                                  
 mlp_item_embedding (Embedding)  (None, 1, 32)       190272      ['item_input[0][0]']             
                                                                                              

In [85]:
#convert the labels to onehot
train_labels = to_categorical(np.array(train_data.star_rating)-1, num_classes=5)
test_labels = to_categorical(np.array(test_data.star_rating)-1, num_classes=5)

<a name='4'></a>

# 4 - Result


### Result of predicting-purchase status model

In [86]:
model_vp.evaluate([np.array(test_data.customer_id), np.array(test_data.product_id)]
                  , np.array(test_data.verified_purchase))




[0.9114682674407959, 0.5779069662094116]

### Result of predicting rating  model

In [87]:
model_rating.evaluate([np.array(test_data.customer_id), np.array(test_data.product_id)],
                          test_labels)



[1.3670721054077148, 0.5279069542884827]

### Make recommendation for a user id

In [88]:
def get_recommendations(model, desired_user_id):
      mlp_user_embedding_weights = (next(iter(filter(lambda x: x.name == 'mlp_user_embedding', model.layers))).get_weights())

      # get the latent embedding for the desired user
      user_latent_matrix = mlp_user_embedding_weights[0]
        
      one_user_vector = user_latent_matrix[desired_user_id,:]
      one_user_vector = np.reshape(one_user_vector, (1,32))

      #cluster users into 100 clusters
      kmeans = KMeans(n_clusters=100, random_state=0, n_init='auto', verbose=0).fit(user_latent_matrix)

      #predict the cluster that the desired user belongs to
      desired_user_label = kmeans.predict(one_user_vector)
      user_label = kmeans.labels_ #no of user_ids

      #get user_ids in same cluster
      neighbors = []
      for user_id, user_label in enumerate(user_label):
          if user_label == desired_user_label:
              neighbors.append(user_id)

      #get the corresponding product ids of those users
      product_ids = []
      for user_id in neighbors:
          product_ids += list(train_data[train_data['customer_id'] == int(user_id)]['product_id'])

      product_ids = list(set(product_ids))

      users = np.full(len(product_ids), desired_user_id, dtype='int32')
      items = np.array(product_ids, dtype='int32')

      #make predictions on the users and products
      predictions = model.predict([users,items],batch_size=100, verbose=0) 
      return product_ids, predictions
    
def round_predictions(arr):
    result = arr.copy()
    result[result < 0] = 0
    result[result < 0.5] = np.floor(result[result < 0.5])
    result[result >= 0.5] = np.ceil(result[result >= 0.5])
    result[result > 5] = 5
    return result   

def get_final_result(model_rating,model_vp, desired_user_id):
    # get predicted product id and rating 
    product_ids_rating, predictions_rating = get_recommendations(model_rating, desired_user_id)
    
    # Convert the predictions from a probability distribution to a single integer between 1 and 5 (inclusive)
    predictions_rating = np.argmax(predictions_rating, axis=1) + 1
    
    # Create a DataFrame from the predicted product ids and star ratings
    results_rating_df = pd.DataFrame({'product_id': product_ids_rating, 'star_rating': predictions_rating})
    results_rating_df = results_rating_df.sort_values(by='star_rating', ascending=False)
    
    # Get predicted product ids and verified purchase status 
    product_ids_vp, predictions_vp = get_recommendations(model_vp, desired_user_id)
    
    # Round the predicted verified purchase status to either 0 or 1
    predictions_vp = [i[0] for i in predictions_vp]
    predictions_vp = round_predictions(np.array(predictions_vp))
    
    # Create a DataFrame from the predicted product ids and verified purchase status
    results_vp_df = pd.DataFrame({'product_id': product_ids_vp, 'verified_purchase': predictions_vp})
    results_vp_df = results_vp_df.sort_values(by='verified_purchase', ascending=False)
    
    # get only result > 3.0 rating and verified status is 1
    predicted_final_result = pd.merge(results_rating_df[results_rating_df["star_rating"] >= 3.0 ],
                                      results_vp_df[results_vp_df["verified_purchase"] == 1.0])
    # Drop duplicate product id in processed data above to extract only product related information
    undup_new_df = new_df.drop_duplicates(subset = "product_id")
    final_result = pd.merge(predicted_final_result["product_id"],undup_new_df, on=["product_id"])
    return final_result,predicted_final_result,results_vp_df,results_rating_df

In [89]:
desired_user_id = 500
final_result,predicted_final_result,results_vp_df,results_rating_df = get_final_result(model_rating,model_vp, desired_user_id)
results_vp_df

Unnamed: 0,product_id,verified_purchase
0,4737,1.0
33,5821,1.0
35,453,1.0
36,4678,1.0
37,1350,1.0
...,...,...
27,4266,1.0
28,1193,1.0
29,430,1.0
30,1329,1.0


In [90]:
results_rating_df

Unnamed: 0,product_id,star_rating
0,1024,5
153,2830,5
140,5877,5
143,5373,5
144,1278,5
...,...,...
95,674,1
32,583,1
110,5820,1
186,5436,1


In [91]:
predicted_final_result

Unnamed: 0,product_id,star_rating,verified_purchase
0,2267,5,1.0
1,1444,5,1.0
2,3320,5,1.0
3,2069,4,1.0
4,970,4,1.0
5,1391,4,1.0


In [92]:
final_result

Unnamed: 0,product_id,marketplace,customer_id,review_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_date
0,2267,US,3937,R2Q8UK3PHI96TL,558600272,"Fellowes Powershred W11C, 11-Sheet Cross-cut P...",Office Products,4.0,1403.0,1417.0,N,N,2010-03-07
1,1444,US,4712,R2TW3DCXN21VGF,895316207,"Intex River Run I Sport Lounge, Inflatable Wat...",Toys,5.0,1196.0,1239.0,N,Y,2011-03-23
2,3320,US,5327,R4K2Y6W0PHIIJ,777109513,Curt Cargo Rack Cargo Rack Extension Cargo Net...,Automotive,1.0,689.0,707.0,N,Y,2014-05-19
3,2069,US,1572,R14T1LK2CLFKDM,842617404,Rain for Sleeping and Relaxation,Digital_Music_Purchase,5.0,277.0,282.0,N,N,2008-11-16
4,970,US,6230,RU7X8MKRVL6JC,83009018,"X-Mat Original Pet Training Mat, Firm, 18-Inch",Pet Products,2.0,726.0,730.0,N,Y,2012-11-10
5,1391,US,2980,RCIG4N9SZJS9W,853720270,Schlage Camelot Keypad Deadbolt,Home Improvement,5.0,814.0,839.0,N,Y,2008-12-19


In [93]:
#df with actual original customer id and product id
new_result = get_original_df(final_result)
new_result

Unnamed: 0,product_id,marketplace,customer_id,review_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_date
0,B0026I2HLO,US,36730202,R2Q8UK3PHI96TL,558600272,"Fellowes Powershred W11C, 11-Sheet Cross-cut P...",Office Products,4.0,1403.0,1417.0,N,N,2010-03-07
1,B000PEOMC8,US,42213107,R2TW3DCXN21VGF,895316207,"Intex River Run I Sport Lounge, Inflatable Wat...",Toys,5.0,1196.0,1239.0,N,Y,2011-03-23
2,B004O844DS,US,45701078,R4K2Y6W0PHIIJ,777109513,Curt Cargo Rack Cargo Rack Extension Cargo Net...,Automotive,1.0,689.0,707.0,N,Y,2014-05-19
3,B001K5AAJA,US,16589159,R14T1LK2CLFKDM,842617404,Rain for Sleeping and Relaxation,Digital_Music_Purchase,5.0,277.0,282.0,N,N,2008-11-16
4,B0009YD7M0,US,50048341,RU7X8MKRVL6JC,83009018,"X-Mat Original Pet Training Mat, Firm, 18-Inch",Pet Products,2.0,726.0,730.0,N,Y,2012-11-10
5,B000NJJ1MQ,US,29087249,RCIG4N9SZJS9W,853720270,Schlage Camelot Keypad Deadbolt,Home Improvement,5.0,814.0,839.0,N,Y,2008-12-19


In [94]:
print("Number of positive rating recommend : ",len(new_result[new_result["star_rating"] >= 3.0]))
print("Number of negative rating recommend : ",len(new_result[new_result["star_rating"] < 3.0]))
stat_rating =len(new_result[new_result["star_rating"] >= 3.0])-len(new_result[new_result["star_rating"] < 3.0]) 
print("Difference proportion : ",stat_rating)

Number of positive rating recommend :  4
Number of negative rating recommend :  2
Difference proportion :  2


In [95]:
print("Number of positive rating purchase status : ",len(new_result[new_result["verified_purchase"] == "Y"]))
print("Number of negative rating purchase status : ",len(new_result[new_result["verified_purchase"] == "N"]))
stat_rating =len(new_result[new_result["verified_purchase"] == "Y"])-len(new_result[new_result["verified_purchase"] == "N"]) 
print("Difference proportion : ",stat_rating)

Number of positive rating purchase status :  4
Number of negative rating purchase status :  2
Difference proportion :  2


In [111]:
total_pos_rating = 0
total_neg_rating = 0
total_pos_pur = 0
total_neg_pur = 0

header = [
    "user","total_pos_rating","total_neg_rating","total_pos_pur","total_neg_pur"
]

print(f"processing {num_users} records... ", end="")
file_name = "NCF_CHECKPOINT.csv"
cp_exists = os.path.exists(file_name);
cont_row = 0
if cp_exists:
    with open(file_name, "r") as f:
        cont_row = len([line for line in f]) - 1 # one for header, one for index
        print(f"continuing from user {cont_row + 1}")
else:
    print("")

def save(i, tpr, tnr, tpp, tnp):
    global cp_exists, header

    # Define the data rows
    data = [i, tpr, tnr, tpp, tnp]
    
    # Check if the file exists
    if cp_exists:
        # If it exists, open it in append mode
        with open(file_name, "a") as f:
            writer = csv.writer(f)
            writer.writerow(data)
    else:
        # If it does not exist, open it in write mode
        with open(file_name, "w") as f:
            writer = csv.writer(f)
            writer.writerow(header)
            writer.writerow(data)
        # Update the checkpoint to exists
        cp_exists = True
        

def checkpoint(i):
    global total_pos_rating,total_neg_rating,total_pos_pur,total_neg_pur

    final_result,predicted_final_result,results_vp_df,results_rating_df = get_final_result(model_rating,
                                                                                           model_vp,
                                                                                           i)
    new_result = get_original_df(final_result)
    total_pos_rating += len(new_result[new_result["star_rating"] >= 3.0])
    total_neg_rating += len(new_result[new_result["star_rating"] < 3.0])
    total_pos_pur += len(new_result[new_result["verified_purchase"] == "Y"])
    total_neg_pur += len(new_result[new_result["verified_purchase"] == "N"])

    save(
        i, 
        total_pos_rating, total_neg_rating, total_pos_pur, total_neg_pur
    )

    print(f"{i+1}.", end="")

for i in range(cont_row, num_users):
    checkpoint(i)

print("done.")
    

processing 3 records... continuing from user 2
2.3.done.


In [97]:
print(total_pos_rating)
print(total_neg_rating)
print(total_pos_pur)
print(total_neg_pur)


1967
486
1613
840


In [98]:
print(f"proportion positive rating recommend for first {num_users} user:{total_pos_rating/(total_neg_rating+total_pos_rating) *100} %")
print(f"proportion positive purchase status recommend first {num_users} user:{total_pos_pur/(total_neg_pur+total_pos_pur) *100} %")

proportion positive rating recommend for first 7968 user:80.1875254790053 %
proportion positive purchase status recommend first 7968 user:65.7562168772931 %
