In [631]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout, Multiply
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split

In [632]:
# Load and preprocess data
train_data = pd.read_csv('neumf_train_data.csv')
test_data = pd.read_csv('neumf_test_data.csv')

#remove data with 0s rating
train_data = train_data[train_data['star_rating'] != 0]
test_data = test_data[test_data['star_rating'] != 0]

concat_df = pd.concat([train_data, test_data], axis=0)
num_users, num_items = len(concat_df.customer_id.unique()), len(concat_df.product_id.unique())

In [633]:
train_data

Unnamed: 0,customer_id,product_id,star_rating,verified_purchase
0,6552,4052,1.0,1
7,7295,1416,5.0,0
8,6197,640,4.0,1
17,581,2623,5.0,1
18,7167,2323,3.0,1
...,...,...,...,...
32622,1751,3701,4.0,1
32631,7418,4317,1.0,1
32632,2888,4193,5.0,1
32634,3685,3768,5.0,0


In [634]:
num_users, num_items

(7968, 5946)

In [468]:
# interactions = pd.pivot_table(concat_df, values='interaction', index='customer_id', columns='product_id')
# interactions = interactions.fillna(0)
# print(interactions)
# interactions = np.array(interactions)

In [635]:
# full NCF model
def get_model(num_users, num_items, latent_dim=8, dense_layers=[64, 32, 16, 8],
              reg_layers=[0, 0, 0, 0], reg_mf=0, model_type='predict_rating'):

    # input layer
    input_user = Input(shape=(1,), dtype='int32', name='user_input')
    input_item = Input(shape=(1,), dtype='int32', name='item_input')
    
    # embedding layer
    mf_user_embedding = Embedding(input_dim=num_users, output_dim=latent_dim,
                        name='mf_user_embedding',
                        embeddings_initializer='RandomNormal',
                        embeddings_regularizer=l2(reg_mf), input_length=1)
    mf_item_embedding = Embedding(input_dim=num_items, output_dim=latent_dim,
                        name='mf_item_embedding',
                        embeddings_initializer='RandomNormal',
                        embeddings_regularizer=l2(reg_mf), input_length=1)
    mlp_user_embedding = Embedding(input_dim=num_users, output_dim=int(dense_layers[0]/2),
                         name='mlp_user_embedding',
                         embeddings_initializer='RandomNormal',
                         embeddings_regularizer=l2(reg_layers[0]), 
                         input_length=1)
    mlp_item_embedding = Embedding(input_dim=num_items, output_dim=int(dense_layers[0]/2),
                         name='mlp_item_embedding',
                         embeddings_initializer='RandomNormal',
                         embeddings_regularizer=l2(reg_layers[0]), 
                         input_length=1)

    # MF latent vector
    mf_user_latent = Flatten()(mf_user_embedding(input_user))
    mf_item_latent = Flatten()(mf_item_embedding(input_item))
    mf_cat_latent = Multiply()([mf_user_latent, mf_item_latent])


    # MLP latent vector
    mlp_user_latent = Flatten()(mlp_user_embedding(input_user))
    mlp_item_latent = Flatten()(mlp_item_embedding(input_item))
    mlp_cat_latent = Concatenate()([mlp_user_latent, mlp_item_latent])
    
    mlp_vector = mlp_cat_latent
    
    # build dense layer for model
    for i in range(1,len(dense_layers)):
        layer = Dense(dense_layers[i],
                      activity_regularizer=l2(reg_layers[i]),
                      activation='relu',
                      name='layer%d' % i)
        mlp_vector = layer(mlp_vector)

    predict_layer = Concatenate()([mf_cat_latent, mlp_vector])

    activation = 'sigmoid'
    output_units = 1

    if model_type == 'predict_rating': 
      activation = 'softmax'
      output_units = 5

    result = Dense(units=output_units, activation=activation,  # softmax for rating, sigmoid for verified_purchase
                   kernel_initializer='lecun_uniform',name='result')

    model = Model(inputs=[input_user,input_item], outputs=result(predict_layer))
    return model

In [636]:
epochs = 5
batch_size = 64
latent_dim = 8
dense_layers = [64, 32, 16, 8]
reg_layers = [0.1, 0.1, 0, 0]
reg_mf = 0
learning_rate = 0.01
learner = 'adam'

In [637]:
# create rating model
model_rating = get_model(num_users, num_items, latent_dim, dense_layers, reg_layers, reg_mf, model_type='predict_rating')
# model_rating.load_weights('model_rating_weights.h5')
model_rating.compile(optimizer=Adam(learning_rate=learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])
print(model_rating.summary())

Model: "model_69"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 item_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 mlp_user_embedding (Embedding)  (None, 1, 32)       254976      ['user_input[0][0]']             
                                                                                                  
 mlp_item_embedding (Embedding)  (None, 1, 32)       190272      ['item_input[0][0]']             
                                                                                           

In [638]:
#convert the labels to onehot
train_labels = to_categorical(np.array(train_data.star_rating)-1, num_classes=5)
test_labels = to_categorical(np.array(test_data.star_rating)-1, num_classes=5)

In [639]:
# Train rating model
model_rating.fit(x=[np.array(train_data.customer_id), np.array(train_data.product_id)],
                y=train_labels,
                validation_data=([np.array(test_data.customer_id), np.array(test_data.product_id)],
                          test_labels),
                batch_size=batch_size,
                epochs=5,
                verbose=1)
model_rating.save_weights('model_rating_weights.h5')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [640]:
# create verified_purchase model
reg_layers = [0,0,0,0]
model_vp = get_model(num_users, num_items, latent_dim, dense_layers, reg_layers, reg_mf, model_type='predict_vp')
# model_vp.load_weights('model_vp_weights.h5')
model_vp.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])

In [641]:
# Train verified_purchase model
model_vp.fit(x=[np.array(train_data.customer_id), np.array(train_data.product_id)],
                y=np.array(train_data.verified_purchase), 
                validation_data=([np.array(test_data.customer_id), np.array(test_data.product_id)],
                          np.array(test_data.verified_purchase)), 
                batch_size=batch_size,
                epochs=epochs,
                verbose=1,
                shuffle=True)
model_vp.save_weights('model_vp_weights.h5')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
predictions = model_rating.predict([np.array(test_data.customer_id), np.array(test_data.product_id)])
predictions = np.argmax(predictions, axis=1) + 1

print(predictions[3])
for i in range(80):
  print(f"riu: {test_data.star_rating.iloc[i]}, predicted: {predictions[i]}")

Recommendation

In [643]:
def get_recommendations(model, desired_user_id):
  mlp_user_embedding_weights = (next(iter(filter(lambda x: x.name == 'mlp_user_embedding', model.layers))).get_weights())

  # get the latent embedding for the desired user
  user_latent_matrix = mlp_user_embedding_weights[0]

  one_user_vector = user_latent_matrix[desired_user_id,:]
  one_user_vector = np.reshape(one_user_vector, (1,32))

  #cluster users into 100 clusters
  kmeans = KMeans(n_clusters=100, random_state=0, n_init='auto', verbose=0).fit(user_latent_matrix)

  #predict the cluster that the desired user belongs to
  desired_user_label = kmeans.predict(one_user_vector)
  user_label = kmeans.labels_ #no of user_ids
  
  #get user_ids in same cluster
  neighbors = []
  for user_id, user_label in enumerate(user_label):
      if user_label == desired_user_label:
          neighbors.append(user_id)
  
  #get the corresponding product ids of those users
  product_ids = []
  for user_id in neighbors:
      product_ids += list(train_data[train_data['customer_id'] == int(user_id)]['product_id'])

  product_ids = list(set(product_ids))

  users = np.full(len(product_ids), desired_user_id, dtype='int32')
  items = np.array(product_ids, dtype='int32')

  #make predictions on the users and products
  predictions = model.predict([users,items],batch_size=100, verbose=0) 
  return product_ids, predictions

In [644]:
def round_predictions(arr):
    result = arr.copy()
    result[result < 0] = 0
    result[result < 0.5] = np.floor(result[result < 0.5])
    result[result >= 0.5] = np.ceil(result[result >= 0.5])
    result[result > 5] = 5
    return result

In [651]:
#user id to recommend the products to
desired_user_id = int(train_data.iloc[69]['customer_id'])
desired_user_id

6691

In [654]:
product_ids, predictions = get_recommendations(model_rating, desired_user_id)
predictions = np.argmax(predictions, axis=1) + 1
results_df = pd.DataFrame({'product_id': product_ids, 'star_rating': predictions})
results_df = results_df.sort_values(by='star_rating', ascending=False)
#display recommended products in order using star_rating
results_df

Unnamed: 0,product_id,star_rating
1871,3028,5
3397,5410,5
1772,2863,5
1770,2858,5
1768,2850,5
...,...,...
2181,3505,1
2176,3499,1
2173,3496,1
2169,3491,1


In [653]:
product_ids, predictions = get_recommendations(model_vp, desired_user_id)
predictions = [i[0] for i in predictions]
predictions = round_predictions(np.array(predictions))
results_df = pd.DataFrame({'product_id': product_ids, 'verified_purchase': predictions})
results_df = results_df.sort_values(by='verified_purchase', ascending=False)
#display recommended products in order using verified_purchase
results_df

Unnamed: 0,product_id,verified_purchase
0,1538,1.0
102,3505,1.0
31,3701,1.0
99,2466,1.0
33,5757,1.0
...,...,...
44,4786,0.0
41,2720,0.0
40,1181,0.0
39,2715,0.0
