In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout, Multiply
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.regularizers import l2

In [2]:
# Load and preprocess data
train_data = pd.read_csv('neumf_train_data.csv')
test_data = pd.read_csv('neumf_test_data.csv')

concat_df = pd.concat([train_data, test_data], axis=0)
num_users, num_items = len(concat_df.customer_id.unique()), len(concat_df.product_id.unique())

In [3]:
interactions = pd.pivot_table(concat_df, values='star_rating', index='customer_id', columns='product_id')
interactions = interactions.fillna(0)
print(interactions)
# interactions = np.array(interactions)

product_id   0     1     2     3     4     5     6     7     8     9     ...  \
customer_id                                                              ...   
0             0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
1             0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
2             0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
3             0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
4             0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
...           ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
7963          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
7964          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
7965          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
7966          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
7967          0.0   0.0   0.0   0.0   0.

In [None]:
# for user_id, user_ratings in interactions.iterrows():
#     if (user_ratings > 0).sum() > 1:
#         print("User ID:", user_id)
#         print(user_ratings[user_ratings > 0])

In [5]:
print(num_users, num_items)

7968 5946


In [4]:
# full NCF model
def get_model(num_users, num_items, latent_dim=8, dense_layers=[64, 32, 16, 8],
              reg_layers=[0, 0, 0, 0], reg_mf=0, model_type='predict_rating'):

    # input layer
    input_user = Input(shape=(1,), dtype='int32', name='user_input')
    input_item = Input(shape=(1,), dtype='int32', name='item_input')
    
    # embedding layer
    mf_user_embedding = Embedding(input_dim=num_users, output_dim=latent_dim,
                        name='mf_user_embedding',
                        embeddings_initializer='RandomNormal',
                        embeddings_regularizer=l2(reg_mf), input_length=1)
    mf_item_embedding = Embedding(input_dim=num_items, output_dim=latent_dim,
                        name='mf_item_embedding',
                        embeddings_initializer='RandomNormal',
                        embeddings_regularizer=l2(reg_mf), input_length=1)
    mlp_user_embedding = Embedding(input_dim=num_users, output_dim=int(dense_layers[0]/2),
                         name='mlp_user_embedding',
                         embeddings_initializer='RandomNormal',
                         embeddings_regularizer=l2(reg_layers[0]), 
                         input_length=1)
    mlp_item_embedding = Embedding(input_dim=num_items, output_dim=int(dense_layers[0]/2),
                         name='mlp_item_embedding',
                         embeddings_initializer='RandomNormal',
                         embeddings_regularizer=l2(reg_layers[0]), 
                         input_length=1)

    # MF latent vector
    mf_user_latent = Flatten()(mf_user_embedding(input_user))
    mf_item_latent = Flatten()(mf_item_embedding(input_item))
    mf_cat_latent = Multiply()([mf_user_latent, mf_item_latent])


    # MLP latent vector
    mlp_user_latent = Flatten()(mlp_user_embedding(input_user))
    mlp_item_latent = Flatten()(mlp_item_embedding(input_item))
    mlp_cat_latent = Concatenate()([mlp_user_latent, mlp_item_latent])
    
    mlp_vector = mlp_cat_latent
    
    # build dense layer for model
    for i in range(1,len(dense_layers)):
        layer = Dense(dense_layers[i],
                      activity_regularizer=l2(reg_layers[i]),
                      activation='relu',
                      name='layer%d' % i)
        mlp_vector = layer(mlp_vector)

    predict_layer = Concatenate()([mf_cat_latent, mlp_vector])

    activation = 'sigmoid'
    if model_type == 'predict_rating': activation = 'linear'
    result = Dense(1, activation=activation,  # linear for rating, sigmoid for verified_purchase
                   kernel_initializer='lecun_uniform',name='result')

    model = Model(inputs=[input_user,input_item], outputs=result(predict_layer))
    return model

In [5]:
epochs = 10
verbose = 1
batch_size = 64
latent_dim = 8
dense_layers = [64, 32, 16, 8]
reg_layers = [0, 0, 0, 0]
reg_mf = 0
learning_rate = 0.001
learner = 'adam'

In [6]:
# get model
model_rating = get_model(num_users, num_items, latent_dim, dense_layers, reg_layers, reg_mf, model_type='predict_rating')
model_rating.load_weights('model_rating_weights.h5')
model_rating.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error', metrics=['accuracy'])
print(model_rating.summary())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 item_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 mlp_user_embedding (Embedding)  (None, 1, 32)       254976      ['user_input[0][0]']             
                                                                                                  
 mlp_item_embedding (Embedding)  (None, 1, 32)       190272      ['item_input[0][0]']             
                                                                                              

In [7]:
model_verified_purchase = get_model(num_users, num_items, latent_dim, dense_layers, reg_layers, reg_mf, model_type='predict_vf')
model_verified_purchase.load_weights('model_vp_weights.h5')
model_verified_purchase.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])

In [37]:
# Train rating model
model_rating.fit(x=[np.array(train_data.customer_id), np.array(train_data.product_id)],
                y=np.array(train_data.star_rating),
                validation_data=([np.array(test_data.customer_id), np.array(test_data.product_id)],
                          np.array(test_data.star_rating)),
                batch_size=batch_size,
                epochs=epochs,
                verbose=1,
                shuffle=True)
model_rating.save_weights('model_rating_weights.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [38]:
# Train verified purchase model
model_verified_purchase.fit(x=[np.array(train_data.customer_id), np.array(train_data.product_id)],
                y=np.array(train_data.verified_purchase), 
                validation_data=([np.array(test_data.customer_id), np.array(test_data.product_id)],
                          np.array(test_data.verified_purchase)), 
                batch_size=batch_size,
                epochs=epochs,
                verbose=1,
                shuffle=True)
model_verified_purchase.save_weights('model_vp_weights.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [48]:
model = model_rating

In [50]:
predictions = model.predict([np.array(train_data.customer_id), np.array(train_data.product_id)])
for i in range(30,70):
  print(f"riu: {train_data.star_rating.iloc[i]}, predicted: {(predictions[i][0])}")

riu: 0.0, predicted: -0.0038194656372070312
riu: 0.0, predicted: 0.03334152698516846
riu: 0.0, predicted: 0.056348204612731934
riu: 0.0, predicted: -0.004281967878341675
riu: 0.0, predicted: 0.0808018147945404
riu: 0.0, predicted: 0.01619453728199005
riu: 5.0, predicted: 4.8797688484191895
riu: 4.0, predicted: 3.67042875289917
riu: 0.0, predicted: 0.3649331331253052
riu: 0.0, predicted: 0.07992932200431824
riu: 1.0, predicted: 0.9341310858726501
riu: 0.0, predicted: -0.0077904462814331055
riu: 0.0, predicted: -0.009457945823669434
riu: 0.0, predicted: 0.10676541924476624
riu: 0.0, predicted: -0.019530057907104492
riu: 0.0, predicted: 0.0434286892414093
riu: 0.0, predicted: 0.14926490187644958
riu: 0.0, predicted: 0.01722368597984314
riu: 4.0, predicted: 4.902644157409668
riu: 0.0, predicted: 0.09040513634681702
riu: 0.0, predicted: 0.00869092345237732
riu: 0.0, predicted: 0.0977519303560257
riu: 0.0, predicted: 0.16449418663978577
riu: 5.0, predicted: 4.703673362731934
riu: 0.0, predic

Recommendation

In [21]:
def get_recommendations(model, desired_user_id):
  mlp_user_embedding_weights = (next(iter(filter(lambda x: x.name == 'mlp_user_embedding', model.layers))).get_weights())

  # get the latent embedding for the desired user
  user_latent_matrix = mlp_user_embedding_weights[0]

  one_user_vector = user_latent_matrix[desired_user_id,:]
  one_user_vector = np.reshape(one_user_vector, (1,32))

  #cluster users into 100 clusters
  kmeans = KMeans(n_clusters=100, random_state=0, n_init='auto', verbose=0).fit(user_latent_matrix)

  #predict the cluster that the desired user belongs to
  desired_user_label = kmeans.predict(one_user_vector)
  user_label = kmeans.labels_ #no of user_ids
  
  #get user_ids in same cluster
  neighbors = []
  for user_id, user_label in enumerate(user_label):
      if user_label == desired_user_label:
          neighbors.append(user_id)
  
  #get the corresponding product ids of those users
  product_ids = []
  for user_id in neighbors:
      product_ids += list(train_data[train_data['customer_id'] == int(user_id)]['product_id'])

  users = np.full(len(product_ids), desired_user_id, dtype='int32')
  items = np.array(product_ids, dtype='int32')

  #make predictions on the users and products
  predictions = model.predict([users,items],batch_size=100, verbose=0) 
  predictions = predictions.tolist()
  predictions = [i[0] for i in predictions]
  predictions = round_predictions(np.array(predictions))
  return product_ids, predictions

In [20]:
def round_predictions(arr):
    result = arr.copy()
    result[result < 0] = 0
    result[result < 0.5] = np.floor(result[result < 0.5])
    result[result >= 0.5] = np.ceil(result[result >= 0.5])
    result[result > 5] = 5
    return result

In [58]:
#user id to recommend the products to
desired_user_id = train_data.customer_id[2]
desired_user_id

2160

In [59]:
product_ids, predictions = get_recommendations(model_rating, desired_user_id)
results_df = pd.DataFrame({'product_id': product_ids, 'star_rating': predictions})
results_df = results_df.sort_values(by='star_rating', ascending=False)
#display recommended products in order using rating
results_df

Unnamed: 0,product_id,star_rating
141,664,5.0
45,2657,5.0
57,742,5.0
33,724,5.0
116,5274,5.0
...,...,...
22,3337,0.0
21,3442,0.0
148,4473,0.0
150,3493,0.0


In [60]:
product_ids, predictions = get_recommendations(model_verified_purchase, desired_user_id)
results_df = pd.DataFrame({'product_id': product_ids, 'verified_purchase': predictions})
results_df = results_df.sort_values(by='verified_purchase', ascending=False)
#display recommended products in order using verified_purchase
results_df

Unnamed: 0,product_id,verified_purchase
287,338,1.0
85,1084,1.0
302,4371,1.0
79,3062,1.0
154,3801,1.0
...,...,...
134,5846,0.0
133,3966,0.0
132,5471,0.0
131,4430,0.0
