In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout, Multiply
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.regularizers import l2

In [3]:
# Load and preprocess data
train_data = pd.read_csv('neumf_train_data.csv')
test_data = pd.read_csv('neumf_test_data.csv')

# num_users = len(train_data.customer_id.unique()) + len(test_data.customer_id.unique())
# num_items = len(train_data.product_id.unique()) + len(test_data.product_id.unique())
concat_df = pd.concat([train_data, test_data], axis=0)
num_users, num_items = len(concat_df.customer_id.unique()), len(concat_df.product_id.unique())

In [4]:
interactions = pd.pivot_table(concat_df, values='star_rating', index='customer_id', columns='product_id')

interactions = interactions.fillna(0)

print(interactions)
# interactions = np.array(interactions)

product_id   0     1     2     3     4     5     6     7     8     9     ...  \
customer_id                                                              ...   
0             0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
1             0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
2             0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
3             0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
4             0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
...           ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
7963          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
7964          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
7965          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
7966          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
7967          0.0   0.0   0.0   0.0   0.

In [None]:
# for user_id, user_ratings in interactions.iterrows():
#     if (user_ratings > 0).sum() > 1:
#         print("User ID:", user_id)
#         print(user_ratings[user_ratings > 0])

In [None]:
print(num_users, num_items)

7968 5946


In [22]:
# full NCF model
def get_model(num_users, num_items, latent_dim=8, dense_layers=[64, 32, 16, 8],
              reg_layers=[0, 0, 0, 0], reg_mf=0):

    # input layer
    input_user = Input(shape=(1,), dtype='int32', name='user_input')
    input_item = Input(shape=(1,), dtype='int32', name='item_input')
    
    # embedding layer
    mf_user_embedding = Embedding(input_dim=num_users, output_dim=latent_dim,
                        name='mf_user_embedding',
                        embeddings_initializer='RandomNormal',
                        embeddings_regularizer=l2(reg_mf), input_length=1)
    mf_item_embedding = Embedding(input_dim=num_items, output_dim=latent_dim,
                        name='mf_item_embedding',
                        embeddings_initializer='RandomNormal',
                        embeddings_regularizer=l2(reg_mf), input_length=1)
    mlp_user_embedding = Embedding(input_dim=num_users, output_dim=int(dense_layers[0]/2),
                         name='mlp_user_embedding',
                         embeddings_initializer='RandomNormal',
                         embeddings_regularizer=l2(reg_layers[0]), 
                         input_length=1)
    mlp_item_embedding = Embedding(input_dim=num_items, output_dim=int(dense_layers[0]/2),
                         name='mlp_item_embedding',
                         embeddings_initializer='RandomNormal',
                         embeddings_regularizer=l2(reg_layers[0]), 
                         input_length=1)

    # MF latent vector
    mf_user_latent = Flatten()(mf_user_embedding(input_user))
    mf_item_latent = Flatten()(mf_item_embedding(input_item))
    mf_cat_latent = Multiply()([mf_user_latent, mf_item_latent])


    # MLP latent vector
    mlp_user_latent = Flatten()(mlp_user_embedding(input_user))
    mlp_item_latent = Flatten()(mlp_item_embedding(input_item))
    mlp_cat_latent = Concatenate()([mlp_user_latent, mlp_item_latent])
    
    mlp_vector = mlp_cat_latent
    
    # build dense layer for model
    for i in range(1,len(dense_layers)):
        layer = Dense(dense_layers[i],
                      activity_regularizer=l2(reg_layers[i]),
                      activation='relu',
                      name='layer%d' % i)
        mlp_vector = layer(mlp_vector)

    predict_layer = Concatenate()([mf_cat_latent, mlp_vector])

    result = Dense(1, activation='linear',  # linear for rating, sigmoid for verified_purchase
                   kernel_initializer='lecun_uniform',name='result')

    model = Model(inputs=[input_user,input_item], outputs=result(predict_layer))
    return model

In [23]:
epochs = 5
verbose = 1
batch_size = 64
latent_dim = 8
dense_layers = [64, 32, 16, 8]
reg_layers = [0, 0, 0, 0]
reg_mf = 0
learning_rate = 0.001
learner = 'adam'

In [24]:
# get model
model_rating = get_model(num_users, num_items, latent_dim, dense_layers, reg_layers, reg_mf)
model_rating.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error', metrics=['accuracy'])
print(model_rating.summary())

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 item_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 mlp_user_embedding (Embedding)  (None, 1, 32)       254976      ['user_input[0][0]']             
                                                                                                  
 mlp_item_embedding (Embedding)  (None, 1, 32)       190272      ['item_input[0][0]']             
                                                                                            

In [7]:
model_verified_purchase = get_model(num_users, num_items, latent_dim, dense_layers, reg_layers, reg_mf)
model_verified_purchase.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])

In [25]:
# Train NeuMF model
model_rating.fit(x=[np.array(train_data.customer_id), np.array(train_data.product_id)],
                y=np.array(train_data.star_rating), # replace train_data.verified_purchase
                validation_data=([np.array(test_data.customer_id), np.array(test_data.product_id)],
                          np.array(test_data.star_rating)), # replace train_data.verified_purchase
                batch_size=batch_size,
                epochs=5,
                verbose=1,
                shuffle=True)
# neumf_model.save_weights('neumf_model_weights.h5')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f2a83e5ab50>

In [28]:
model = model_rating

In [43]:
predictions = model.predict([np.array(train_data.customer_id), np.array(train_data.product_id)])
for i in range(0,30):
  print(f"riu: {train_data.star_rating.iloc[i]}, predicted: {(predictions[i][0])}")

riu: 1.0, predicted: 0.5053096413612366
riu: 0.0, predicted: 0.27102208137512207
riu: 0.0, predicted: 0.10045319050550461
riu: 0.0, predicted: -0.045852310955524445
riu: 0.0, predicted: 0.12545758485794067
riu: 0.0, predicted: 0.18399500846862793
riu: 0.0, predicted: 0.13161689043045044
riu: 5.0, predicted: 5.182096004486084
riu: 4.0, predicted: 4.151077747344971
riu: 0.0, predicted: 0.13917802274227142
riu: 0.0, predicted: 0.23846611380577087
riu: 0.0, predicted: 0.10095985978841782
riu: 0.0, predicted: -0.2475355565547943
riu: 0.0, predicted: 0.10459420830011368
riu: 0.0, predicted: -0.15142807364463806
riu: 0.0, predicted: 0.05604039132595062
riu: 0.0, predicted: 0.05061178654432297
riu: 5.0, predicted: 4.196002006530762
riu: 3.0, predicted: 0.5994928479194641
riu: 5.0, predicted: 5.727824687957764
riu: 0.0, predicted: -0.19711759686470032
riu: 5.0, predicted: 2.114281177520752
riu: 0.0, predicted: -0.0212671160697937
riu: 5.0, predicted: 5.0653839111328125
riu: 0.0, predicted: 0.00

In [11]:
def recommend_items(user_id, items=10):
    item_ids = np.arange(items)
    user_ids = np.repeat(user_id, items)
    rating_predictions = model.predict([user_ids, item_ids])
    item_ratings = list(zip(item_ids, rating_predictions.flatten()))
    # item_ratings = [(item_id, rating, verified_purchase) for item_id, rating, verified_purchase in item_ratings
    #                 if verified_purchase >= 0.5 and rating > 3.0]
    item_ratings = [(item_id, rating) for item_id, rating in item_ratings]
    item_ratings.sort(key=lambda x: x[1], reverse=True)
    recommended_items = [(item_id, rating) for item_id, rating in item_ratings[:num_items]]
    return recommended_items

In [None]:
res = recommend_items(user_id=train_data.customer_id[0])
print(res)

[(3, 1.2953409), (4, 1.0114655), (6, 0.9815404), (8, 0.9178495), (0, 0.4740227), (1, 0.4111194), (7, 0.40578693), (5, 0.23016101), (2, 0.15012874), (9, -0.0016505271)]


Recommendation

In [30]:
#user id to recommend the products to
desired_user_id = train_data.customer_id[0]

In [31]:
from sklearn.cluster import KMeans
mlp_user_embedding_weights = (next(iter(filter(lambda x: x.name == 'mlp_user_embedding', model.layers))).get_weights())

# get the latent embedding for your desired user
user_latent_matrix = mlp_user_embedding_weights[0]

one_user_vector = user_latent_matrix[desired_user_id,:]

one_user_vector = np.reshape(one_user_vector, (1,32))
print(user_latent_matrix)

[[-0.04496388 -0.04672782 -0.11895617 ...  0.039933   -0.00231761
  -0.0595164 ]
 [ 0.09821941  0.06897451 -0.02122093 ... -0.0675341  -0.0385053
  -0.13302925]
 [-0.03060061 -0.01012711  0.09263764 ...  0.02950649  0.06810649
  -0.03205543]
 ...
 [ 0.04160957 -0.03649679  0.03119293 ... -0.04535455 -0.02716426
   0.02346848]
 [-0.00606452  0.00661458 -0.10501533 ...  0.12352349  0.06097689
   0.05976881]
 [ 0.05895165 -0.03409892 -0.06831917 ...  0.01020957 -0.06652234
   0.04239896]]


In [32]:
#kmeans 
kmeans = KMeans(n_clusters=100, random_state=0, n_init='auto', verbose=0).fit(user_latent_matrix)

In [33]:
desired_user_label = kmeans.predict(one_user_vector)
print(desired_user_label)

[18]


In [34]:
user_label = kmeans.labels_ #no of user_ids
print(user_label)
neighbors = []

#get user_ids in same cluster
for user_id, user_label in enumerate(user_label):
    if user_label == desired_user_label:
        neighbors.append(user_id)
print('Found {0} neighbor users.'.format(len(neighbors))) 

[37  0 21 ... 39 13 55]
Found 83 neighbor users.


In [35]:
# product_ids = train_data.loc[train_data['customer_id'].isin(neighbors), 'product_id'].values
product_ids = []

for user_id in neighbors:
    product_ids += list(train_data[train_data['customer_id'] == int(user_id)]['product_id'])
print('Found {0} neighbor items from these users.'.format(len(product_ids))) 
print(product_ids)

Found 416 neighbor items from these users.
[3278, 993, 5930, 1422, 3430, 1159, 4843, 5410, 5515, 5770, 1598, 684, 3170, 5851, 588, 254, 4455, 2292, 3839, 725, 278, 4170, 4500, 1743, 60, 1968, 5679, 5160, 4361, 2513, 2937, 2580, 2000, 1047, 4865, 5485, 2038, 90, 4510, 1871, 440, 15, 4014, 5448, 4165, 2067, 1097, 1056, 2083, 5411, 4577, 2836, 3336, 5158, 3145, 4499, 1243, 1955, 709, 1089, 3222, 4060, 54, 5, 4453, 2374, 4247, 4047, 4225, 5078, 5553, 5818, 2945, 1594, 659, 4888, 1812, 3980, 1521, 5513, 2521, 4639, 1521, 1320, 1407, 4091, 2185, 2584, 150, 354, 685, 5516, 2880, 4414, 2168, 3917, 4373, 2930, 2434, 2249, 4414, 5227, 3719, 1079, 2976, 1232, 3761, 3903, 3635, 4995, 4844, 450, 373, 54, 3775, 1803, 1469, 2260, 1909, 2956, 4429, 2208, 5437, 3794, 1132, 2863, 2623, 1981, 4619, 2071, 2819, 1909, 4898, 5292, 5931, 3154, 4378, 920, 3675, 1108, 4459, 4699, 4317, 133, 3171, 4601, 3014, 1587, 1224, 2871, 2561, 1183, 4240, 5306, 5748, 297, 723, 3508, 1478, 4898, 1700, 3720, 1902, 2870, 152

In [36]:
users = np.full(len(product_ids), desired_user_id, dtype='int32')
items = np.array(product_ids, dtype='int32')

# print('\nRanking most likely tracks using the NeuMF model...')
# # and predict tracks for my user
predictions = model.predict([users,items],batch_size=100, verbose=0) 
predictions = predictions.tolist()
predictions = [i[0] for i in predictions]
print(predictions)

[0.11051443964242935, 0.27727577090263367, 0.4470125436782837, -0.0019590407609939575, 0.28483012318611145, 0.2221749722957611, 0.21751701831817627, 0.4621361792087555, -0.04736141115427017, -0.1871953308582306, 1.7985248565673828, -0.15864437818527222, 0.10821535438299179, 0.5753039717674255, 1.093839168548584, -0.07587369531393051, 0.28131598234176636, 0.2470594346523285, 0.08769510686397552, -0.25480085611343384, 0.14234985411167145, 0.4366738498210907, 0.4154540002346039, -0.04194723814725876, 0.5082292556762695, 0.25975584983825684, 0.37876734137535095, 0.19457592070102692, 0.601924479007721, 0.5285078883171082, 0.1464928388595581, -0.3355347812175751, 0.30830153822898865, -0.06093849986791611, 0.22512167692184448, 0.4985555410385132, 0.23965159058570862, 0.2499372959136963, 0.14586573839187622, -0.0022869110107421875, 0.25305020809173584, 0.15454377233982086, -0.07892545312643051, 0.13413426280021667, 0.5203214883804321, 1.1663469076156616, 0.4737184941768646, -0.0747252628207206

In [39]:
results_df = pd.DataFrame({'product_id': product_ids, 'star_rating': predictions})
results_df = results_df.sort_values(by='star_rating', ascending=False)
#display recommended products in rating order
results_df

Unnamed: 0,product_id,star_rating
103,1079,4.000418
93,4414,3.695869
100,4414,3.695869
266,2461,3.444919
54,3145,2.798421
...,...,...
19,725,-0.254801
155,297,-0.277917
300,2058,-0.299157
31,2580,-0.335535
