In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout


In [3]:
physical_devices = tf.config.list_physical_devices('GPU')
# tf.config.experimental.set_memory_growth(physical_devices[0], True)
physical_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
# gpus = tf.config.experimental.list_physical_devices('GPU')
# print(gpus)
# tf.config.set_visible_devices([], 'CPU') # hide the CPU
# tf.config.set_visible_devices(gpus[0], 'GPU') # unhide potentially hidden GPU
# tf.config.get_visible_devices()

In [5]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout

In [6]:
ratings_df = pd.read_csv('../data/jester_ratings.csv')
jokes_df = pd.read_csv('../data/jester_items.csv')
display(ratings_df[:5]), display(jokes_df[:5])

Unnamed: 0,userId,jokeId,rating
0,1,5,0.219
1,1,7,-9.281
2,1,8,-9.281
3,1,13,-6.781
4,1,15,0.875


Unnamed: 0,jokeId,jokeText
0,1,"A man visits the doctor. The doctor says ""I ha..."
1,2,This couple had an excellent relationship goin...
2,3,Q. What's 200 feet long and has 4 teeth? \r\n\...
3,4,Q. What's the difference between a man and a t...
4,5,Q.\tWhat's O. J. Simpson's Internet address? \...


(None, None)

In [7]:
user_encoder = LabelEncoder()
joke_encoder = LabelEncoder()

In [8]:
ratings_df['userId'] = user_encoder.fit_transform(ratings_df['userId'])
ratings_df['jokeId'] = joke_encoder.fit_transform(ratings_df['jokeId'])
display(ratings_df[:5])

Unnamed: 0,userId,jokeId,rating
0,0,0,0.219
1,0,1,-9.281
2,0,2,-9.281
3,0,3,-6.781
4,0,4,0.875


In [9]:
num_users = ratings_df['userId'].nunique()
num_jokes = ratings_df['jokeId'].nunique()
num_users, num_jokes

(59132, 140)

In [10]:
# Train-test split
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

In [17]:
from tensorflow.keras.regularizers import l2

# Model Definition
def create_ncf_model(num_users, num_jokes, embedding_dim=200):
    user_input = Input(shape=(1,), name='user_input')
    joke_input = Input(shape=(1,), name='joke_input')

    user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_embedding')(user_input)
    joke_embedding = Embedding(input_dim=num_jokes, output_dim=embedding_dim, name='joke_embedding')(joke_input)

    user_vector = Flatten()(user_embedding)
    joke_vector = Flatten()(joke_embedding)

    concatenated = Concatenate()([user_vector, joke_vector])

    x = Dense(128, activation='relu',kernel_regularizer=l2(0.01))(concatenated)
    x = Dropout(0.5)(x)
    x = Dense(64, activation='relu',kernel_regularizer=l2(0.01))(x)
    x = Dropout(0.5)(x)
    output = Dense(1)(x)

    model = Model(inputs=[user_input, joke_input], outputs=output)
    model.compile(optimizer='adam', loss='mse')

    return model

In [18]:
ncf_model = create_ncf_model(num_users, num_jokes)
ncf_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 joke_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 user_embedding (Embedding)     (None, 1, 200)       11826400    ['user_input[0][0]']             
                                                                                                  
 joke_embedding (Embedding)     (None, 1, 200)       28000       ['joke_input[0][0]']             
                                                                                            

In [19]:
tf.config.get_visible_devices()


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [20]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1
)

early_stopping

<keras.callbacks.EarlyStopping at 0x1b4027bb970>

In [21]:
with tf.device('/GPU:0'):
    history = ncf_model.fit(
    [train_df['userId'], train_df['jokeId']],
    train_df['rating'],
    validation_data=([test_df['userId'], test_df['jokeId']], test_df['rating']),
    epochs=10,
    batch_size=1024,
    verbose=1
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
# Inference
# def recommend_jokes_for_user(user_id, joke_ids, model, top_n=5):
#     user_encoded = user_encoder.transform([user_id])
#     joke_encoded = joke_encoder.transform(joke_ids)
    
#     predictions = model.predict([np.array([user_encoded] * len(joke_encoded)), np.array(joke_encoded)])
#     recommended_jokes = sorted(zip(joke_ids, predictions), key=lambda x: x[1], reverse=True)
    
#     return recommended_jokes[:top_n]


def recommend_jokes_for_user(user_id, joke_ids, model, top_n=5):
    # Check if user_id exists in the LabelEncoder
    if user_id not in user_encoder.classes_:
        print(f"User ID {user_id} not seen in training. Cannot make recommendations.")
        return []

    # Encode user_id
    user_encoded = user_encoder.transform([user_id])[0]
    
    # Encode joke_ids and handle unseen jokes
    encoded_jokes = []
    for joke_id in joke_ids:
        if joke_id in joke_encoder.classes_:
            encoded_jokes.append(joke_encoder.transform([joke_id])[0])
        else:
            print(f"Joke ID {joke_id} not seen in training. Skipping.")
            continue
    
    # If all jokes are unseen, return an empty list
    if not encoded_jokes:
        return []

    # Predict ratings for the encoded jokes
    predictions = model.predict([np.array([user_encoded] * len(encoded_jokes)), np.array(encoded_jokes)])
    
    # Pair the jokes with their predictions
    recommended_jokes = sorted(zip(joke_ids, predictions), key=lambda x: x[1], reverse=True)
    
    return recommended_jokes[:top_n]

In [43]:
type(jokes_df.loc[jokes_df['jokeId']==3, 'jokeText']), jokes_df.loc[jokes_df['jokeId']==3, 'jokeText']

(pandas.core.series.Series,
 2    Q. What's 200 feet long and has 4 teeth? \r\n\...
 Name: jokeText, dtype: object)

In [34]:
user_id = 1
available_jokes = jokes_df['jokeId'].values
recommendations = recommend_jokes_for_user(user_id, available_jokes, ncf_model)

# getting the corresponding jokes
for rec in recommendations:
    jokeId = rec[0]
    print(jokes_df.loc[jokes_df['jokeId']==rec, 'jokeText'].iloc[0])


Joke ID 1 not seen in training. Skipping.
Joke ID 2 not seen in training. Skipping.
Joke ID 3 not seen in training. Skipping.
Joke ID 4 not seen in training. Skipping.
Joke ID 6 not seen in training. Skipping.
Joke ID 9 not seen in training. Skipping.
Joke ID 10 not seen in training. Skipping.
Joke ID 11 not seen in training. Skipping.
Joke ID 12 not seen in training. Skipping.
Joke ID 14 not seen in training. Skipping.


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.

In [28]:
# Performance Metrics
from sklearn.metrics import mean_squared_error

test_predictions = ncf_model.predict([test_df['userId'], test_df['jokeId']])
rmse = np.sqrt(mean_squared_error(test_df['rating'], test_predictions))
print(f'RMSE: {rmse:.4f}')

RMSE: 4.1883
