In [1]:
import tensorflow as tf

# Ask TensorFlow which physical GPU devices are visible to this process.
gpus = tf.config.list_physical_devices('GPU')

# Report every detected device; nothing is printed when the list is empty.
if len(gpus) > 0:
    print("GPU is available")
    for gpu in gpus:
        print("Name:", gpu.name, "Type:", gpu.device_type)

GPU is available
Name: /physical_device:GPU:0 Type: GPU


In [2]:
import pandas as pd

# Load the raw ratings table. The cells below read its 'rating', 'user_id'
# and 'movie_id' columns.
df_ratings = pd.read_parquet('ratings_data.parquet')

def mapping(x):
    """Binarize a rating: return 1 when it is 4.0 or higher, otherwise 0."""
    return 1 if x >= 4.0 else 0

# Binarize ratings into implicit feedback and keep only the positive interactions.
df_ratings['implicit_feedback'] = df_ratings['rating'].apply(mapping)

df_ratings = df_ratings[df_ratings['implicit_feedback'] == 1].reset_index(drop=True)

# Keep users with at least MIN_POSITIVES_PER_USER positive interactions.
# Every remaining row is already positive, so no extra label filter is needed here.
MIN_POSITIVES_PER_USER = 20

positive_interaction_per_user = df_ratings.groupby('user_id')['movie_id'].count()

rare_users = positive_interaction_per_user[
    positive_interaction_per_user < MIN_POSITIVES_PER_USER
].index.tolist()

df_ratings = df_ratings[~df_ratings['user_id'].isin(rare_users)]

# Drop movies with fewer than MIN_USERS_PER_MOVIE positive interactions.
# NOTE: this runs after the user filter and removes rows, so a user can end up
# with fewer than MIN_POSITIVES_PER_USER interactions afterwards.
MIN_USERS_PER_MOVIE = 4

movie_interaction_per_user = df_ratings.groupby('movie_id')['user_id'].count()

rare_movies = movie_interaction_per_user[
    movie_interaction_per_user < MIN_USERS_PER_MOVIE
].index.tolist()

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the previous frame (SettingWithCopyWarning).
df_ratings = df_ratings[~df_ratings['movie_id'].isin(rare_movies)].copy()

# Encode user and movie ids to a contiguous 0..N-1 range, as expected by the
# NCF embedding layers.
from sklearn.preprocessing import LabelEncoder

user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

df_ratings['user_id_encoded'] = user_encoder.fit_transform(df_ratings['user_id'])
df_ratings['movie_id_encoded'] = movie_encoder.fit_transform(df_ratings['movie_id'])


# Number of unique users in our data

print('No of unique users:', df_ratings['user_id_encoded'].nunique())

# Number of unique movies in our data

print('No of unique movies:', df_ratings['movie_id_encoded'].nunique())

No of unique users: 51728
No of unique movies: 9029


In [3]:
# Creating val-dataset

import pandas as pd

# Leave-one-out split: draw one interaction per user (fixed seed) as validation.
val_df = (
    df_ratings
    .groupby('user_id_encoded', group_keys=False)
    .sample(n=1, random_state=42)
)

# The sampled rows keep their df_ratings index labels, which identify the rows
# to remove from the training portion.
val_indices = val_df.index

train_df = df_ratings.drop(index=val_indices).reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [4]:
# Add Negative Sampling for users for movies they have not interacted with in train data 

# Per-user set of movies seen in training, and the full training catalogue.
user_rated_movies = train_df.groupby('user_id_encoded')['movie_id_encoded'].apply(set).to_dict()
all_movies = set(train_df['movie_id_encoded'].unique())

from tqdm import tqdm
import numpy as np
import math

NEGATIVE_RATIO = 3            # negatives drawn per positive interaction
NEGATIVE_SAMPLING_SEED = 42   # fixed so the sampled negatives are reproducible

# Dedicated generator: sampling no longer depends on the global NumPy state.
rng = np.random.default_rng(NEGATIVE_SAMPLING_SEED)

# Sorted catalogue so the candidate order (and thus the draw) is deterministic.
all_movie_ids = np.sort(train_df['movie_id_encoded'].unique())

neg_samples = []

for user in tqdm(train_df['user_id_encoded'].unique(), desc="Users Sampling"):
    user_watched_movies = user_rated_movies.get(user, set())
    n_negatives = len(user_watched_movies) * NEGATIVE_RATIO

    watched_ids = np.fromiter(
        user_watched_movies, dtype=all_movie_ids.dtype, count=len(user_watched_movies)
    )
    # Movies the user has not interacted with in the training data.
    candidates = np.setdiff1d(all_movie_ids, watched_ids, assume_unique=True)
    if len(candidates) == 0 or n_negatives == 0:
        continue

    # Cap at the number of available candidates (sampling is without replacement).
    n_samples = min(n_negatives, len(candidates))
    neg_movies = rng.choice(candidates, size=n_samples, replace=False)
    neg_samples.append(
        pd.DataFrame({
            'user_id_encoded': [user] * n_samples,
            'movie_id_encoded': neg_movies,
            'implicit_feedback': [0] * n_samples
        })
    )

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when no user produced negatives.
df_negatives = (
    pd.concat(neg_samples, ignore_index=True)
    if neg_samples
    else pd.DataFrame(columns=['user_id_encoded', 'movie_id_encoded', 'implicit_feedback'])
)

Users Sampling: 100%|██████████| 51728/51728 [00:53<00:00, 959.66it/s] 


In [5]:
# Data with positive + negative samples

# Stack the observed positives on top of the sampled negatives, keeping only
# the three columns both frames share.
training_columns = ['user_id_encoded', 'movie_id_encoded', 'implicit_feedback']
train_positives = train_df[training_columns]

train_data_final = pd.concat([train_positives, df_negatives], ignore_index=True)

In [6]:
# Separate constant: reassigning NEGATIVE_RATIO here would silently change the
# ratio used by the training negative-sampling cell if that cell were re-run.
VAL_NEGATIVES_PER_USER = 2   # 2 negatives per (single) held-out positive
VAL_SAMPLING_SEED = 42       # fixed so the validation negatives are reproducible

val_rng = np.random.default_rng(VAL_SAMPLING_SEED)

# Build the per-user lookup once instead of scanning val_df for every user.
val_items_by_user = val_df.groupby('user_id_encoded')['movie_id_encoded'].apply(set).to_dict()

# Sorted catalogue so the candidate order (and thus the draw) is deterministic.
catalogue_ids = np.array(sorted(all_movies))

# Create validation dataset with negatives
val_neg_samples = []
for user in tqdm(val_df['user_id_encoded'].unique(), desc='creating val users'):
    # Exclude both the training history and the held-out validation item(s).
    user_watched_movies = user_rated_movies.get(user, set()) | val_items_by_user.get(user, set())
    watched_ids = np.fromiter(
        user_watched_movies, dtype=catalogue_ids.dtype, count=len(user_watched_movies)
    )
    candidates = np.setdiff1d(catalogue_ids, watched_ids, assume_unique=True)
    if candidates.size == 0:
        continue
    n_samples = min(VAL_NEGATIVES_PER_USER, len(candidates))
    neg_movies = val_rng.choice(candidates, size=n_samples, replace=False)
    val_neg_samples.append(
        pd.DataFrame({
            'user_id_encoded': [user] * n_samples,
            'movie_id_encoded': neg_movies,
            'implicit_feedback': [0] * n_samples
        })
    )

# An empty frame keeps the concat below valid when no negatives were produced.
df_val_negatives = pd.concat(val_neg_samples, ignore_index=True) if val_neg_samples else pd.DataFrame()
val_df_with_neg = pd.concat([
    val_df[['user_id_encoded', 'movie_id_encoded', 'implicit_feedback']],
    df_val_negatives
], ignore_index=True)


creating val users: 100%|██████████| 51728/51728 [01:23<00:00, 621.44it/s]


In [None]:
import tensorflow as tf 
# Preparing validation dataset for loss

# Cast the validation columns to the dtypes fed to the network inputs/labels.
val_users = val_df_with_neg['user_id_encoded'].to_numpy(dtype=np.int32)
val_movies = val_df_with_neg['movie_id_encoded'].to_numpy(dtype=np.int32)
val_labels = val_df_with_neg['implicit_feedback'].to_numpy(dtype=np.float32)

# ((user, movie), label) elements, batched and prefetched; no shuffling.
val_features = (val_users, val_movies)
val_dataset = (
    tf.data.Dataset.from_tensor_slices((val_features, val_labels))
    .batch(256)
    .prefetch(tf.data.AUTOTUNE)
)

# Sanity checks on the validation data
print("Validation samples with negatives:", len(val_df_with_neg))
print("Validation data balance:\n", val_df_with_neg['implicit_feedback'].value_counts())
# Held-out movies that never appear in the training catalogue
val_movies_missing = set(val_df['movie_id_encoded'].unique()).difference(all_movies)
print("Validation movies missing in training:", len(val_movies_missing))

In [None]:
# import tensorflow as tf
import numpy as np
# Prepare training dataset
# Explicit dtypes, matching the arrays used for the validation set and model.fit.
train_users = np.array(train_data_final['user_id_encoded'], dtype=np.int32)
train_movies = np.array(train_data_final['movie_id_encoded'], dtype=np.int32)
train_labels = np.array(train_data_final['implicit_feedback'], dtype=np.float32)

# train_data_final stacks all positives ahead of all negatives. A shuffle buffer
# smaller than the dataset only mixes nearby elements, so early batches would be
# almost entirely one label. Permute once up front, then keep the bounded
# buffer for cheap per-epoch reshuffling. The train_* arrays themselves stay in
# row order so they remain aligned with train_data_final.
shuffle_order = np.random.default_rng(42).permutation(len(train_labels))

train_dataset = tf.data.Dataset.from_tensor_slices(
    ((train_users[shuffle_order], train_movies[shuffle_order]), train_labels[shuffle_order])
)
train_dataset = train_dataset.shuffle(buffer_size=100000, seed=42)
train_dataset = train_dataset.batch(batch_size=256).prefetch(tf.data.AUTOTUNE)

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras import layers, regularizers, optimizers

# Embedding tables are indexed by encoded id, so they must cover the largest id
# (max + 1), not the number of distinct ids present in the training frame: an id
# that only occurs in validation would otherwise index past the table.
num_users = int(max(train_data_final['user_id_encoded'].max(),
                    val_df_with_neg['user_id_encoded'].max())) + 1
num_items = int(max(train_data_final['movie_id_encoded'].max(),
                    val_df_with_neg['movie_id_encoded'].max())) + 1


embedding_size = 32
l2_reg = 1e-4

# Scalar integer id inputs (one user id and one movie id per example).
user_input = layers.Input(shape=(), name='user_id_encoded', dtype=tf.int32)
movie_input = layers.Input(shape=(), name='movie_id_encoded', dtype=tf.int32)

user_embedding = layers.Embedding(num_users, embedding_size, name='user_embeddings',
                                 embeddings_regularizer=regularizers.l2(l2_reg))(user_input)
movie_embedding = layers.Embedding(num_items, embedding_size, name='movie_embeddings',
                                  embeddings_regularizer=regularizers.l2(l2_reg))(movie_input)

# MLP tower over the concatenated user/movie embeddings.
interaction = layers.Concatenate()([user_embedding, movie_embedding])

x = layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(l2_reg))(interaction)
x = layers.Dropout(0.4)(x)
x = layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(l2_reg))(x)
x = layers.Dropout(0.4)(x)
x = layers.Dense(16, activation='relu', kernel_regularizer=regularizers.l2(l2_reg))(x)
output = layers.Dense(1, activation='sigmoid', name='output')(x)

model = Model(inputs=[user_input, movie_input], outputs=output)


optimizer = optimizers.Adam(learning_rate=0.0001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train model
# Arrays are rebuilt from the frames here so they are row-aligned with
# train_data_final (the scores are written back into it below).
train_users = np.array(train_data_final['user_id_encoded'], dtype=np.int32)
train_movies = np.array(train_data_final['movie_id_encoded'], dtype=np.int32)
train_labels = np.array(train_data_final['implicit_feedback'], dtype=np.float32)

val_users = np.array(val_df_with_neg['user_id_encoded'], dtype=np.int32)
val_movies = np.array(val_df_with_neg['movie_id_encoded'], dtype=np.int32)
val_labels = np.array(val_df_with_neg['implicit_feedback'], dtype=np.float32)

from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
# Keep the weights with the lowest validation loss on disk and in memory.
checkpoint = ModelCheckpoint('new_best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)
early_stop = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True, verbose=1)

history = model.fit(
    [train_users, train_movies], train_labels,
    validation_data=([val_users, val_movies], val_labels),
    epochs=6, batch_size=256, verbose=1,
    callbacks=[checkpoint, early_stop]
)


model.summary()

# Predict scores
all_scores = model.predict([train_users, train_movies], batch_size=256, verbose=1).flatten()

# Add to DataFrame
train_data_final['ncf_score_2'] = all_scores

# Check score distribution
print("Score statistics:")
print("Max:", train_data_final['ncf_score_2'].max())
print("Mean:", train_data_final['ncf_score_2'].mean())
print("Positives mean:", train_data_final[train_data_final['implicit_feedback'] == 1]['ncf_score_2'].mean())
print("Negatives mean:", train_data_final[train_data_final['implicit_feedback'] == 0]['ncf_score_2'].mean())

# Subset test
subset = train_data_final.sample(1000, random_state=42)
user_ids_subset = np.array(subset['user_id_encoded'], dtype=np.int32)
movie_ids_subset = np.array(subset['movie_id_encoded'], dtype=np.int32)
# Positional boolean masks, so indexing the score arrays does not depend on the
# sampled frame's index labels.
subset_is_positive = subset['implicit_feedback'].to_numpy() == 1
subset_is_negative = subset['implicit_feedback'].to_numpy() == 0
scores_subset = model.predict([user_ids_subset, movie_ids_subset], batch_size=256, verbose=1).flatten()
print("Subset max score:", scores_subset.max())
print("Subset positives mean:", scores_subset[subset_is_positive].mean())
print("Subset negatives mean:", scores_subset[subset_is_negative].mean())

# Debug logits (pre-sigmoid)
# Attach a separate linear head to the last hidden tensor and copy the trained
# output weights into it. The 'output' layer object is shared with `model`, so
# setting its activation to None would also alter `model` itself.
output_layer = model.get_layer('output')
logit_head = layers.Dense(1, activation=None, name='output_logits')
logit_model = Model(inputs=model.inputs, outputs=logit_head(x))
logit_head.set_weights(output_layer.get_weights())
logits_subset = logit_model.predict([user_ids_subset, movie_ids_subset], batch_size=256, verbose=1).flatten()
print("Logits min:", logits_subset.min(), "max:", logits_subset.max())

# Manually apply sigmoid; these should match the model's own scores above.
manual_scores = 1 / (1 + np.exp(-logits_subset))
print("Manual sigmoid max:", manual_scores.max())
print("Manual sigmoid positives mean:", manual_scores[subset_is_positive].mean())
print("Manual sigmoid negatives mean:", manual_scores[subset_is_negative].mean())

# # Verify evaluation
# hr, ndcg = evaluate_model(model, val_df, user_rated_movies, all_movies, max_users=1000, num_negatives=100)
# print(f"Test HR@10: {hr:.4f}, NDCG@10: {ndcg:.4f}")


Num users: 51728 Num items: 9029
Original movies: 9029
Training movies: 9029
Final training movies: 9029
All movies set: 9029
Epoch 1/6
Epoch 1: val_loss improved from inf to 0.33200, saving model to new_best_model.h5
Epoch 2/6
Epoch 2: val_loss improved from 0.33200 to 0.32285, saving model to new_best_model.h5
Epoch 3/6
Epoch 3: val_loss improved from 0.32285 to 0.30861, saving model to new_best_model.h5
Epoch 4/6
Epoch 4: val_loss improved from 0.30861 to 0.29900, saving model to new_best_model.h5
Epoch 5/6
Epoch 5: val_loss improved from 0.29900 to 0.29618, saving model to new_best_model.h5
Epoch 6/6
Epoch 6: val_loss did not improve from 0.29618
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_id_encoded (InputLayer)   [(None,)]            0           []                               
                             

In [8]:
from tensorflow.keras.models import load_model

# Reload the checkpoint file written by the ModelCheckpoint callback during training.
best_model_path = 'new_best_model.h5'
model = load_model(best_model_path)


# Evaluation 

In [11]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, Model

# Verify data consistency
val_movies = set(val_df['movie_id_encoded'].unique())
all_movies = set(train_data_final['movie_id_encoded'].unique())
# Held-out movies that never appear in the training data.
missing_movies = val_movies - all_movies
print("Validation movies missing in training:", len(missing_movies))

user_rated_movies = train_df.groupby('user_id_encoded')['movie_id_encoded'].apply(set).to_dict()
# A user has no candidate negatives exactly when their training history covers
# the whole catalogue; the subset test avoids building a set difference per user.
skipped_users = sum(
    1
    for user in val_df['user_id_encoded'].unique()
    if all_movies <= user_rated_movies.get(user, set())
)
print("Users with no non-rated items:", skipped_users)

# Optimized evaluation function
def evaluate_model(model, val_df, user_rated_movies, all_movies, num_negatives=50, k=10, max_users=None, seed=None):
    """Compute leave-one-out HR@k and NDCG@k using sampled negatives.

    For every evaluated user, the held-out item is scored together with up to
    ``num_negatives`` items the user has not interacted with, and the rank of
    the held-out item among those candidates is measured.

    Args:
        model: Object exposing ``predict([user_ids, item_ids], verbose=..., batch_size=...)``
            that returns one score per (user, item) pair.
        val_df: DataFrame with 'user_id_encoded' and 'movie_id_encoded' columns;
            the first row of each user is used as that user's held-out positive.
        user_rated_movies: Mapping from user id to the set of item ids the user
            already interacted with (excluded from the negatives).
        all_movies: Set of item ids negatives are drawn from. Validation rows
            whose item is not in this set are dropped.
        num_negatives: Maximum number of negatives sampled per user.
        k: Cut-off for both metrics.
        max_users: If truthy, only the first ``max_users`` users (in order of
            first appearance in ``val_df``) are evaluated.
        seed: Optional seed for the negative sampler; ``None`` draws fresh
            entropy on each call.

    Returns:
        Tuple ``(hr, ndcg)`` averaged over the evaluated users, or
        ``(0.0, 0.0)`` when there is nothing to evaluate.
    """
    # Filter valid users
    val_df = val_df[val_df['movie_id_encoded'].isin(all_movies)]

    if val_df.empty:
        print("No valid validation users after filtering.")
        return 0.0, 0.0

    rng = np.random.default_rng(seed)

    # Resolve each user's held-out item once instead of scanning val_df per user.
    first_rows = val_df.drop_duplicates('user_id_encoded')
    users_to_eval = first_rows['user_id_encoded'].to_numpy()
    pos_item_by_user = dict(zip(users_to_eval, first_rows['movie_id_encoded'].to_numpy()))

    if max_users:
        users_to_eval = users_to_eval[:max_users]

    hr, ndcg = [], []
    batch_size = 1000  # users scored per predict() call
    skipped_users = 0

    for start in range(0, len(users_to_eval), batch_size):
        batch_users = users_to_eval[start:start + batch_size]
        batch_users_list, batch_items_list = [], []
        candidate_counts = []  # candidates per user, positive always first

        for user in batch_users:
            pos_item = pos_item_by_user[user]
            # The held-out item must not be drawn as a negative: it would appear
            # twice among the candidates and inflate the hit rate.
            non_rated = list(all_movies - user_rated_movies.get(user, set()) - {pos_item})
            if not non_rated:
                skipped_users += 1
                continue
            neg_items = rng.choice(non_rated, size=min(num_negatives, len(non_rated)), replace=False)

            n_candidates = len(neg_items) + 1
            batch_users_list.extend([user] * n_candidates)
            batch_items_list.append(pos_item)
            batch_items_list.extend(neg_items)
            candidate_counts.append(n_candidates)

        if not batch_users_list:
            continue

        scores = model.predict(
            [np.array(batch_users_list, dtype=np.int32), np.array(batch_items_list, dtype=np.int32)],
            verbose=0,
            batch_size=256
        ).flatten()

        offset = 0
        for n_candidates in candidate_counts:
            # Slice by the number of candidates actually scored for this user;
            # it is smaller than num_negatives + 1 when few candidates exist.
            user_scores = scores[offset:offset + n_candidates]
            offset += n_candidates

            top_k_indices = np.argsort(user_scores)[::-1][:k]
            # The positive sits at position 0 of each user's candidate list.
            hit_positions = np.flatnonzero(top_k_indices == 0)

            if hit_positions.size:
                hr.append(1)
                ndcg.append(1.0 / np.log2(hit_positions[0] + 2))
            else:
                hr.append(0)
                ndcg.append(0.0)

    if skipped_users:
        print(f"Skipped {skipped_users} users with no non-rated items.")

    return np.mean(hr) if hr else 0.0, np.mean(ndcg) if ndcg else 0.0

# Prepare evaluation data
# Training-only interaction sets per user; evaluate_model excludes these items
# when sampling negatives. (Same expressions as in the consistency check at the
# top of this cell.)
user_rated_movies = train_df.groupby('user_id_encoded')['movie_id_encoded'].apply(set).to_dict()
# Candidate catalogue: every movie id present in the final training frame.
all_movies = set(train_data_final['movie_id_encoded'].unique())


Validation movies missing in training: 0
Users with no non-rated items: 0


In [13]:
# Verify evaluation for all users
# max_users=None evaluates every user in val_df; each held-out item is ranked
# against up to 100 sampled negatives. The printed label says "Test", but the
# data passed in is the validation split (val_df).
hr, ndcg = evaluate_model(model, val_df, user_rated_movies, all_movies, max_users=None, num_negatives=100)
print(f"Test HR@10: {hr:.4f}, NDCG@10: {ndcg:.4f}")

Test HR@10: 0.8674, NDCG@10: 0.5948


In [14]:
# Verify evaluation on a 1000-user subset
# NOTE: max_users=1000 takes the first 1000 users in val_df order, not a random
# sample; the negatives drawn for them are what is random.
hr, ndcg = evaluate_model(model, val_df, user_rated_movies, all_movies, max_users=1000, num_negatives=100)
print(f"Test HR@10: {hr:.4f}, NDCG@10: {ndcg:.4f}")

Test HR@10: 0.8790, NDCG@10: 0.5930
