In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import *
from surprise import *
from surprise.model_selection import *
from sklearn.decomposition import NMF
from tensorflow import keras
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares 
from scipy.spatial.distance import pdist, squareform
from keras import layers, Input, Model

In [2]:
movies_path = "ml-100k/ml-100k/u.item"
train_path = "ml-100k/ml-100k/u3.base"
test_path = "ml-100k/ml-100k/u3.test"

In [3]:
column_names = ['user_id', 'movie_id', 'rating', 'timestamp']
test_ratings = pd.read_csv(test_path, sep='\t', names=column_names)
train_ratings = pd.read_csv(train_path, sep='\t', names=column_names)
movies = pd.read_csv(movies_path, sep='|', header=None, encoding='latin1')
movies = movies[[0,1]]
movies.columns = ['movie_id', 'title']
print(train_ratings.head())
print(test_ratings.head())
print(movies.head())

   user_id  movie_id  rating  timestamp
0        1         1       5  874965758
1        1         2       3  876893171
2        1         3       4  878542960
3        1         4       3  876893119
4        1         6       5  887431973
   user_id  movie_id  rating  timestamp
0        1         5       3  889751712
1        1        11       2  875072262
2        1        16       5  878543541
3        1        25       4  875071805
4        1        35       1  878542420
   movie_id              title
0         1   Toy Story (1995)
1         2   GoldenEye (1995)
2         3  Four Rooms (1995)
3         4  Get Shorty (1995)
4         5     Copycat (1995)


In [4]:
user_item_matrix = train_ratings.pivot_table(index='user_id', columns='movie_id', values='rating')
# print(user_item_matrix.head())
user_item_matrix_filled = user_item_matrix.fillna(0)
user_distances = pairwise_distances(user_item_matrix_filled, metric='cosine')
user_similarity = 1-user_distances
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)
# print(user_similarity_df.head())
def get_top_k_similar_users(user_id, k = 5):
    similar_users = user_similarity_df[user_id].sort_values(ascending = False)
    similar_users = similar_users.drop(user_id)
    top_k_users = similar_users.head(k).index
    return top_k_users
def predict_ratings(user_id, k = 5):
    top_k_users = get_top_k_similar_users(user_id, k)
    similar_users_ratings = user_item_matrix.loc[top_k_users]

    user_similarity_scores = user_similarity_df.loc[user_id, top_k_users]
    weighted_ratings = similar_users_ratings.mul(user_similarity_scores, axis = 0).sum(axis = 0)

    sum_of_similarity_scores = user_similarity_scores.sum()
    if sum_of_similarity_scores > 0:
        predict_ratings = weighted_ratings / sum_of_similarity_scores
    else:
        predict_ratings = pd.Series(index = weighted_ratings.index)

    rated_items = user_item_matrix.loc[user_id].dropna().index
    predict_ratings = predict_ratings.drop(rated_items, errors = 'ignore')

    predict_ratings = predict_ratings.sort_values(ascending=False)
    return predict_ratings
def evaluate_stats_approach(test_data, k = 5):
    test_data['predicted_rating'] = test_data.apply(lambda row: predict_ratings(row['user_id'], k).get(row['movie_id'], np.nan), axis=1)
    test_data = test_data.dropna()
    rmse = np.sqrt(mean_squared_error(test_data['rating'], test_data['predicted_rating']))
    return rmse
rmse_statistical = evaluate_stats_approach(test_ratings)
print(f"RMSE for statistical approach: {rmse_statistical:.4f}")

RMSE for statistical approach: 2.4578


In [5]:
# Normalize the user-item matrix
user_item_matrix_centered = user_item_matrix.sub(user_item_matrix.mean(axis=1), axis=0).fillna(0)
user_distances = pairwise_distances(user_item_matrix_centered, metric='cosine')
user_similarity = 1 - user_distances
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

def get_top_k_similar_users(user_id, k=5):
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)
    similar_users = similar_users.drop(user_id)
    top_k_users = similar_users.head(k).index
    return top_k_users

def predict_ratings(user_id, k=5):
    top_k_users = get_top_k_similar_users(user_id, k)
    similar_users_ratings = user_item_matrix.loc[top_k_users]
    user_similarity_scores = user_similarity_df.loc[user_id, top_k_users]
    weighted_ratings = similar_users_ratings.mul(user_similarity_scores, axis=0).sum(axis=0)
    sum_of_similarity_scores = user_similarity_scores.sum()
    
    if sum_of_similarity_scores > 0:
        predict_ratings = weighted_ratings / sum_of_similarity_scores
    else:
        predict_ratings = pd.Series(index=weighted_ratings.index)
    
    rated_items = user_item_matrix.loc[user_id].dropna().index
    predict_ratings = predict_ratings.drop(rated_items, errors='ignore')
    predict_ratings = predict_ratings.sort_values(ascending=False)
    return predict_ratings

def evaluate_stats_approach(test_data, k=5):
    test_data['predicted_rating'] = test_data.apply(lambda row: predict_ratings(row['user_id'], k).get(row['movie_id'], np.nan), axis=1)
    test_data = test_data.dropna()
    rmse = np.sqrt(mean_squared_error(test_data['rating'], test_data['predicted_rating']))
    return rmse

rmse_statistical = evaluate_stats_approach(test_ratings)
print(f"RMSE for statistical approach: {rmse_statistical:.4f}")

RMSE for statistical approach: 2.9208


In [6]:
def get_top_k_similar_users(user_id, k=5):
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)
    similar_users = similar_users.drop(user_id)
    top_k_users = similar_users.head(k).index
    return top_k_users

def predict_ratings(user_id, k=5):
    top_k_users = get_top_k_similar_users(user_id, k)
    similar_users_ratings = user_item_matrix.loc[top_k_users]
    user_similarity_scores = user_similarity_df.loc[user_id, top_k_users]
    weighted_ratings = similar_users_ratings.mul(user_similarity_scores, axis=0).sum(axis=0)
    sum_of_similarity_scores = user_similarity_scores.sum()
    
    if sum_of_similarity_scores > 0:
        predict_ratings = weighted_ratings / sum_of_similarity_scores
    else:
        predict_ratings = pd.Series(index=weighted_ratings.index)
    
    rated_items = user_item_matrix.loc[user_id].dropna().index
    predict_ratings = predict_ratings.drop(rated_items, errors='ignore')
    predict_ratings = predict_ratings.sort_values(ascending=False)
    return predict_ratings

# Hierarchical clustering functions
def find_closest_clusters(distance_matrix):
    # Use np.triu_indices to get upper triangle indices
    i, j = np.triu_indices(distance_matrix.shape[0], k=1)
    min_index = np.argmin(distance_matrix[i, j])
    return i[min_index], j[min_index]

def hierarchical_clustering(data, num_clusters, max_iterations=1000):
    clusters = {i: [i] for i in range(len(data))}
    distance_matrix = squareform(pdist(data, metric='euclidean'))
    
    iteration = 0
    while len(clusters) > num_clusters and iteration < max_iterations:
        print(f"Iteration {iteration}, Number of clusters: {len(clusters)}")
        
        i, j = find_closest_clusters(distance_matrix)
        
        if i not in clusters or j not in clusters:
            print(f"Unexpected state: clusters {i} or {j} not found")
            break
        
        print(f"Merging clusters {i} and {j}")
        clusters[i].extend(clusters[j])
        del clusters[j]
        
        for k in list(clusters.keys()):
            if k != i:
                dist = min(distance_matrix[i, k], distance_matrix[j, k])
                distance_matrix[i, k] = dist
                distance_matrix[k, i] = dist
        
        distance_matrix[j, :] = np.inf
        distance_matrix[:, j] = np.inf
        
        iteration += 1
    
    if iteration == max_iterations:
        print("Warning: Maximum iterations reached. Clustering may be incomplete.")
    
    return clusters

import time

def predict_ratings_cluster_optimized(user_ids, movie_ids, clusters, user_item_matrix, default_rating=3.0, batch_size=1000):
    user_to_cluster = {}
    for cluster_id, users in clusters.items():
        for user in users:
            user_to_cluster[user] = cluster_id

    predictions = np.full(len(user_ids), default_rating)
    
    unique_clusters = set(user_to_cluster.values())
    total_clusters = len(unique_clusters)
    
    for i, cluster_id in enumerate(unique_clusters):
        if i % 10 == 0:  # Print progress every 10 clusters
            print(f"Processing cluster {i+1}/{total_clusters}")
        
        cluster_mask = np.array([user_to_cluster.get(user, -1) == cluster_id for user in user_ids])
        if not np.any(cluster_mask):
            continue
        
        cluster_users = [user for user, c_id in user_to_cluster.items() if c_id == cluster_id]
        cluster_users = [u for u in cluster_users if u in user_item_matrix.index]
        
        if not cluster_users:
            continue
        
        cluster_ratings = user_item_matrix.loc[cluster_users]
        mean_ratings = cluster_ratings.mean(axis=0)
        
        # Process in batches
        mask_indices = np.where(cluster_mask)[0]
        for start in range(0, len(mask_indices), batch_size):
            end = start + batch_size
            batch_indices = mask_indices[start:end]
            batch_movies = movie_ids[batch_indices]
            batch_predictions = mean_ratings.reindex(batch_movies).fillna(default_rating).values
            predictions[batch_indices] = batch_predictions

    return predictions

def evaluate_cluster_approach_optimized(test_data, clusters, user_item_matrix, default_rating=3.0, batch_size=1000):
    print("Starting evaluation...")
    start_time = time.time()

    user_ids = test_data['user_id'].values
    movie_ids = test_data['movie_id'].values
    
    print("Predicting ratings...")
    predicted_ratings = predict_ratings_cluster_optimized(user_ids, movie_ids, clusters, user_item_matrix, default_rating, batch_size)
    
    print("Calculating RMSE...")
    test_data['predicted_rating'] = predicted_ratings
    rmse = np.sqrt(mean_squared_error(test_data['rating'], test_data['predicted_rating']))
    
    print(f"Evaluation completed in {time.time() - start_time:.2f} seconds")
    return rmse

# Number of clusters
num_clusters = 2

# Performing hierarchical clustering on the user-item matrix
clusters = hierarchical_clustering(user_item_matrix.values, num_clusters)
# Evaluate the hierarchical clustering approach
rmse_cluster = evaluate_cluster_approach_optimized(test_ratings, clusters, user_item_matrix)
print(f"RMSE for hierarchical clustering approach: {rmse_cluster:.4f}")

Iteration 0, Number of clusters: 943
Merging clusters 0 and 1
Iteration 1, Number of clusters: 942
Merging clusters 0 and 2
Iteration 2, Number of clusters: 941
Merging clusters 0 and 3
Iteration 3, Number of clusters: 940
Merging clusters 0 and 4
Iteration 4, Number of clusters: 939
Merging clusters 0 and 5
Iteration 5, Number of clusters: 938
Merging clusters 0 and 6
Iteration 6, Number of clusters: 937
Merging clusters 0 and 7
Iteration 7, Number of clusters: 936
Merging clusters 0 and 8
Iteration 8, Number of clusters: 935
Merging clusters 0 and 9
Iteration 9, Number of clusters: 934
Merging clusters 0 and 10
Iteration 10, Number of clusters: 933
Merging clusters 0 and 11
Iteration 11, Number of clusters: 932
Merging clusters 0 and 12
Iteration 12, Number of clusters: 931
Merging clusters 0 and 13
Iteration 13, Number of clusters: 930
Merging clusters 0 and 14
Iteration 14, Number of clusters: 929
Merging clusters 0 and 15
Iteration 15, Number of clusters: 928
Merging clusters 0 an

In [7]:
reader = Reader(rating_scale=(1,5))
train_data = Dataset.load_from_df(train_ratings[['user_id', 'movie_id', 'rating']], reader)
trainset = train_data.build_full_trainset()
test_data = Dataset.load_from_df(test_ratings[['user_id', 'movie_id', 'rating']], reader)
testset = test_data.construct_testset(test_data.raw_ratings)
svd = SVD()
svd.fit(trainset)

user_id = 1

predictions = svd.test(testset)
predicted_ratings = [pred.est for pred in predictions]
true_ratings = [pred.r_ui for pred in predictions]
rmse_model_based = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
print(f"RMSE for model-based approach (SVD): {rmse_model_based:.4f}")

RMSE for model-based approach (SVD): 0.9351


In [8]:
from surprise.model_selection import GridSearchCV
# the parameter grid
param_grid = {
    'n_factors': [50, 100, 150],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.02, 0.04, 0.06]
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(train_data)

# best parameters
best_params = gs.best_params['rmse']
print(f"Best parameters: {best_params}")

best_svd = SVD(**best_params)
best_svd.fit(trainset)

predictions = best_svd.test(testset)
predicted_ratings = [pred.est for pred in predictions]
true_ratings = [pred.r_ui for pred in predictions]
rmse_optimized = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
print(f"Optimized RMSE: {rmse_optimized:.4f}")

Best parameters: {'n_factors': 50, 'lr_all': 0.005, 'reg_all': 0.06}
Optimized RMSE: 0.9318


In [9]:
user_ids = train_ratings['user_id'].unique()
movie_ids = train_ratings['movie_id'].unique()

user_id_map = {id: i for i, id in enumerate(user_ids)}
movie_id_map = {id: i for i, id in enumerate(movie_ids)}

# Add a special index for unknown users/movies
unknown_user_id = len(user_id_map)
unknown_movie_id = len(movie_id_map)

# Map IDs, assigning unknown IDs to a special index
train_ratings['user_id_mapped'] = train_ratings['user_id'].map(user_id_map)
train_ratings['movie_id_mapped'] = train_ratings['movie_id'].map(movie_id_map)
test_ratings['user_id_mapped'] = test_ratings['user_id'].map(user_id_map).fillna(unknown_user_id).astype(int)
test_ratings['movie_id_mapped'] = test_ratings['movie_id'].map(movie_id_map).fillna(unknown_movie_id).astype(int)

# Create Neural Collaborative Filtering model
num_users = len(user_ids) + 1  # +1 for unknown users
num_movies = len(movie_ids) + 1  # +1 for unknown movies

user_input = Input(shape=(1,))
user_embedding = layers.Embedding(num_users, 50)(user_input)
user_vec = layers.Flatten()(user_embedding)

movie_input = Input(shape=(1,))
movie_embedding = layers.Embedding(num_movies, 50)(movie_input)
movie_vec = layers.Flatten()(movie_embedding)

concat = layers.Concatenate()([user_vec, movie_vec])
dense = layers.Dense(128, activation='relu')(concat)
output = layers.Dense(1)(dense)

model = Model(inputs=[user_input, movie_input], outputs=output)
model.compile(optimizer='adam', loss='mse')

model.fit([train_ratings['user_id_mapped'], train_ratings['movie_id_mapped']], train_ratings['rating'], epochs=5, verbose=1)

predicted_ratings = model.predict([test_ratings['user_id_mapped'], test_ratings['movie_id_mapped']])
rmse_ncf = np.sqrt(mean_squared_error(test_ratings['rating'], predicted_ratings))
print(f"RMSE for NCF approach: {rmse_ncf:.4f}")


Epoch 1/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 773us/step - loss: 2.3536
Epoch 2/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 882us/step - loss: 0.8972
Epoch 3/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 739us/step - loss: 0.8531
Epoch 4/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 728us/step - loss: 0.8213
Epoch 5/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 758us/step - loss: 0.7893
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 571us/step
RMSE for NCF approach: 0.9283
