In [41]:
import pandas as pd
import numpy as np


def ndcg_at_k(recommended_items, relevant_items, k):
    dcg = 0
    idcg = 0
    for i in range(k):
        if i < len(recommended_items):
            if recommended_items[i] in relevant_items:
                dcg += 1 / np.log2(i + 2)
        if i < len(relevant_items):
            idcg += 1 / np.log2(i + 2)
    return dcg / idcg if idcg > 0 else 0

def precision_at_k(recommended_items, relevant_items, k):
    num_hits = 0
    for item in recommended_items[:k]:
        if item in relevant_items:
            num_hits += 1
    return num_hits / k

In [26]:
user_df = pd.read_csv("ml-100k/u.user", sep="|", header=None, names=['user_id','age','gender','occupation','zip_code'],encoding='latin1')
ratings_df = pd.read_csv("ml-100k/u.data", sep="\t", header=None, names=["user_id", "movie_id", "rating", "timestamp"], encoding='latin1')
items_df = pd.read_csv("ml-100k/u.item", sep="|", header=None, names=["movie_id", "movie_title", "release_date", "video_release_date", "IMDb_URL", "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"], encoding='latin1')

In [27]:
user_df.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [28]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [29]:
items_df.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [39]:
import pandas as pd
from surprise import Dataset, Reader
from surprise import KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.prediction_algorithms.knns import ndcg_at_k, precision_at_k


merged_df = pd.merge(ratings_df, user_df, on='user_id')

# Initialize a Reader object for parsing the ratings data
reader = Reader(rating_scale=(1, 5))

# Load the merged DataFrame into a Surprise Dataset
data = Dataset.load_from_df(merged_df[['user_id', 'movie_id', 'rating']], reader)
# Load data into Surprise Dataset with additional user information

# Use user-based collaborative filtering with cosine similarity
sim_options = {
    'name': 'pearson',
    'user_based': True
}

# Instantiate the KNNBasic algorithm
model = KNNBasic(sim_options=sim_options)

# Split the data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train the model on the trainset
model.fit(trainset)

# Make predictions on the testset
predictions = model.test(testset)

# Compute RMSE and MAE
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

# Define a function to compute NDCG@k and Precision@k
def ndcg_and_precision_at_k(predictions, k):
    ndcg_sum = 0
    precision_sum = 0
    total_users = len(set([pred.uid for pred in predictions]))
    for uid in set([pred.uid for pred in predictions]):
        # Get top-K recommended items for the user
        user_recs = [pred for pred in predictions if pred.uid == uid]
        user_recs.sort(key=lambda x: x.est, reverse=True)  # Sort predictions by estimated rating
        top_k_recs = [pred.iid for pred in user_recs[:k]]  # Get top-K recommended items
        # Get the ground truth items for the user from the test set
        true_items = [item_id for (_, item_id, _) in testset if uid == item_id]
        # Calculate NDCG@k
        ndcg_sum += ndcg_at_k(top_k_recs, true_items, k)
        # Calculate Precision@k
        precision_sum += precision_at_k(top_k_recs, true_items, k)
    ndcg_at_k_avg = ndcg_sum / total_users
    precision_at_k_avg = precision_sum / total_users
    return ndcg_at_k_avg, precision_at_k_avg

# Compute NDCG@k and Precision@k
k = 10  # Top-K recommendations
ndcg_at_k_avg, precision_at_k_avg = ndcg_and_precision_at_k(predictions, k)

# Print evaluation metrics
# print("RMSE:", rmse)
# print("MAE:", mae)
print("NDCG@{}:".format(k), ndcg_at_k_avg)
print("Precision@{}:".format(k), precision_at_k_avg)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.0116
MAE:  0.8067
NDCG@10: 0.0005134432311275135
Precision@10: 0.0006382978723404255


<h3> Improved Collaborative Filtering </h3> 

In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

# Merge dataframes
merged_df = pd.merge(ratings_df, user_df, on='user_id')

# Split the data into train and test sets
train_data, test_data = train_test_split(merged_df, test_size=0.2, random_state=42)

# Create user-item matrix
user_item_matrix = train_data.pivot_table(index='user_id', columns='movie_id', values='rating')

# Fill NaN values with 0
user_item_matrix = user_item_matrix.fillna(0)

# Compute cosine similarity
user_similarity = cosine_similarity(user_item_matrix)

# Function to predict ratings
def predict_ratings(user_id, movie_id):
    if user_id not in user_item_matrix.index or movie_id not in user_item_matrix.columns:
        return 0
    user_ratings = user_item_matrix.loc[user_id]
    similar_users = user_similarity[user_id - 1]
    weighted_sum = 0
    similarity_sum = 0
    for other_user_id, similarity in enumerate(similar_users):
        if other_user_id + 1 != user_id and user_item_matrix.loc[other_user_id + 1, movie_id] != 0:
            weighted_sum += similarity * user_item_matrix.loc[other_user_id + 1, movie_id]
            similarity_sum += similarity
    if similarity_sum == 0:
        return 0
    return weighted_sum / similarity_sum

# Test the model
def evaluate_model(test_data):
    predictions = []
    for _, row in test_data.iterrows():
        user_id = row['user_id']
        movie_id = row['movie_id']
        rating = row['rating']
        predicted_rating = predict_ratings(user_id, movie_id)
        predictions.append((user_id, movie_id, rating, predicted_rating))
    return predictions

# Compute predictions
predictions = evaluate_model(test_data)

# Compute RMSE, MAE, NDCG@k, and Precision@k
# Add your code to compute these metrics based on the predictions


In [61]:
# Function to compute NDCG@k and Precision@k
def compute_ndcg_and_precision(predictions, k):
    ndcg_sum = 0
    precision_sum = 0
    total_users = len(set([user_id for user_id, _, _, _ in predictions]))
    for user_id in set([user_id for user_id, _, _, _ in predictions]):
        user_predictions = [pred for pred in predictions if pred[0] == user_id]
        user_predictions.sort(key=lambda x: x[3], reverse=True)  # Sort predictions by predicted rating
        top_k_predictions = user_predictions[:k]  # Get top-K predicted ratings
        true_items = [pred[1] for pred in user_predictions]  # Ground truth items
        if len(true_items) >= k:  # Check if the length of true_items is greater than or equal to k
            # Calculate NDCG@k
            ideal_dcg = sum([true_items[i] / np.log2(i + 2) for i in range(min(k, len(true_items)))])
            actual_dcg = sum([true_items[i] / np.log2(i + 2) for i in range(k)])
            ndcg = actual_dcg / ideal_dcg if ideal_dcg > 0 else 0
            ndcg_sum += ndcg
            # Calculate Precision@k
            top_k_predicted_items = [pred[1] for pred in top_k_predictions]
            relevant_items = set(true_items)
            precision = len(set(top_k_predicted_items) & relevant_items) / k
            precision_sum += precision
    ndcg_avg = ndcg_sum / total_users
    precision_avg = precision_sum / total_users
    return ndcg_avg, precision_avg

# Define K for NDCG@k and Precision@k
k = 10

# Compute NDCG@k and Precision@k
ndcg_at_k, precision_at_k = compute_ndcg_and_precision(predictions, k)

# Print the evaluation metrics
print("NDCG@{}:".format(k), ndcg_at_k)
print("Precision@{}:".format(k), precision_at_k)


NDCG@10: 0.6167728237791932
Precision@10: 0.6167728237791932
