In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error


In [2]:
# Read the three competition CSV files in a fixed order: the training ratings,
# the books table, and the test rows that get scored further down.
train_data, books_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/dis-project-2-recommender-systems-f2024/train.csv',
        '/kaggle/input/dis-project-2-recommender-systems-f2024/books.csv',
        '/kaggle/input/dis-project-2-recommender-systems-f2024/test.csv',
    )
)

In [3]:
# Reshape the long-format ratings into a table with one row per user and one
# column per book; (user, book) pairs absent from train_data come out as NaN.
user_item_matrix = train_data.pivot(
    values='rating',
    index='user_id',
    columns='book_id',
)

# Mean over the non-null ratings in the table; it stands in for every missing
# cell so that each user row becomes a fully populated vector.
global_mean = user_item_matrix.stack().mean()
user_item_matrix_filled = user_item_matrix.fillna(value=global_mean)

# Pairwise cosine similarity between the filled user rows, then wrapped in a
# DataFrame whose rows and columns are both labelled with the user ids.
user_similarity = cosine_similarity(user_item_matrix_filled)
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)

In [4]:
# Define rating prediction function
def predict_rating(user_id, book_id, user_item_matrix, user_similarity_df, default_rating=None):
    """Predict one user's rating for one book as a similarity-weighted average.

    The estimate is ``sum(sim * rating) / sum(|sim|)`` taken over every user
    that has a non-null rating for ``book_id`` in ``user_item_matrix`` and a
    known similarity to ``user_id`` in ``user_similarity_df``.

    Parameters
    ----------
    user_id
        Label looked up in ``user_item_matrix.index`` and
        ``user_similarity_df.index``.
    book_id
        Label looked up in ``user_item_matrix.columns``.
    user_item_matrix : pandas.DataFrame
        Ratings with users as rows and books as columns; null means
        "not rated".
    user_similarity_df : pandas.DataFrame
        User-by-user similarity scores; rows and columns are labelled with
        user ids.
    default_rating : float, optional
        Value returned when no prediction can be formed. When omitted, the
        module-level ``global_mean`` is used, which keeps existing four-argument
        calls behaving as before.

    Returns
    -------
    float
        The weighted average, or the fallback value when the user or book is
        unknown, nobody with a known similarity has rated the book, or the
        absolute similarities sum to zero.

    Notes
    -----
    If ``user_id`` has itself rated ``book_id``, that rating takes part in the
    average with whatever self-similarity ``user_similarity_df`` holds.
    """

    def fallback():
        # Resolved lazily so the module-level global is only touched when a
        # fallback is actually needed and no explicit default was supplied.
        return global_mean if default_rating is None else default_rating

    unknown_pair = (
        user_id not in user_item_matrix.index
        or book_id not in user_item_matrix.columns
        or user_id not in user_similarity_df.index
    )
    if unknown_pair:
        return fallback()

    # Ratings of the users who actually rated this book.
    neighbour_ratings = user_item_matrix[book_id].dropna()
    if neighbour_ratings.empty:
        return fallback()

    # Align similarities to the raters by label. reindex() yields NaN (instead
    # of raising KeyError) for raters missing from the similarity frame; those
    # neighbours are dropped because their weight is unknown.
    neighbour_sims = user_similarity_df.loc[user_id].reindex(neighbour_ratings.index)
    usable = neighbour_sims.notna()
    neighbour_sims = neighbour_sims[usable]
    neighbour_ratings = neighbour_ratings[usable]

    weight_total = neighbour_sims.abs().sum()
    if weight_total == 0:
        return fallback()  # Avoid division by zero

    return np.dot(neighbour_sims, neighbour_ratings) / weight_total


In [5]:
# Generate predictions for the test dataset.
# The id / user_id / book_id columns are walked in lockstep instead of using
# DataFrame.iterrows(): iterrows() builds a Series per row, which coerces that
# row's values to one common dtype (so integer ids can come back as floats and
# end up written as e.g. "1.0" in the submission) and is slow on large frames.
predictions = [
    {
        'id': row_id,
        'rating': predict_rating(user_id, book_id, user_item_matrix, user_similarity_df),
    }
    for row_id, user_id, book_id in zip(
        test_data['id'], test_data['user_id'], test_data['book_id']
    )
]

# Convert to DataFrame format (one row per test id, columns: id, rating)
predictions_df = pd.DataFrame(predictions)

# Save the predictions
predictions_df.to_csv('/kaggle/working/submission.csv', index=False)