# In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets
train = pd.read_csv('/kaggle/input/dis-project-2-recommender-systems-f2024/train.csv')  # book_id, user_id, rating
test = pd.read_csv('/kaggle/input/dis-project-2-recommender-systems-f2024/test.csv')    # id, book_id, user_id

# Create a pivot table (book-user matrix): one row per book_id, one column
# per user_id. (book, user) pairs without a rating in `train` are NaN.
ratings_matrix = train.pivot_table(index='book_id', columns='user_id', values='rating')

# Fill missing values with item mean (the mean of each book's row, NaN skipped).
item_mean_ratings = ratings_matrix.mean(axis=1)
# Vectorized imputation: keep observed cells, take the row's mean elsewhere.
# `axis=0` aligns `item_mean_ratings` on the book_id index, so each row is
# filled with its own mean without a Python-level loop over the rows.
ratings_matrix_filled = ratings_matrix.where(ratings_matrix.notna(), item_mean_ratings, axis=0)

# Compute item-based cosine similarity on the matrix filled with item means.
# The result is a square book x book matrix, labelled with book_id on both axes.
item_similarity = cosine_similarity(ratings_matrix_filled)
item_similarity_df = pd.DataFrame(item_similarity, index=ratings_matrix.index, columns=ratings_matrix.index)

# NOTE: an alternative that was tried is to compute the similarity on the
# zero-filled matrix instead, i.e. cosine_similarity(ratings_matrix.fillna(0)).

# Function to predict rating using item-based collaborative filtering
def predict_rating(user_id, book_id, ratings_matrix, item_similarity):
    """Predict the rating ``user_id`` would give ``book_id``.

    The prediction is the similarity-weighted average of the ratings the user
    already has, where the weights are the similarities between ``book_id``
    and each of those books.

    Args:
        user_id: Column label of the user in ``ratings_matrix``.
        book_id: Row label of the target book in ``ratings_matrix`` and
            row/column label in ``item_similarity``.
        ratings_matrix: DataFrame indexed by book with one column per user.
            Only entries ``> 0`` count as ratings; NaN entries are ignored.
        item_similarity: Square DataFrame of book-to-book similarities,
            labelled by book on both axes.

    Returns:
        The predicted rating as a float. Fallbacks, in order:
        * user unknown or without any rating -> mean of the book's row, or
          the mean of the whole matrix when the book is unknown too;
        * book unknown -> mean of the user's ratings;
        * all similarities are zero -> mean of the user's ratings.
    """
    book_known = book_id in ratings_matrix.index and book_id in item_similarity.index

    # Ratings the user actually has (NaN > 0 is False, so NaN cells drop out).
    if user_id in ratings_matrix.columns:
        user_ratings = ratings_matrix.loc[:, user_id]
        ratings = user_ratings[user_ratings > 0]
    else:
        ratings = None

    # Cold-start user: nothing to weight, so fall back to what is known
    # about the book, or to the global mean as a last resort.
    if ratings is None or ratings.empty:
        if book_known:
            return ratings_matrix.loc[book_id].mean()
        return np.nanmean(ratings_matrix.to_numpy(dtype=float))

    # Cold-start book: no similarities available, use the user's own mean.
    if not book_known:
        return ratings.mean()

    # Get similarities with the target book from the matrix passed in
    # (not from a module-level global).
    similarities = item_similarity.loc[book_id, ratings.index]

    # Compute weighted average
    weighted_sum = np.dot(similarities.to_numpy(), ratings.to_numpy())
    sim_sum = np.abs(similarities).sum()

    # Return prediction; with no usable similarity use the user's mean rating.
    return weighted_sum / sim_sum if sim_sum > 0 else ratings.mean()


# Predict ratings for all test entries.
# The unfilled `ratings_matrix` is passed on purpose: predict_rating keeps only
# entries > 0 as "books the user rated", and NaN cells fail that test. In the
# mean-filled matrix no cell is NaN, so every book (including the target book
# itself and all imputed means) would be treated as rated by the user.
# Iterating the columns directly avoids iterrows(), which builds a Series per
# row and does not preserve per-column dtypes.
predictions = [
    {'id': row_id, 'rating': predict_rating(user_id, book_id, ratings_matrix, item_similarity_df)}
    for row_id, book_id, user_id in zip(test['id'], test['book_id'], test['user_id'])
]

# Create a DataFrame with predictions and save to CSV.
# Columns are given explicitly so the header is written even for an empty test set.
submission = pd.DataFrame(predictions, columns=['id', 'rating'])
submission.to_csv('/kaggle/working/submission.csv', index=False)
