In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Function to predict user-based rating
def predict_user_based_rating(user_id, book_id, user_item_matrix, user_similarity_df):
    """Predict a rating as a similarity-weighted average over other users.

    Every user with a non-null rating for ``book_id`` contributes that rating,
    weighted by their similarity to ``user_id``.

    Args:
        user_id: Label of the target user (looked up in ``user_similarity_df``).
        book_id: Column label of the target book in ``user_item_matrix``.
        user_item_matrix: DataFrame of ratings, users on the index and books
            on the columns, with NaN where no rating exists.
        user_similarity_df: Square DataFrame of user-to-user similarities,
            labelled by user on both axes.

    Returns:
        ``sum(sim * rating) / sum(|sim|)`` over the users who rated the book,
        or ``np.nan`` when the book is not a column of the matrix, nobody
        rated it, the user has no similarity row, or all weights sum to zero.
    """
    if book_id not in user_item_matrix.columns:
        return np.nan

    book_column = user_item_matrix.loc[:, book_id]
    observed = book_column[book_column.notna()]
    if observed.empty:
        return np.nan

    # A user without a similarity row has zero weight towards everyone, so
    # there is nothing to average.
    if user_id not in user_similarity_df.index:
        return np.nan

    neighbour_weights = user_similarity_df.loc[user_id][observed.index]
    total_weight = np.abs(neighbour_weights).sum()
    if total_weight == 0:
        return np.nan

    return np.dot(neighbour_weights, observed) / total_weight

# Function to predict item-based rating
def predict_item_based_rating(user_id, book_id, user_item_matrix, item_similarity_df):
    """Predict a rating as a similarity-weighted average over the user's own ratings.

    Every book the user has a non-null rating for contributes that rating,
    weighted by its similarity to ``book_id``.

    Args:
        user_id: Index label of the target user in ``user_item_matrix``.
        book_id: Label of the target book (looked up in ``item_similarity_df``).
        user_item_matrix: DataFrame of ratings, users on the index and books
            on the columns, with NaN where no rating exists.
        item_similarity_df: Square DataFrame of book-to-book similarities,
            labelled by book on both axes.

    Returns:
        ``sum(sim * rating) / sum(|sim|)`` over the books the user rated, or
        ``np.nan`` when the user is not in the matrix, has rated nothing, the
        book has no similarity row, or all weights sum to zero.
    """
    if user_id not in user_item_matrix.index:
        return np.nan

    user_row = user_item_matrix.loc[user_id]
    observed = user_row[user_row.notna()]
    if observed.empty:
        return np.nan

    # A book without a similarity row has zero weight towards every other
    # book, so there is nothing to average.
    if book_id not in item_similarity_df.index:
        return np.nan

    neighbour_weights = item_similarity_df.loc[book_id][observed.index]
    total_weight = np.abs(neighbour_weights).sum()
    if total_weight == 0:
        return np.nan

    return np.dot(neighbour_weights, observed) / total_weight

# Combine user-based and item-based predictions
def predict_combined_rating(user_id, book_id, user_item_matrix, user_similarity_df, item_similarity_df, alpha=0.5):
    """Blend the user-based and item-based predictions for one (user, book) pair.

    Args:
        user_id: Target user label.
        book_id: Target book label.
        user_item_matrix: Ratings DataFrame passed through to both predictors.
        user_similarity_df: User-to-user similarities for the user-based predictor.
        item_similarity_df: Book-to-book similarities for the item-based predictor.
        alpha: Weight given to the user-based prediction; the item-based
            prediction receives ``1 - alpha``.

    Returns:
        ``alpha * user_based + (1 - alpha) * item_based`` when both predictors
        produce a value; the single available prediction when only one does;
        ``np.nan`` when neither does.
    """
    user_pred = predict_user_based_rating(user_id, book_id, user_item_matrix, user_similarity_df)
    item_pred = predict_item_based_rating(user_id, book_id, user_item_matrix, item_similarity_df)

    user_missing = np.isnan(user_pred)
    item_missing = np.isnan(item_pred)

    if user_missing:
        # Fall back to the item-based value, unless that is missing as well.
        return np.nan if item_missing else item_pred
    if item_missing:
        return user_pred

    # Both predictors produced a value: weighted average of the two.
    return alpha * user_pred + (1 - alpha) * item_pred

In [3]:
# Load the training ratings; the user_id, book_id and rating columns are used below.
train_data = pd.read_csv('/kaggle/input/dis-project-2-recommender-systems-f2024/train.csv')

# Build the user x book rating matrix from the full training set. Pairs with
# no rating stay NaN. Note: pivot raises ValueError if a (user_id, book_id)
# pair appears more than once.
user_item_matrix_train = train_data.pivot(index='user_id', columns='book_id', values='rating')
item_user_matrix_train = user_item_matrix_train.T  # book x user view of the same ratings

# Dense, NaN-free copies used ONLY for the similarity computation. Missing
# entries are imputed with the mean of the row they belong to (not with 0):
#   - user rows -> that user's mean rating
#   - book rows -> that book's mean rating
# DataFrame.where broadcasts the per-row means along axis=0 in one vectorized
# step instead of calling a Python lambda once per row.
user_item_filled_train = user_item_matrix_train.where(
    user_item_matrix_train.notna(),
    user_item_matrix_train.mean(axis=1),
    axis=0,
)
item_user_filled_train = item_user_matrix_train.where(
    item_user_matrix_train.notna(),
    item_user_matrix_train.mean(axis=1),
    axis=0,
)

# Pairwise cosine similarity between user rows and between book rows.
user_similarity = cosine_similarity(user_item_filled_train)
item_similarity = cosine_similarity(item_user_filled_train)

# Wrap the raw arrays in DataFrames labelled by user_id / book_id on both axes
# so the prediction functions can look similarities up by label.
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix_train.index, columns=user_item_matrix_train.index)
item_similarity_df = pd.DataFrame(item_similarity, index=item_user_matrix_train.index, columns=item_user_matrix_train.index)

In [4]:
test_data = pd.read_csv('/kaggle/input/dis-project-2-recommender-systems-f2024/test.csv')
alpha_value = 0.3  # Weight of the user-based prediction (item-based gets 1 - alpha); modify as needed

# Fallback for pairs where neither predictor returns a value. It does not
# depend on the test row, so compute it once instead of on every miss inside
# the loop.
global_mean_rating = train_data['rating'].mean()

predictions = []
# Iterate over the needed columns directly rather than with iterrows():
# iterrows() builds a Series per row and upcasts all values to a common dtype,
# which would turn integer ids into floats if test.csv had any float column.
for row_id, user_id, book_id in zip(test_data['id'], test_data['user_id'], test_data['book_id']):
    predicted_rating = predict_combined_rating(user_id, book_id, user_item_matrix_train, user_similarity_df, item_similarity_df, alpha=alpha_value)

    if np.isnan(predicted_rating):  # Handle missing predictions with a fallback
        predicted_rating = global_mean_rating  # Use the global mean rating as a fallback

    predictions.append({'id': row_id, 'rating': predicted_rating})

# Convert to DataFrame format and save as submission file. Passing the column
# names explicitly keeps the CSV header even when the test set is empty.
predictions_df = pd.DataFrame(predictions, columns=['id', 'rating'])
predictions_df.to_csv('/kaggle/working/submission.csv', index=False)

print("The submission file 'submission.csv' has been created.")

The submission file 'submission.csv' has been created.
