In [40]:
# Import our regular old heroes
import numpy as np
import pandas as pd
import scipy as sp # <-- The sister of Numpy, used in our code for numerical efficiency.
import matplotlib.pyplot as plt
import seaborn as sns

# Entity featurization and similarity computation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Libraries used during sorting procedures.
import operator # <-- Convenient item retrieval during iteration
import heapq # <-- Efficient sorting of large lists

# Imported for our sanity
import warnings
warnings.filterwarnings('ignore')

In [41]:
# Read the training ratings from disk and preview the first five rows.
anime_ratings = pd.read_csv(filepath_or_buffer='train_cleaned.csv')
anime_ratings.head(n=5)

Unnamed: 0,user_id,anime_id,rating
0,1,11617.0,10.0
1,1,11757.0,10.0
2,1,15451.0,10.0
3,2,11771.0,10.0
4,3,20.0,8.0


 Normalize Ratings Using MinMaxScaler

In [42]:
from sklearn.preprocessing import MinMaxScaler

# Preserve the ratings exactly as loaded the first time this cell runs.
# The 'rating' column is overwritten below, so without this copy a second
# run of the cell would fit the scaler on already-scaled values and
# scaler.data_min_ / scaler.data_max_ would no longer describe the original
# rating range.
if 'rating_raw' not in anime_ratings.columns:
    anime_ratings['rating_raw'] = anime_ratings['rating']

# Assuming your DataFrame has columns 'user_id', 'anime_id', and 'rating'
scaler = MinMaxScaler()

# Normalize the 'rating' column to a range [0, 1], always fitting on the
# preserved original values so the cell gives the same result on every run.
anime_ratings['rating'] = scaler.fit_transform(anime_ratings[['rating_raw']]).ravel()

Create a User-Item Interaction Matrix:

Format your data into a matrix where rows represent users, columns represent items, and values represent ratings.

In [43]:
# Wide user x anime table of ratings: one row per user_id, one column per
# anime_id. Pairs with no rating come out of the pivot as NaN and are
# replaced with 0.
user_item_matrix = (
    anime_ratings
    .pivot(index='user_id', columns='anime_id', values='rating')
    .fillna(0)
)


Using scikit-surprise Library:

This library simplifies collaborative filtering with different algorithms.

In [4]:
!pip install scikit-surprise


Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357245 sha256=ad8d6a038df2e77e11e31303ab09d90a2c746653ea5746ef93c40536efcb00a4
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Succe

Build the Collaborative Filtering Model

In [44]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate
from surprise import KNNBasic, SVD, SVDpp

# Keep only complete (user, item, rating) rows before handing them to Surprise.
# NOTE(review): the submission preview further down shows every prediction at
# the top of the rating scale, which is consistent with NaN ratings reaching
# the model -- confirm against the data. Dropping incomplete rows is a no-op
# when the file is already clean.
ratings_for_surprise = anime_ratings[['user_id', 'anime_id', 'rating']].dropna()
n_incomplete = len(anime_ratings) - len(ratings_for_surprise)
if n_incomplete:
    print(f"Dropped {n_incomplete} rows with a missing user_id, anime_id or rating")

# Define the data format: ratings were min-max scaled to [0, 1] earlier
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(ratings_for_surprise, reader)

# Split the data into training and test sets (80/20, fixed seed)
trainset, testset = train_test_split(data, test_size=0.20, random_state=42)


Choose a Collaborative Filtering Algorithm:

Matrix Factorization (SVD)

In [45]:
from surprise import SVD

# Seed the factor initialisation and SGD ordering so that re-running the
# notebook reproduces the same model and the same reported RMSE.
model_svd = SVD(random_state=42)
model_svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f73d6e9db70>

Make Predictions with Collaborative Filtering

In [46]:
# Score the fitted model on the held-out 20% split
predictions_svd = model_svd.test(testset)

# Evaluate the model
from surprise import accuracy

# accuracy.rmse prints the score itself by default, which made the value show
# up twice in the cell output; verbose=False leaves only the labelled line.
print(f"SVD RMSE: {accuracy.rmse(predictions_svd, verbose=False)}")

RMSE: 0.3017
SVD RMSE: 0.30168452344698987


In [39]:
# Read the (user_id, anime_id) pairs that need a predicted rating and preview
# the first five rows.
test_data = pd.read_csv(filepath_or_buffer='test_cleaned.csv')
test_data.head(n=5)

Unnamed: 0,user_id,anime_id
0,40763,21405
1,68791,10504
2,40487,1281
3,55290,165
4,72323,11111


In [48]:
# Ensure the user_id and anime_id columns are in the correct data types
test_data['user_id'] = test_data['user_id'].astype(int)
test_data['anime_id'] = test_data['anime_id'].astype(int)

# Convert the test DataFrame to Surprise format: (user, item, true_rating)
# triples, with None as the rating because these pairs are unlabelled.
# The list gets its own name so the labelled hold-out `testset` produced by
# train_test_split is not overwritten -- RMSE cannot be computed on None
# ratings, so later evaluation cells still need the labelled split.
# zip over the two columns avoids the per-row Series that iterrows() builds.
submission_testset = [
    (user_id, anime_id, None)
    for user_id, anime_id in zip(test_data['user_id'], test_data['anime_id'])
]

# Generate predictions
predictions_svd = model_svd.test(submission_testset)

In [54]:
# Extract relevant information from predictions
def extract_predictions(predictions):
    """Reduce each prediction record to a (uid, iid, est) tuple.

    Args:
        predictions: iterable of 5-field records that unpack as
            (uid, iid, true_r, est, details).

    Returns:
        list of (uid, iid, est) tuples in input order; the true rating and
        the trailing details field are discarded.
    """
    return [(uid, iid, est) for uid, iid, _true_r, est, _details in predictions]

# Tabulate the (user, anime, estimate) triples produced by the model.
df_predictions_svd = pd.DataFrame(
    extract_predictions(predictions_svd),
    columns=['user_id', 'anime_id', 'predicted_rating'],
)

# Submission key of the form "<user_id>_<anime_id>"
df_predictions_svd['ID'] = (
    df_predictions_svd['user_id'].astype(str)
    + '_'
    + df_predictions_svd['anime_id'].astype(str)
)

# Bounds recorded by the fitted scaler for its single input column
min_rating = scaler.data_min_[0]
max_rating = scaler.data_max_[0]

# Undo the min-max scaling: map estimates from [0, 1] back onto
# [min_rating, max_rating]
df_predictions_svd['rating'] = (
    df_predictions_svd['predicted_rating'] * (max_rating - min_rating) + min_rating
)



In [56]:
# Keep just the two columns written to the submission file
submission_df = df_predictions_svd[['ID', 'rating']]

# Preview the first five rows of the submission DataFrame
submission_df.head(n=5)


Unnamed: 0,ID,rating
0,40763_21405,10.0
1,68791_10504,10.0
2,40487_1281,10.0
3,55290_165,10.0
4,72323_11111,10.0


In [57]:
# Write the submission to disk without the DataFrame index column
submission_df.to_csv(path_or_buf='svd_predictions_submission.csv', index=False)

Parameter Tuning

SVD Tuning

For SVD, you can tune parameters such as n_factors (number of latent factors), n_epochs (number of training epochs), lr_all (learning rate applied to all parameters), and reg_all (regularization strength applied to all parameters).

In [58]:
from surprise.model_selection import GridSearchCV, KFold

# Define the parameter grid: 3 x 3 x 3 x 3 = 81 combinations, each fitted once
# per fold (405 SVD fits with 5 folds), so this cell is expensive.
# random_state is a single-value entry that is forwarded to SVD so every
# candidate is initialised from the same seed.
param_grid = {
    'n_factors': [20, 50, 100],
    'n_epochs': [20, 30, 50],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.02, 0.1, 0.4],
    'random_state': [42],
}

# Seeded, shuffled 5-fold split so repeated runs compare candidates on the
# same folds and report the same best score.
cv_folds = KFold(n_splits=5, random_state=42, shuffle=True)

# Perform grid search
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=cv_folds)
gs.fit(data)

# Print the best score and the corresponding hyperparameters
print(f'Best RMSE: {gs.best_score["rmse"]}')
print(f'Best hyperparameters: {gs.best_params["rmse"]}')




In [None]:
# Use the best model found by the grid search
best_model = gs.best_estimator['rmse']

# Rebuild the labelled 80/20 split from `data` with the same seed as before.
# By this point the name `testset` has been reassigned to the unlabelled
# submission pairs (true rating None), and RMSE cannot be computed on those,
# so the evaluation uses its own explicitly labelled hold-out instead.
holdout_trainset, holdout_testset = train_test_split(data, test_size=0.20, random_state=42)
best_model.fit(holdout_trainset)

# Generate predictions using the best model
predictions_svd = best_model.test(holdout_testset)

# Evaluate the model; verbose=False because accuracy.rmse otherwise prints
# the score a second time on its own line.
print(f"SVD RMSE after tuning: {accuracy.rmse(predictions_svd, verbose=False)}")