In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import Reader, Dataset, SVDpp, KNNWithMeans, SlopeOne
from surprise.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the training ratings, the movie table and the test pairs from the working directory
print("Loading data...")
train_df, movies_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'movies.csv', 'test.csv')
)

Loading data...


In [3]:
# Wrap the (userId, movieId, rating) columns of the training frame in a Surprise dataset.
# The reader declares the rating scale as 0.5 .. 5.0.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(
    df=train_df.loc[:, ['userId', 'movieId', 'rating']],
    reader=reader,
)

In [None]:
# Perform hyperparameter tuning
print("Performing hyperparameter tuning...")

# Seed NumPy's global RNG before the search so a Restart & Run All repeats the same result.
# NOTE(review): this assumes Surprise draws its fold shuffling and SVD++ initialisation from
# NumPy's global RNG when no random_state is passed — confirm for the installed version.
SEED = 42
np.random.seed(SEED)

# Every combination of these values is evaluated: 2 * 2 * 2 * 2 = 16 SVD++ configurations.
param_grid = {
    'n_factors': [50, 100],
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.02, 0.1]
}

# 3-fold cross-validation, scored by RMSE.
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=3)
gs.fit(data)

Performing hyperparameter tuning...


In [None]:
# Fit the three base models on the full training set
print("Training models...")
trainset = data.build_full_trainset()

# SVD++ estimator carrying the grid point with the best RMSE.
svd = gs.best_estimator['rmse']
# Item-based (user_based=False) neighbourhood model using pearson_baseline similarity.
knn = KNNWithMeans(
    k=40,
    min_k=2,
    sim_options={'name': 'pearson_baseline', 'user_based': False},
)
slope_one = SlopeOne()

svd.fit(trainset)
knn.fit(trainset)
slope_one.fit(trainset)

In [None]:
# Build the side features consumed by the ensemble
print("Preparing additional features...")

# TF-IDF encoding of each movie's genre string; missing genres are treated as ''.
movies_df['genres'] = movies_df['genres'].fillna('')
tfidf = TfidfVectorizer(token_pattern=r'\b\w+\b')
genre_matrix = tfidf.fit_transform(movies_df['genres'])

# Offsets of each user's / movie's mean rating from the global mean rating.
global_mean = train_df['rating'].mean()
user_biases = train_df.groupby('userId')['rating'].mean().sub(global_mean)
movie_biases = train_df.groupby('movieId')['rating'].mean().sub(global_mean)

# Memo of per-movie genre scores, so the scan over movies_df and the sparse-row slice
# happen once per movie instead of once per prediction. It must be cleared (or this
# block re-run) if movies_df or genre_matrix is rebuilt.
_genre_score_cache = {}


def _genre_score(item):
    """Return the mean TF-IDF genre value for `item`, or 0.0 if it has no row in movies_df."""
    if item not in _genre_score_cache:
        # Plain boolean ndarray mask for indexing the sparse matrix.
        row_mask = (movies_df['movieId'] == item).to_numpy()
        genre_features = genre_matrix[row_mask].toarray().flatten()
        # An unknown movie selects zero rows. The mean of an empty array is NaN, and
        # max(0.5, min(5.0, nan)) evaluates to 5.0, so guard the empty case explicitly.
        _genre_score_cache[item] = float(genre_features.mean()) if genre_features.size else 0.0
    return _genre_score_cache[item]


def ensemble_predict(user, item):
    """Blend the three fitted recommenders into one clamped rating estimate.

    Args:
        user: User id, as used in train_df['userId'].
        item: Movie id, as used in train_df['movieId'] / movies_df['movieId'].

    Returns:
        The average of the SVD++, KNN and SlopeOne estimates, plus the user's and the
        movie's mean-rating offsets (0 when unseen in training), plus 0.1 times the
        movie's mean TF-IDF genre value (0 when the movie is not in movies_df),
        clamped to the range [0.5, 5.0].
    """
    svd_pred = svd.predict(user, item).est
    knn_pred = knn.predict(user, item).est
    slope_one_pred = slope_one.predict(user, item).est

    # Series.get falls back to 0 for users / movies absent from the training data.
    user_bias = user_biases.get(user, 0)
    movie_bias = movie_biases.get(item, 0)

    genre_weight = 0.1

    ensemble_pred = (svd_pred + knn_pred + slope_one_pred) / 3
    # NOTE(review): the base estimates presumably already account for user/movie mean
    # effects, so adding the raw offsets again may double-count them — confirm on a
    # validation split before changing.
    final_pred = ensemble_pred + user_bias + movie_bias + genre_weight * _genre_score(item)

    return max(0.5, min(5.0, final_pred))

In [None]:
# Create test pairs
# Build the "<userId>_<movieId>" keys column-wise. A row-wise apply(axis=1) is slow and
# hands each row over as a Series with one common dtype, so a float column anywhere in
# test_df would turn the ids into "1.0_2.0", which the int() parse in
# get_predictions_batch rejects. The int64 cast keeps the keys integer-formatted.
test_pairs = set(
    test_df['userId'].astype('int64').astype(str)
    + '_'
    + test_df['movieId'].astype('int64').astype(str)
)

def get_predictions_batch(test_pairs, batch_size=1000):
    """Predict a rating for every "<userId>_<movieId>" key in `test_pairs`.

    Args:
        test_pairs: Iterable of "<userId>_<movieId>" strings; both ids must parse as int.
        batch_size: Number of pairs processed per progress-bar step.

    Returns:
        dict mapping each pair string to the value returned by `ensemble_predict`.
    """
    # Materialise once so the collection is not re-copied for every batch.
    pairs = list(test_pairs)
    predictions = {}
    for start in tqdm(range(0, len(pairs), batch_size)):
        for pair in pairs[start:start + batch_size]:
            user, movie = map(int, pair.split('_'))
            predictions[pair] = ensemble_predict(user, movie)
    return predictions

In [None]:
# Find optimal batch size
print("Finding optimal batch size...")
batch_sizes = [1000, 5000, 10000, 20000]
optimal_batch_size = 1000
min_time = float('inf')

# Use a subset for testing. Build it once, outside the timed region, so converting
# test_pairs to a list is not charged to every candidate.
timing_sample = list(test_pairs)[:10000]

for batch_size in batch_sizes:
    # perf_counter is a monotonic, high-resolution clock meant for measuring intervals;
    # time.time() is wall-clock time and can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    get_predictions_batch(timing_sample, batch_size=batch_size)
    end_time = time.perf_counter()
    elapsed_time = end_time - start_time
    print(f"Batch size {batch_size}: {elapsed_time:.2f} seconds")
    # Strict '<' keeps the earliest candidate when two timings tie.
    if elapsed_time < min_time:
        min_time = elapsed_time
        optimal_batch_size = batch_size

print(f"Optimal batch size: {optimal_batch_size}")

In [None]:
# Score every test pair, using the batch size selected by the timing run
print("Generating final predictions...")
predictions = get_predictions_batch(
    test_pairs=test_pairs,
    batch_size=optimal_batch_size,
)

In [None]:
# Turn the {pair: rating} dict into a two-column frame: 'Id' (the pair key) and 'rating'
print("Creating submission DataFrame...")
submission = (
    pd.DataFrame.from_dict(predictions, orient='index', columns=['rating'])
    .rename_axis('Id')
    .reset_index()
)

In [None]:
# Write the submission table to disk, leaving out the row index
print("Saving submission to CSV...")
submission.to_csv(path_or_buf='submission.csv', index=False)
print("Done!")
print("Done!")