In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, KNNWithMeans
from surprise.model_selection import train_test_split
from surprise import accuracy

# Read the train / test checkpoints produced by an earlier step.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("checkpoint_train.csv", "checkpoint_test.csv")
)

# Free-text columns; everything else is kept in the num_* frames.
text_columns = ['overview', 'tagline', 'title', 'all_keywords']
num_train, num_test = (
    frame.drop(columns=text_columns) for frame in (train_df, test_df)
)

# Missing text becomes an empty string so the vectorizer only ever sees str.
for frame in (train_df, test_df):
    frame[text_columns] = frame[text_columns].fillna("").astype(str)

# Learn the TF-IDF vocabulary on the training keywords only, then project
# both splits into that same feature space.
vectorizer = TfidfVectorizer().fit(train_df['all_keywords'])
tfidf_train_matrix, tfidf_test_matrix = (
    vectorizer.transform(frame['all_keywords']) for frame in (train_df, test_df)
)

# Pairwise cosine similarity between every pair of training rows
# (one row and one column per row of train_df).
cosine_sim = cosine_similarity(tfidf_train_matrix)

# Prepare data for collaborative filtering.
# The Reader is told the observed rating range of the training data.
rating_bounds = (train_df['rating'].min(), train_df['rating'].max())
reader = Reader(rating_scale=rating_bounds)
data = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], reader)

# Hold out 20% of the ratings so the model can be scored on unseen pairs.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# User-based KNN with mean-centering and pearson_baseline similarity.
knn = KNNWithMeans(sim_options={'name': 'pearson_baseline', 'user_based': True})
knn.fit(train_data)

# Evaluate model on validation data.
# accuracy.rmse prints "RMSE: ..." on its own unless verbose=False, which
# previously made the score appear twice in the output.
val_predictions = knn.test(val_data)
print("Validation RMSE:", accuracy.rmse(val_predictions, verbose=False))

# The validation split has served its purpose: refit on every available
# rating so the model used for the final predictions does not ignore the
# 20% that was held out above.
knn.fit(data.build_full_trainset())

# Prepare test data for predictions
test_df['userId'] = test_df['userId'].astype(int)
test_df['movieId'] = test_df['movieId'].astype(int)

# Positional index of the first training row of each movie. Positions (not
# index labels) are what cosine_sim and .iloc expect.
first_row_of_movie = {}
for row_position, train_movie_id in enumerate(train_df['movieId'].tolist()):
    first_row_of_movie.setdefault(train_movie_id, row_position)

# Fallback content-based estimate for movies absent from the training data.
global_mean_rating = train_df['rating'].mean()

# The content-based estimate depends only on the movie, so it is computed
# once per distinct movieId instead of once per test row.
content_pred_by_movie = {}

# Predict ratings using hybrid approach
predictions = []
for userId, movieId in zip(test_df['userId'].tolist(), test_df['movieId'].tolist()):
    # Get collaborative filtering prediction
    cf_pred = knn.predict(userId, movieId).est

    # Get content-based prediction (memoized per movie)
    if movieId not in content_pred_by_movie:
        movie_index = first_row_of_movie.get(movieId)
        if movie_index is None:
            content_pred_by_movie[movieId] = global_mean_rating
        else:
            # Stable argsort on the negated row ranks rows by descending
            # similarity and keeps ties in ascending row order, the same
            # order a stable reverse sort of (index, score) pairs gives.
            ranked_rows = (-cosine_sim[movie_index]).argsort(kind='stable')
            # Drop the top-ranked row (expected to be the movie's own row)
            # and average the ratings of the next ten.
            top_similar_indices = ranked_rows[1:11]
            content_pred_by_movie[movieId] = (
                train_df['rating'].iloc[top_similar_indices].mean()
            )
    cb_pred = content_pred_by_movie[movieId]

    # Combine predictions with equal weight
    hybrid_pred = 0.5 * cf_pred + 0.5 * cb_pred
    predictions.append(hybrid_pred)

# Attach the predicted ratings to the test rows.
test_df['rating'] = predictions

# Build the "<userId>_<movieId>" key for each row of the output file.
id_columns = ['userId', 'movieId']
test_df[id_columns] = test_df[id_columns].astype(str)
test_df['userId_movieId'] = [
    f"{user}_{movie}"
    for user, movie in zip(test_df['userId'], test_df['movieId'])
]

# Keep only the two columns that are written out, in this order.
cols = ['userId_movieId', 'rating']
test_df = test_df[cols]

# Save predictions to a CSV file (no index column).
submission_path = 'submission_tfidf_justkeywords_hybrid_recommendation.csv'
test_df.to_csv(submission_path, index=False)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.1912
Validation RMSE: 0.19116012114984435
