The goal is to build recommendations based on features such as TF-IDF on tags and genres, as well as on average ratings of users and films. I will evaluate the RMSE on the test set.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [None]:
# Load the three input tables from CSV files in the working directory.
ratings, movies, tags = map(
    pd.read_csv,
    ('ratings.csv', 'movies.csv', 'tags.csv'),
)

In [None]:
# Build one whitespace-joined tag string per movie.
#
# Missing tags are blanked *before* casting to str: astype(str) alone would
# turn a missing value into the literal text 'nan', which would then be
# tokenized as if it were a real tag.
tags['tag'] = tags['tag'].fillna('').astype(str)

# Result columns: 'movieId' and 'tag' (all of that movie's tags joined by spaces).
movie_tags = tags.groupby('movieId')['tag'].agg(' '.join).reset_index()

In [None]:
# Attach each movie's joined tag text; the left merge keeps every movie, and
# movies that have no tag row end up with an empty string.
movies = pd.merge(movies, movie_tags, how='left', on='movieId')
movies['tag'] = movies['tag'].fillna('')

# One text field per movie: the genres string, a single space, then the tag text.
movies['genres_tags'] = movies['genres'].str.cat(movies['tag'], sep=' ')

In [None]:
# Hold out 20% of the rating rows for evaluation; the fixed seed makes the
# split reproducible between runs.
train_ratings, test_ratings = train_test_split(
    ratings,
    test_size=0.2,
    random_state=42,
)

In [None]:
# TF-IDF over the combined genre/tag text, with English stop words removed.
genres_tags_text = movies['genres_tags']
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and produce the (movies x terms) sparse weight matrix.
tfidf_matrix = tfidf.fit_transform(genres_tags_text)

In [None]:
# Densify the TF-IDF matrix into a frame with one row per movie, indexed by
# movieId; columns keep their default integer labels (one per vocabulary term).
dense_weights = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(data=dense_weights, index=movies['movieId'])

# Append the TF-IDF columns to `movies` by matching its movieId column against
# tfidf_df's index.
# NOTE(review): the model cells below only use the six user/movie rating
# statistics, so these TF-IDF columns are currently not fed to the model.
movies = movies.join(tfidf_df, on='movieId', how='left')

In [None]:
# Per-user rating statistics, computed from the training split only so the
# held-out ratings do not influence the features.
#
# Named aggregation ties each output column name to its aggregation function,
# instead of relabelling columns by position afterwards (which would silently
# mislabel them if the aggregation list were ever reordered).
# Result columns: userId, user_mean, user_median, user_var.
user_stats = (
    train_ratings.groupby('userId')['rating']
    .agg(user_mean='mean', user_median='median', user_var='var')
    .reset_index()
)

In [None]:
# Per-movie rating statistics, computed from the training split only so the
# held-out ratings do not influence the features.
#
# Named aggregation ties each output column name to its aggregation function,
# instead of relabelling columns by position afterwards (which would silently
# mislabel them if the aggregation list were ever reordered).
# Result columns: movieId, movie_mean, movie_median, movie_var.
movie_stats = (
    train_ratings.groupby('movieId')['rating']
    .agg(movie_mean='mean', movie_median='median', movie_var='var')
    .reset_index()
)

In [None]:
# Add the per-user and then the per-movie statistics to every training row.
# Left merges keep all rating rows and preserve their order.
train_ratings = (
    train_ratings
    .merge(user_stats, how='left', on='userId')
    .merge(movie_stats, how='left', on='movieId')
)

In [None]:
# Add the training-derived user and movie statistics to every test row.
# Left merges keep all test rows; a user or movie that has no row in the
# statistics tables gets NaN in the corresponding columns.
with_user_stats = pd.merge(test_ratings, user_stats, how='left', on='userId')
test_ratings = pd.merge(with_user_stats, movie_stats, how='left', on='movieId')

In [None]:
# Training design matrix: the six user/movie rating statistics.
# Every training row has a matching user and movie aggregate, so the only NaNs
# here are variances of groups with a single rating (sample variance is
# undefined for one observation); those are treated as zero spread.
# NOTE(review): the aggregates are computed from these same rows, so each
# row's own rating contributes to its features; consider out-of-fold or
# leave-one-out statistics to reduce this leakage.
X_train = train_ratings[['user_mean', 'user_median', 'user_var', 'movie_mean', 'movie_median', 'movie_var']].fillna(0)

# Regression target. (A second, identical assignment of y_train in a separate
# cell was redundant and has been removed.)
y_train = train_ratings['rating']

In [None]:
# Test design matrix: the same six statistics, taken from the training-derived
# aggregates that were merged onto the test rows.
test_feature_cols = ['user_mean', 'user_median', 'user_var', 'movie_mean', 'movie_median', 'movie_var']

# A user or movie that never appears in the training split has no aggregates,
# so its columns are NaN after the left merge. Filling mean/median with 0 would
# tell the model that such a user/movie has an average rating of 0 and drag the
# prediction down; fall back to the global training mean/median instead.
# Variance stays 0 (no spread information), as before.
global_rating_mean = train_ratings['rating'].mean()
global_rating_median = train_ratings['rating'].median()
cold_start_fill = {
    'user_mean': global_rating_mean,
    'movie_mean': global_rating_mean,
    'user_median': global_rating_median,
    'movie_median': global_rating_median,
    'user_var': 0,
    'movie_var': 0,
}

X_test = test_ratings[test_feature_cols].fillna(cold_start_fill)
y_test = test_ratings['rating']

In [None]:
# Ridge regression with scikit-learn's default settings, fitted on the
# training features and ratings.
model = Ridge()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict ratings for the held-out rows.
y_pred = model.predict(X_test)

# Root-mean-squared error between the true and predicted test ratings.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(test_mse)

# Bare expression so the notebook displays the value.
rmse

1.0593731283481531