In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
df1 = pd.read_csv('ratings.csv')
df2 = pd.read_csv('movies.csv')
df = df1.merge(df2, on='movieId')[['userId', 'title', 'rating', 'genres']]
df

Unnamed: 0,userId,title,rating,genres
0,1,Dangerous Minds (1995),2.5,Drama
1,7,Dangerous Minds (1995),3.0,Drama
2,31,Dangerous Minds (1995),4.0,Drama
3,32,Dangerous Minds (1995),4.0,Drama
4,36,Dangerous Minds (1995),3.0,Drama
...,...,...,...,...
99556,664,War of the Worlds (2005),2.5,Action|Sci-Fi
99557,664,"Box, The (2009)",3.5,Drama|Horror|Mystery|Sci-Fi|Thriller
99558,665,Pie in the Sky (1996),3.0,Comedy|Romance
99559,665,Summer Catch (2001),1.0,Comedy|Drama|Romance


In [3]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_genres = tfidf_vectorizer.fit_transform(df['genres'])
user_stats = df.groupby('userId')['rating'].agg(['mean', 'median', 'var']).reset_index()
movie_stats = df.groupby('title')['rating'].agg(['mean', 'median', 'var']).reset_index()
df = df.merge(user_stats, on='userId', how='left').merge(movie_stats, on='title', how='left').fillna(0)
df

Unnamed: 0,userId,title,rating,genres,mean_x,median_x,var_x,mean_y,median_y,var_y
0,1,Dangerous Minds (1995),2.5,Drama,2.550000,2.5,0.786842,3.178571,3.0,0.705139
1,7,Dangerous Minds (1995),3.0,Drama,3.465909,3.0,0.872388,3.178571,3.0,0.705139
2,31,Dangerous Minds (1995),4.0,Drama,4.166667,4.5,0.299020,3.178571,3.0,0.705139
3,32,Dangerous Minds (1995),4.0,Drama,3.666667,4.0,0.865248,3.178571,3.0,0.705139
4,36,Dangerous Minds (1995),3.0,Drama,3.615385,3.5,0.588499,3.178571,3.0,0.705139
...,...,...,...,...,...,...,...,...,...,...
99556,664,War of the Worlds (2005),2.5,Action|Sci-Fi,3.799419,4.0,0.332019,2.841463,3.0,0.842988
99557,664,"Box, The (2009)",3.5,Drama|Horror|Mystery|Sci-Fi|Thriller,3.799419,4.0,0.332019,3.500000,3.5,0.000000
99558,665,Pie in the Sky (1996),3.0,Comedy|Romance,3.286374,3.0,0.899281,3.000000,3.0,0.000000
99559,665,Summer Catch (2001),1.0,Comedy|Drama|Romance,3.286374,3.0,0.899281,1.000000,1.0,0.000000


In [4]:
X = pd.concat([pd.DataFrame(tfidf_genres.toarray()), df[['mean_x', 'median_x', 'var_x', 'mean_y', 'median_y', 'var_y']]], axis=1)
X.columns = X.columns.astype(str)
y = df['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression().fit(X_train, y_train)

In [5]:
# смотрим разницу факта и прогноза
result = pd.DataFrame([np.array(y_test), model.predict(X_test)]).T
result.columns = ['y_test', 'y_pred']
result.head()

Unnamed: 0,y_test,y_pred
0,4.0,3.795165
1,4.0,3.455437
2,3.5,3.69726
3,2.0,3.091383
4,3.0,3.277907


In [6]:
# Оценка точности предсказания модели
rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
rmse

0.8339902106576605