# Task

1. Использовать dataset [MovieLens](https://grouplens.org/datasets/movielens/latest/)

2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:

TF-IDF на тегах и жанрах

Средние оценки (+ median, variance, etc.) пользователя и фильма

3. Оценить RMSE на тестовой выборке

# Load data

In [10]:
!wget "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

--2022-08-12 18:53:36--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip.1’


2022-08-12 18:53:36 (4.68 MB/s) - ‘ml-latest-small.zip.1’ saved [978202/978202]



In [11]:
import zipfile

z = zipfile.ZipFile('ml-latest-small.zip')
z.printdir()

File Name                                             Modified             Size
ml-latest-small/                               2018-09-26 15:50:12            0
ml-latest-small/links.csv                      2018-09-26 15:50:10       197979
ml-latest-small/tags.csv                       2018-09-26 15:49:40       118660
ml-latest-small/ratings.csv                    2018-09-26 15:49:38      2483723
ml-latest-small/README.txt                     2018-09-26 15:50:12         8342
ml-latest-small/movies.csv                     2018-09-26 15:49:56       494431


In [12]:
with zipfile.ZipFile('/content/ml-latest-small.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/')

In [13]:
import pandas as pd

ratings = pd.read_csv('ml-latest-small/ratings.csv')
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [14]:
ratings.shape

(100836, 4)

In [15]:
movies = pd.read_csv('ml-latest-small/movies.csv')
movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [16]:
movies.shape

(9742, 3)

In [17]:
tags = pd.read_csv('ml-latest-small/tags.csv')
tags.head(3)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992


In [18]:
tags.shape

(3683, 4)

# Preprocessing

In [19]:
tags['tag'] = tags['tag'] + ' '
tag_full = tags.groupby(['userId', 'movieId']).apply(lambda x: x.sum())
tag_full = tag_full['tag']
tag_full.head(3)

userId  movieId
2       60756           funny Highly quotable will ferrell 
        89774                   Boxing story MMA Tom Hardy 
        106782     drugs Leonardo DiCaprio Martin Scorsese 
Name: tag, dtype: object

In [20]:
df = ratings.merge(movies, how='left', left_on=['movieId'], right_on=['movieId'])
df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller


In [21]:
df.shape

(100836, 6)

In [22]:
df = df.merge(tags, how='left', left_on=['userId', 'movieId'], right_on=['userId', 'movieId'])
df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp_x,title,genres,tag,timestamp_y
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,,
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,,


In [23]:
df.shape

(102677, 8)

In [24]:
df = df.merge(tag_full, how='left', left_on=['userId', 'movieId'], right_index=True)
df.rename(columns = {'tag_x' : 'tag', 'tag_y' : 'tag_full',
                     'timestamp_x' : 'timestamp_rating', 'timestamp_y' : 'timestamp_tag'}, inplace = True)
df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp_rating,title,genres,tag,timestamp_tag,tag_full
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,,,
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,,,


In [25]:
df['tag'].fillna('', inplace=True)
df['tag_full'].fillna('', inplace=True)
df['timestamp_tag'].fillna(0, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102677 entries, 0 to 102676
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   userId            102677 non-null  int64  
 1   movieId           102677 non-null  int64  
 2   rating            102677 non-null  float64
 3   timestamp_rating  102677 non-null  int64  
 4   title             102677 non-null  object 
 5   genres            102677 non-null  object 
 6   tag               102677 non-null  object 
 7   timestamp_tag     102677 non-null  float64
 8   tag_full          102677 non-null  object 
dtypes: float64(2), int64(3), object(4)
memory usage: 7.8+ MB


In [26]:
df['genres'] = df['genres'].apply(lambda x: x.replace('|', ' '))
df.loc[:, 'genres_tag_full'] = df.loc[:, 'genres'] + ' ' + df.loc[:, 'tag_full']
df.loc[:, 'genres_tag'] = df.loc[:, 'genres'] + ' ' + df.loc[:, 'tag']
df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp_rating,title,genres,tag,timestamp_tag,tag_full,genres_tag_full,genres_tag
0,1,1,4.0,964982703,Toy Story (1995),Adventure Animation Children Comedy Fantasy,,0.0,,Adventure Animation Children Comedy Fantasy,Adventure Animation Children Comedy Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy Romance,,0.0,,Comedy Romance,Comedy Romance
2,1,6,4.0,964982224,Heat (1995),Action Crime Thriller,,0.0,,Action Crime Thriller,Action Crime Thriller


In [27]:
df.shape

(102677, 11)

In [28]:
from sklearn.model_selection import train_test_split

X = df.drop(['rating'], axis=1)
y = df['rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=X['userId'])
print(X_train.shape)
print(X_test.shape)

(92409, 10)
(10268, 10)


In [29]:
# каждому объекту ('userId', 'movieId') среднюю оценку за фильм и среднюю оценку юзера
users = pd.concat([X_train,y_train], axis=1).groupby('userId').mean('rating')
users = users['rating'].to_dict()

movies = pd.concat([X_train,y_train], axis=1).groupby('movieId').mean('rating')
movies = movies['rating'].to_dict()

In [30]:
X_train['users'] = X_train['userId'].apply(lambda x: users[x])
X_train['movies'] = X_train['movieId'].apply(lambda x: movies[x])
X_train.head(3)

Unnamed: 0,userId,movieId,timestamp_rating,title,genres,tag,timestamp_tag,tag_full,genres_tag_full,genres_tag,users,movies
51680,331,4973,1537157678,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",Comedy Romance,,0.0,,Comedy Romance,Comedy Romance,3.596026,4.194444
11416,68,6323,1158533404,Identity (2003),Crime Horror Mystery Thriller,,0.0,,Crime Horror Mystery Thriller,Crime Horror Mystery Thriller,3.246473,3.681818
84612,532,290,1025570521,Once Were Warriors (1994),Crime Drama,,0.0,,Crime Drama,Crime Drama,4.311111,3.875


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

tfidf = TfidfVectorizer()
X_train = pd.concat([pd.DataFrame(tfidf.fit_transform(X_train['genres_tag_full']).toarray(), 
                                  columns=tfidf.get_feature_names_out()).reset_index(drop=True), 
                     X_train['users'].reset_index(drop=True),
                     X_train['movies'].reset_index(drop=True)
                     ],
                    axis=1)
ss_train = StandardScaler()
X_train = ss_train.fit_transform(X_train)
y_train
print(X_train.shape)

X_test['users'] = X_test['userId'].apply(lambda x: users[x])
X_test['movies'] = X_test['movieId'].apply(lambda x: movies.get(x, 3.5))  # Add average for all movies
X_test = pd.concat([pd.DataFrame(tfidf.transform(X_test['genres_tag']).toarray(), 
                                columns=tfidf.get_feature_names_out()).reset_index(drop=True),
                    X_test['users'].reset_index(drop=True),
                    X_test['movies'].reset_index(drop=True)
                    ],
                   axis=1)
ss_test = StandardScaler()
X_test = ss_test.fit_transform(X_test)
y_test
print(X_test.shape)

(92409, 1683)
(10268, 1683)


# Modeling lightgbm

In [32]:
import lightgbm as lgb

lgbm = lgb.LGBMRegressor()
lgbm.fit(X_train, y_train)


LGBMRegressor()

In [33]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, lgbm.predict(X_test), squared=False)

0.8882448697693305

In [34]:
from sklearn import metrics

metrics.r2_score(y_test, lgbm.predict(X_test))

0.28543858060173843

# Modeling RandomForest

In [35]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train, y_train)


RandomForestRegressor()

In [36]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, rf.predict(X_test), squared=False)

0.9190290870347713

In [37]:
from sklearn import metrics

metrics.r2_score(y_test, rf.predict(X_test))

0.23505068112364114

# Modeling KNeighbors

In [38]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(X_train, y_train)


KNeighborsRegressor(n_neighbors=3)

In [39]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, knn.predict(X_test), squared=False)

1.000158246146784

In [40]:
from sklearn import metrics

metrics.r2_score(y_test, knn.predict(X_test))

0.09403464860721522