## Домашнее задание по теме   
## «Рекомендации на основе содержания»

##### 1. Использовать датасет [MovieLens](https://grouplens.org/datasets/movielens/latest/).

##### 2.Построить рекомендации (регрессия, предсказываем оценку) на фичах:
* TF-IDF на тегах и жанрах;
* средние оценки (+ median, variance и т. д.) пользователя и фильма.

##### 3. Оценить RMSE на тестовой выборке.

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
import matplotlib.pyplot as plt

In [2]:
links = pd.read_csv('ml-latest-small/links.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
ratings['dt'] = ratings['timestamp'].apply(lambda t: datetime.fromtimestamp(t))
ratings['month'] = ratings.dt.dt.month
ratings['year'] = ratings.dt.dt.year
ratings.drop(columns = ['timestamp', 'dt'], inplace=True)

In [9]:
print(f' number of users = {ratings.userId.nunique()},\
      \n number of movies = {ratings.movieId.nunique()}')

 number of users = 610,      
 number of movies = 9724


In [11]:
userId_median = ratings.groupby('userId',as_index=False).median('rating')[['userId','rating']].\
                        rename(columns={'rating':'rating_user_median'})
movieId_median = ratings.groupby('movieId',as_index=False).median('rating')[['movieId','rating']].\
                        rename(columns={'rating':'rating_movie_median'})
userId_mean = ratings.groupby('userId',as_index=False).mean('rating')[['userId','rating']].\
                        rename(columns={'rating':'rating_user_mean'})
movieId_mean = ratings.groupby('movieId',as_index=False).mean('rating')[['movieId','rating']].\
                        rename(columns={'rating':'rating_movie_mean'})

In [13]:
ratings = ratings.merge(userId_median, on='userId', how='left').\
            merge(movieId_median, on='movieId', how='left').\
            merge(userId_mean, on='userId', how='left').\
            merge(movieId_mean, on='movieId', how='left')

In [15]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   userId               100836 non-null  int64  
 1   movieId              100836 non-null  int64  
 2   rating               100836 non-null  float64
 3   month                100836 non-null  int32  
 4   year                 100836 non-null  int32  
 5   rating_user_median   100836 non-null  float64
 6   rating_movie_median  100836 non-null  float64
 7   rating_user_mean     100836 non-null  float64
 8   rating_movie_mean    100836 non-null  float64
dtypes: float64(5), int32(2), int64(2)
memory usage: 6.2 MB


In [17]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [19]:
tags['tag']=tags['tag']+'|'

In [21]:
tags = tags.groupby(['userId', 'movieId'], as_index=False).agg({'tag':'sum'})

In [23]:
tags.head()

Unnamed: 0,userId,movieId,tag
0,2,60756,funny|Highly quotable|will ferrell|
1,2,89774,Boxing story|MMA|Tom Hardy|
2,2,106782,drugs|Leonardo DiCaprio|Martin Scorsese|
3,7,48516,way too long|
4,18,431,Al Pacino|gangster|mafia|


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
def change_string_tag(s):
    return ' '.join(s.replace('-', '').split('|')).lower()

In [29]:
tags_tags = [change_string_tag(g) for g in tags.tag.values]

In [31]:
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(tags_tags)
X_train_tfidf

<1775x1748 sparse matrix of type '<class 'numpy.float64'>'
	with 5350 stored elements in Compressed Sparse Row format>

In [33]:
tags_tfidf = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf.get_feature_names_out())

In [35]:
tags_tfidf[['userId','movieId']]=tags[['userId','movieId']]

In [37]:
tags_tfidf.head()

Unnamed: 0,06,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,250,...,younger,your,zellweger,zither,zoe,zombie,zombies,zooey,userId,movieId
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,60756
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,89774
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,106782
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,48516
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18,431


In [39]:
ratings = ratings.merge(tags_tfidf, on = ['userId', 'movieId'], how = 'left')

In [41]:
ratings

Unnamed: 0,userId,movieId,rating,month,year,rating_user_median,rating_movie_median,rating_user_mean,rating_movie_mean,06,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,1,1,4.0,7,2000,5.0,4.0,4.366379,3.920930,,...,,,,,,,,,,
1,1,3,4.0,7,2000,5.0,3.0,4.366379,3.259615,,...,,,,,,,,,,
2,1,6,4.0,7,2000,5.0,4.0,4.366379,3.946078,,...,,,,,,,,,,
3,1,47,5.0,7,2000,5.0,4.0,4.366379,3.975369,,...,,,,,,,,,,
4,1,50,5.0,7,2000,5.0,4.5,4.366379,4.237745,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,5,2017,3.5,4.0,3.688556,3.333333,,...,,,,,,,,,,
100832,610,168248,5.0,5,2017,3.5,4.0,3.688556,4.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100833,610,168250,5.0,5,2017,3.5,4.0,3.688556,3.633333,,...,,,,,,,,,,
100834,610,168252,5.0,5,2017,3.5,4.5,3.688556,4.280000,,...,,,,,,,,,,


In [43]:
ratings = ratings.fillna(0)

In [16]:
#ratings.drop_duplicates(['userId','movieId'], inplace=True)

In [45]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [47]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [49]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|')).lower()

In [51]:
movie_genres = [change_string(g) for g in movies.genres.values]
#movie_genres[:5]

In [53]:
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(movie_genres)
X_train_tfidf

<9742x20 sparse matrix of type '<class 'numpy.float64'>'
	with 22084 stored elements in Compressed Sparse Row format>

In [55]:
movie_tfidf = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf.get_feature_names_out())

In [57]:
movie_tfidf['movieId']=movies['movieId']

In [59]:
ratings = ratings.merge(movie_tfidf, on='movieId', how='left')

In [31]:
#ratings['tag'] = ratings['tag'].fillna('')

In [67]:
df = ratings #pd.concat([ratings, tags_tfidf], axis =1)

In [142]:
#del df['tag']

In [71]:
df.head()

Unnamed: 0,userId,movieId,rating,month,year,rating_user_median,rating_movie_median,rating_user_mean,rating_movie_mean,06,...,horror_y,imax,musical,mystery_y,nogenreslisted,romance_y,scifi_y,thriller_y,war_y,western_y
0,1,1,4.0,7,2000,5.0,4.0,4.366379,3.92093,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3,4.0,7,2000,5.0,3.0,4.366379,3.259615,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
2,1,6,4.0,7,2000,5.0,4.0,4.366379,3.946078,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.542042,0.0,0.0
3,1,47,5.0,7,2000,5.0,4.0,4.366379,3.975369,0.0,...,0.0,0.0,0.0,0.823735,0.0,0.0,0.0,0.566975,0.0,0.0
4,1,50,5.0,7,2000,5.0,4.5,4.366379,4.237745,0.0,...,0.0,0.0,0.0,0.685854,0.0,0.0,0.0,0.472071,0.0,0.0


In [73]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [75]:
y = pd.Series(df.rating)

In [77]:
del df['rating']

In [79]:
x = df

In [81]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=30)

In [82]:
model = LinearRegression()

In [85]:
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [87]:
model.score(X_train, y_train)

0.408740773685471

In [89]:
model.score(X_test, y_test)

0.39801659664916667

In [91]:
mean_squared_error(y_test, predictions)

0.653431457284272