In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

### Нагенерим фичей: подготовим теги и жанры к работе с текстом, вычислим среднюю оценку по фильму, разброс

#### Предобработка жанров

In [4]:
def get_space_genres(param):
    return ' '.join(param.replace(' ', '').replace('-', '').split('|'))

In [5]:
movies['genres'] = movies['genres'].apply(get_space_genres)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


#### Предобработка тегов

In [6]:
tag_strings = []
movieId = []

for movie, group in tags.groupby('movieId'):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movieId.append(movie)

In [7]:
tags_df = pd.DataFrame(tag_strings, movieId, columns=['tags']).reset_index().rename(columns={'index':'movieId'})
tags_df

Unnamed: 0,movieId,tags
0,1,pixar pixar fun
1,2,fantasy magicboardgame RobinWilliams game
2,3,moldy old
3,5,pregnancy remake
4,7,remake
...,...,...
1567,183611,Comedy funny RachelMcAdams
1568,184471,adventure AliciaVikander videogameadaptation
1569,187593,JoshBrolin RyanReynolds sarcasm
1570,187595,EmiliaClarke starwars


#### Поработаем с рейтингом

In [8]:
df_ratings = ratings.groupby('movieId').agg([np.mean, np.std, np.median, np.var])['rating'].reset_index()

In [9]:
df_ratings = df_ratings.rename(columns={'mean':'mean_rating'})
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9724 entries, 0 to 9723
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   movieId      9724 non-null   int64  
 1   mean_rating  9724 non-null   float64
 2   std          6278 non-null   float64
 3   median       9724 non-null   float64
 4   var          6278 non-null   float64
dtypes: float64(4), int64(1)
memory usage: 380.0 KB


#### Соединим DF

In [10]:
df = movies.merge(tags_df, on='movieId', how='left')
df = df.merge(df_ratings, on='movieId', how='left')

In [11]:
df_1 = ratings.merge(df)

In [12]:
df_1['std'] = df_1['std'].fillna(0)
df_1['var'] = df_1['var'].fillna(0)

In [13]:
df_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   userId       100836 non-null  int64  
 1   movieId      100836 non-null  int64  
 2   rating       100836 non-null  float64
 3   timestamp    100836 non-null  int64  
 4   title        100836 non-null  object 
 5   genres       100836 non-null  object 
 6   tags         48287 non-null   object 
 7   mean_rating  100836 non-null  float64
 8   std          100836 non-null  float64
 9   median       100836 non-null  float64
 10  var          100836 non-null  float64
dtypes: float64(5), int64(3), object(3)
memory usage: 9.2+ MB


#### Достанем год из названия фильма

In [14]:
import re

In [15]:
def get_year(param):
    res = re.search(r'\d{4}', param)
    if res != None:
        return int(res.group(0))
    else:
        return 0

In [16]:
df_1['year'] = df_1['title'].apply(get_year)

#### Преобразуем timestamp к году

In [17]:
def time_to_year(param):
    sec_in_year = 31536000
    res = param/sec_in_year + 1970
    return int(round(res, 0))

In [18]:
df_1['year_view'] = df_1['timestamp'].apply(time_to_year)

#### Добавим столбец с временем между выходом фильма и просмотром

In [19]:
df_1['year_view'].median() - df_1['year'].median()

11.0

In [20]:
def get_time_beetwen(row):
    if row['year'] != 0 and row['year'] < row['year_view']:
        return row['year_view'] - row['year']
    else:
        return 11

In [21]:
df_1['time_between'] = df_1.apply(get_time_beetwen, axis=1)

#### Удaлим ненужные столбцы

In [22]:
df_2 = df_1.drop(columns=['userId', 'movieId', 'timestamp', 'title'])

In [23]:
df_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   rating        100836 non-null  float64
 1   genres        100836 non-null  object 
 2   tags          48287 non-null   object 
 3   mean_rating   100836 non-null  float64
 4   std           100836 non-null  float64
 5   median        100836 non-null  float64
 6   var           100836 non-null  float64
 7   year          100836 non-null  int64  
 8   year_view     100836 non-null  int64  
 9   time_between  100836 non-null  int64  
dtypes: float64(5), int64(3), object(2)
memory usage: 8.5+ MB


### Преобразуем TfIdf

In [24]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer

In [25]:
cv = CountVectorizer()
x_count_genres = cv.fit_transform(df_2['genres'])

In [26]:
td = TfidfTransformer()
x_td_genres = td.fit_transform(x_count_genres)

In [27]:
df_td_genres = pd.DataFrame(x_td_genres.toarray())

In [28]:
df_td_genres.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,0.363885,0.549735,0.508407,0.291944,0.0,0.0,0.0,0.470819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.363885,0.549735,0.508407,0.291944,0.0,0.0,0.0,0.470819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.363885,0.549735,0.508407,0.291944,0.0,0.0,0.0,0.470819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.363885,0.549735,0.508407,0.291944,0.0,0.0,0.0,0.470819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.363885,0.549735,0.508407,0.291944,0.0,0.0,0.0,0.470819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Concat

In [46]:
df_with_tf_genres = pd.concat([df_2, df_td_genres], axis=1).drop(columns=['genres', 'tags'])

In [47]:
df_with_tf_genres

Unnamed: 0,rating,mean_rating,std,median,var,year,year_view,time_between,0,1,...,10,11,12,13,14,15,16,17,18,19
0,4.0,3.92093,0.834859,4.0,0.69699,1995,2001,6,0.000000,0.363885,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,4.0,3.92093,0.834859,4.0,0.69699,1995,1997,2,0.000000,0.363885,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,4.5,3.92093,0.834859,4.0,0.69699,1995,2005,10,0.000000,0.363885,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,2.5,3.92093,0.834859,4.0,0.69699,1995,2018,23,0.000000,0.363885,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,4.5,3.92093,0.834859,4.0,0.69699,1995,2011,16,0.000000,0.363885,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,2.5,2.50000,0.000000,2.5,0.00000,1997,2017,20,0.683829,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.729642,0.0,0.0
100832,4.5,4.50000,0.000000,4.5,0.00000,1971,2017,46,0.545073,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
100833,3.0,3.00000,0.000000,3.0,0.00000,2005,2017,12,0.590016,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.629544,0.0,0.0
100834,3.5,3.50000,0.000000,3.5,0.00000,2016,2017,1,0.000000,0.000000,...,0.840475,0.0,0.0,0.0,0.0,0.0,0.0,0.541851,0.0,0.0


### Проба предсказать значения

In [54]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [48]:
x = df_with_tf_genres.drop(columns=['rating'])
y = df_with_tf_genres['rating']

In [49]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

#### Рандомный лес

In [50]:
rf = RandomForestRegressor(n_estimators = 10)
rf.fit(x_train, y_train)

RandomForestRegressor(n_estimators=10)

In [51]:
y_pred = rf.predict(x_test)

In [52]:
mean_squared_error(y_test, y_pred, squared=False)

0.9944458026260693

In [53]:
mean_squared_error(y_test, y_pred)

0.9889224543606071

In [55]:
r2_score(y_test, y_pred)

0.08231379104094982

#### Уберем некоторые признаки

In [89]:
x_1 = x.drop(columns=['median', 'var', 'year', 'year_view', 'time_between'])
# ['mean_rating', 'std', 'median', 'var', 'year', 'year_view', 'time_between']

In [90]:
x1_train, x1_test, y1_train, y1_test = train_test_split(x_1, y, test_size= 0.3, random_state=42)

rf_1 = RandomForestRegressor(n_estimators = 10)
rf_1.fit(x1_train, y1_train)

y1_pred = rf_1.predict(x1_test)

mean_squared_error(y1_test, y1_pred, squared=False)

0.9387921739817295

In [91]:
r2_score(y1_test, y1_pred)

0.18215521600844453

#### Попробуем добавить tfidf по тегам, закрывая глаза, что там много пустых значений

In [30]:
df_2['tags'] = df_2['tags'].fillna('')

In [31]:
cv_1 = CountVectorizer()
x_count_tags = cv.fit_transform(df_2['tags'])

In [32]:
td_1 = TfidfTransformer()
x_td_tags = td.fit_transform(x_count_tags)

In [33]:
df_td_tags = pd.DataFrame(x_td_tags.toarray())

In [34]:
df_with_tf = pd.concat([df_2, df_td_genres, df_td_tags], axis=1).drop(columns=['genres', 'tags'])

In [35]:
df_with_tf.head()

Unnamed: 0,rating,mean_rating,std,median,var,year,year_view,time_between,0,1,...,1459,1460,1461,1462,1463,1464,1465,1466,1467,1468
0,4.0,3.92093,0.834859,4.0,0.69699,1995,2001,6,0.0,0.363885,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,3.92093,0.834859,4.0,0.69699,1995,1997,2,0.0,0.363885,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.5,3.92093,0.834859,4.0,0.69699,1995,2005,10,0.0,0.363885,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.5,3.92093,0.834859,4.0,0.69699,1995,2018,23,0.0,0.363885,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.5,3.92093,0.834859,4.0,0.69699,1995,2011,16,0.0,0.363885,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Пробуем предсказать значения

In [92]:
x = df_with_tf.drop(columns=['rating', 'median', 'var', 'year', 'year_view', 'time_between'])
# ['mean_rating', 'std', 'median', 'var', 'year', 'year_view', 'time_between']
y = df_with_tf['rating']

In [93]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

rf = RandomForestRegressor(n_estimators = 10)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

mean_squared_error(y_test, y_pred, squared=False)

0.9391993315116507

In [94]:
r2_score(y1_test, y1_pred)

0.18215521600844453

### При прочих равных мы видим, что результат без tfidf на тегах лучше, чем с ними
#### Но все равно оставляет желать лучшего

#### Подбор гиперпараметров и кросс-валидацию не делал, пока что стандартным алгоритмом обошелся