In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import io, pickle, zipfile

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from zipfile import ZipFile

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import NearestNeighbors

import warnings
warnings.filterwarnings("ignore")

### Задание

1. Использовать датасет MovieLens.

2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:

    - TF-IDF на тегах и жанрах

    - средние оценки (+ median, variance и т. д.) пользователя и фильма
    
3. Оценить RMSE на тестовой выборке.

In [2]:
# посмотрим какие файлы есть в архиве 
with ZipFile('ml-latest-small.zip', 'r') as myzip:
    for item in myzip.infolist():
        print(f'File Name: {item.filename} ')

File Name: ml-latest-small/ 
File Name: ml-latest-small/links.csv 
File Name: ml-latest-small/tags.csv 
File Name: ml-latest-small/ratings.csv 
File Name: ml-latest-small/README.txt 
File Name: ml-latest-small/movies.csv 


In [3]:
# извлечение из архива
with ZipFile('ml-latest-small.zip', 'r') as myzip:
    myzip.extractall()

links = pd.read_csv('ml-latest-small/links.csv ')
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')   

In [4]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [5]:
# средний рейтинг, который выставляет пользователь
mean_rating_user = ratings.groupby('userId').agg({'rating': ['mean']}). reset_index()
mean_rating_user.columns = ['userId', 'mean_rating_user']
mean_rating_user

Unnamed: 0,userId,mean_rating_user
0,1,4.366379
1,2,3.948276
2,3,2.435897
3,4,3.555556
4,5,3.636364
...,...,...
605,606,3.657399
606,607,3.786096
607,608,3.134176
608,609,3.270270


In [6]:
# средний рейтинг пользователей добавим как признак
rating_by_user = ratings.join(mean_rating_user.set_index('userId'), on='userId')
del rating_by_user['timestamp']
rating_by_user

Unnamed: 0,userId,movieId,rating,mean_rating_user
0,1,1,4.0,4.366379
1,1,3,4.0,4.366379
2,1,6,4.0,4.366379
3,1,47,5.0,4.366379
4,1,50,5.0,4.366379
...,...,...,...,...
100831,610,166534,4.0,3.688556
100832,610,168248,5.0,3.688556
100833,610,168250,5.0,3.688556
100834,610,168252,5.0,3.688556


In [7]:
# средний рейтинг фильма
mean_rating_film = ratings.groupby('movieId').agg({'rating': ['mean']}). reset_index()
mean_rating_film.columns = ['movieId', 'mean_rating_film']
mean_rating_film

Unnamed: 0,movieId,mean_rating_film
0,1,3.920930
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429
...,...,...
9719,193581,4.000000
9720,193583,3.500000
9721,193585,3.500000
9722,193587,3.500000


In [8]:
# сведем все рейтинги
ratings_all = rating_by_user.join(mean_rating_film.set_index('movieId'), on='movieId')
ratings_all

Unnamed: 0,userId,movieId,rating,mean_rating_user,mean_rating_film
0,1,1,4.0,4.366379,3.920930
1,1,3,4.0,4.366379,3.259615
2,1,6,4.0,4.366379,3.946078
3,1,47,5.0,4.366379,3.975369
4,1,50,5.0,4.366379,4.237745
...,...,...,...,...,...
100831,610,166534,4.0,3.688556,3.333333
100832,610,168248,5.0,3.688556,4.142857
100833,610,168250,5.0,3.688556,3.633333
100834,610,168252,5.0,3.688556,4.280000


In [9]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [10]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [11]:
movies_with_tags = movies.merge(tags, on='movieId')
del movies_with_tags['timestamp']
movies_with_tags

Unnamed: 0,movieId,title,genres,userId,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game
...,...,...,...,...,...
3678,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,star wars
3679,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,anime
3680,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,comedy
3681,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,gintama


In [12]:
r_m_t = ratings_all.merge(movies_with_tags, how = 'left')
r_m_t

Unnamed: 0,userId,movieId,rating,mean_rating_user,mean_rating_film,title,genres,tag
0,1,1,4.0,4.366379,3.920930,,,
1,1,3,4.0,4.366379,3.259615,,,
2,1,6,4.0,4.366379,3.946078,,,
3,1,47,5.0,4.366379,3.975369,,,
4,1,50,5.0,4.366379,4.237745,,,
...,...,...,...,...,...,...,...,...
102672,610,166534,4.0,3.688556,3.333333,,,
102673,610,168248,5.0,3.688556,4.142857,John Wick: Chapter Two (2017),Action|Crime|Thriller,Heroic Bloodshed
102674,610,168250,5.0,3.688556,3.633333,,,
102675,610,168252,5.0,3.688556,4.280000,,,


In [15]:
# оставим только те строки, где присутствуют тэги
r_m_t = r_m_t.dropna()
r_m_t = r_m_t.reset_index(drop=True)

r_m_t

Unnamed: 0,userId,movieId,rating,mean_rating_user,mean_rating_film,title,genres,tag
0,2,60756,5.0,3.948276,3.553571,Step Brothers (2008),Comedy,funny
1,2,60756,5.0,3.948276,3.553571,Step Brothers (2008),Comedy,Highly quotable
2,2,60756,5.0,3.948276,3.553571,Step Brothers (2008),Comedy,will ferrell
3,2,89774,5.0,3.948276,3.727273,Warrior (2011),Drama,Boxing story
4,2,89774,5.0,3.948276,3.727273,Warrior (2011),Drama,MMA
...,...,...,...,...,...,...,...,...
3471,606,6107,4.0,3.657399,4.000000,Night of the Shooting Stars (Notte di San Lore...,Drama|War,World War II
3472,606,7382,4.5,3.657399,4.250000,I'm Not Scared (Io non ho paura) (2003),Drama|Mystery|Thriller,for katie
3473,610,3265,5.0,3.688556,4.000000,Hard-Boiled (Lat sau san taam) (1992),Action|Crime|Drama|Thriller,gun fu
3474,610,3265,5.0,3.688556,4.000000,Hard-Boiled (Lat sau san taam) (1992),Action|Crime|Drama|Thriller,heroic bloodshed


In [16]:
r_m_t[r_m_t['movieId'] == 1].isnull().sum() 

userId              0
movieId             0
rating              0
mean_rating_user    0
mean_rating_film    0
title               0
genres              0
tag                 0
dtype: int64

In [17]:
r_m_t[r_m_t['movieId'] == 1]

Unnamed: 0,userId,movieId,rating,mean_rating_user,mean_rating_film,title,genres,tag
614,336,1,4.0,4.321429,3.92093,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar
912,474,1,4.0,3.398956,3.92093,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar
2700,567,1,3.5,2.245455,3.92093,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,fun


In [18]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|')).lower()

In [19]:
movie_genres = [change_string(g) for g in r_m_t.genres.values]
movie_genres[:5]

['comedy', 'comedy', 'comedy', 'drama', 'drama']

In [20]:
tfidf_genre = TfidfVectorizer()
genre_tfidf = tfidf_genre.fit_transform(movie_genres)
genre_tfidf

<3476x20 sparse matrix of type '<class 'numpy.float64'>'
	with 9463 stored elements in Compressed Sparse Row format>

In [21]:
genre_ = pd.DataFrame(genre_tfidf.toarray(), columns=tfidf_genre.get_feature_names_out())
genre_

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,0.000000,0.0,0.0,0.0,1.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
1,0.000000,0.0,0.0,0.0,1.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
2,0.000000,0.0,0.0,0.0,1.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3471,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.352848,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.935681,0.0
3472,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.372715,0.0,0.0,0.0,0.0,0.0,0.765909,0.0,0.0,0.0,0.523896,0.000000,0.0
3473,0.561435,0.0,0.0,0.0,0.0,0.564029,0.0,0.351021,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.493402,0.000000,0.0
3474,0.561435,0.0,0.0,0.0,0.0,0.564029,0.0,0.351021,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.493402,0.000000,0.0


In [22]:
g_tfidf = pd.concat([r_m_t, genre_], axis=1)
del g_tfidf['genres']
g_tfidf

Unnamed: 0,userId,movieId,rating,mean_rating_user,mean_rating_film,title,tag,action,adventure,animation,...,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,2,60756,5.0,3.948276,3.553571,Step Brothers (2008),funny,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
1,2,60756,5.0,3.948276,3.553571,Step Brothers (2008),Highly quotable,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
2,2,60756,5.0,3.948276,3.553571,Step Brothers (2008),will ferrell,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
3,2,89774,5.0,3.948276,3.727273,Warrior (2011),Boxing story,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
4,2,89774,5.0,3.948276,3.727273,Warrior (2011),MMA,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3471,606,6107,4.0,3.657399,4.000000,Night of the Shooting Stars (Notte di San Lore...,World War II,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.935681,0.0
3472,606,7382,4.5,3.657399,4.250000,I'm Not Scared (Io non ho paura) (2003),for katie,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.765909,0.0,0.0,0.0,0.523896,0.000000,0.0
3473,610,3265,5.0,3.688556,4.000000,Hard-Boiled (Lat sau san taam) (1992),gun fu,0.561435,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.493402,0.000000,0.0
3474,610,3265,5.0,3.688556,4.000000,Hard-Boiled (Lat sau san taam) (1992),heroic bloodshed,0.561435,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.493402,0.000000,0.0


In [23]:
def change_tag(s):
    return str(s).replace(' ', '').replace('-', '').lower()

movie_tags = [change_tag(t) for t in r_m_t.tag.values]
movie_tags[:5]

['funny', 'highlyquotable', 'willferrell', 'boxingstory', 'mma']

In [24]:
tfidf_tag = TfidfVectorizer()
tag_tfidf = tfidf_tag.fit_transform(movie_tags)
tag_tfidf

<3476x1436 sparse matrix of type '<class 'numpy.float64'>'
	with 3512 stored elements in Compressed Sparse Row format>

In [25]:
tag_ = pd.DataFrame(tag_tfidf.toarray(), columns=tfidf_tag.get_feature_names_out())
tag_

Unnamed: 0,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,2danimation,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
gt_tfidf = pd.concat([g_tfidf, tag_], axis=1)
del gt_tfidf['tag']

gt_tfidf

Unnamed: 0,userId,movieId,rating,mean_rating_user,mean_rating_film,title,action,adventure,animation,children,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,2,60756,5.0,3.948276,3.553571,Step Brothers (2008),0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,60756,5.0,3.948276,3.553571,Step Brothers (2008),0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,60756,5.0,3.948276,3.553571,Step Brothers (2008),0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,89774,5.0,3.948276,3.727273,Warrior (2011),0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,89774,5.0,3.948276,3.727273,Warrior (2011),0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3471,606,6107,4.0,3.657399,4.000000,Night of the Shooting Stars (Notte di San Lore...,0.000000,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3472,606,7382,4.5,3.657399,4.250000,I'm Not Scared (Io non ho paura) (2003),0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3473,610,3265,5.0,3.688556,4.000000,Hard-Boiled (Lat sau san taam) (1992),0.561435,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3474,610,3265,5.0,3.688556,4.000000,Hard-Boiled (Lat sau san taam) (1992),0.561435,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
gt_tfidf.drop(['userId', 'movieId', 'title'], axis= 1 , inplace= True)

In [28]:
gt_tfidf.head(3)

Unnamed: 0,rating,mean_rating_user,mean_rating_film,action,adventure,animation,children,comedy,crime,documentary,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,5.0,3.948276,3.553571,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5.0,3.948276,3.553571,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,3.948276,3.553571,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
y = gt_tfidf['rating']
# целевая переменная преобразуется в категориальную переменную
y = y.astype('category').cat.codes
del gt_tfidf['rating']
X = gt_tfidf
X.head()

Unnamed: 0,mean_rating_user,mean_rating_film,action,adventure,animation,children,comedy,crime,documentary,drama,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,3.948276,3.553571,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.948276,3.553571,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.948276,3.553571,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.948276,3.727273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.948276,3.727273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# десять классов. Несбалансированные
y.value_counts()

7    999
9    883
6    577
8    496
5    274
3    102
4     80
1     31
2     26
0      8
dtype: int64

In [33]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression

# классы несбалансированные
# попробую взвесить классы
from sklearn.utils.class_weight import compute_class_weight

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

In [35]:
# Получение весов классов
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)

In [36]:
model = LogisticRegression(multi_class = 'ovr',
                           class_weight=dict(enumerate(class_weights)))

model = model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [37]:
# train accuracy
X_train_score = model.score(X_train, y_train) 
X_train_score

0.5482014388489208

In [38]:
# test accuracy
X_test_score = model.score(X_test,y_test)
X_test_score

0.3146551724137931

In [39]:
mean_squared_error(y_test, predictions, squared=False)

2.2357466802817365