In [329]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [423]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [424]:
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [425]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [426]:
tags.head(2)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996


In [427]:
def normed_user_rate_func(x):
    """
    Функция для нормирования оценки пользователя
    """
    if x.max() != x.min():
        return (x - x.min())/(x.max() - x.min())
    else:
        return 1
    
ratings['normed_user_rating'] = ratings.groupby('userId', sort=False)['rating'].transform(normed_user_rate_func)

In [428]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp,normed_user_rating
0,1,1,4.0,964982703,0.75
1,1,3,4.0,964981247,0.75


In [429]:
# количество оценок по фильмам
movies_rating_count = ratings.groupby('movieId', sort=False)['rating'].count()
# средняя нормированная оценка пользователей для каждого фильма
movies_norm_mean_rating = ratings.groupby('movieId', sort=False)['normed_user_rating'].mean()
# средняя оценка фильма по пользователям
movies_mean_rating = ratings.groupby('movieId', sort=False)[['rating']].mean()

In [430]:
movies = pd.merge(movies, pd.DataFrame(movies_rating_count).rename(columns={'rating':'rating_count'}), 
                  left_on='movieId', right_index=True)

In [431]:
movies = pd.merge(movies, movies_mean_rating.rename(columns={'rating':'mean_rating'}), 
                  left_on='movieId', right_index=True)

In [432]:
movies.head(2)

Unnamed: 0,movieId,title,genres,rating_count,mean_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215,3.92093
1,2,Jumanji (1995),Adventure|Children|Fantasy,110,3.431818


In [433]:
# среднее количество оценок фильма
mean_num_rate = movies_rating_count.mean()
std_num_rate = movies_rating_count.std()

In [434]:
# нормированная средняя оценка фильма
movies_mean_normed_rating = pd.DataFrame(movies_norm_mean_rating*
        (movies_rating_count - mean_num_rate)/std_num_rate, columns=['movie_normed_mean_rating'])

In [435]:
movies_mean_normed_rating.head(2)

Unnamed: 0_level_0,movie_normed_mean_rating
movieId,Unnamed: 1_level_1
1,6.584792
3,0.977352


In [436]:
movies = pd.merge(movies, movies_mean_normed_rating, left_on='movieId', right_index=True)

In [437]:
movies.head(2)

Unnamed: 0,movieId,title,genres,rating_count,mean_rating,movie_normed_mean_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215,3.92093,6.584792
1,2,Jumanji (1995),Adventure|Children|Fantasy,110,3.431818,2.674611


In [438]:
# количество тэгов для каждого фильма
movies_tag_count = tags.groupby('movieId')[['tag']].count()
movies_tag_count.columns = ['num_of_tags']

In [439]:
movies_tag_count.head(2)

Unnamed: 0_level_0,num_of_tags
movieId,Unnamed: 1_level_1
1,3
2,4


In [440]:
movies = pd.merge(movies, movies_tag_count, left_on='movieId', right_index=True)

In [441]:
movies.head(2)

Unnamed: 0,movieId,title,genres,rating_count,mean_rating,movie_normed_mean_rating,num_of_tags
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215,3.92093,6.584792,3
1,2,Jumanji (1995),Adventure|Children|Fantasy,110,3.431818,2.674611,4


In [442]:
tags['tag'] = tags['tag'].str.lower()
movie_tags = tags.groupby('movieId')[['tag']].agg(' '.join)
movie_tags.columns = ['tags']

In [443]:
movie_tags.head(2)

Unnamed: 0_level_0,tags
movieId,Unnamed: 1_level_1
1,pixar pixar fun
2,fantasy magic board game robin williams game


In [444]:
movies = pd.merge(movies, movie_tags, left_on='movieId', right_index=True)

In [445]:
movies.head(2)

Unnamed: 0,movieId,title,genres,rating_count,mean_rating,movie_normed_mean_rating,num_of_tags,tags
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215,3.92093,6.584792,3,pixar pixar fun
1,2,Jumanji (1995),Adventure|Children|Fantasy,110,3.431818,2.674611,4,fantasy magic board game robin williams game


In [446]:
import re

def extract_year(row):
    match = re.search('\(\D*\d{4}\D*\)', row)
    if match:
        return int(re.search('\d{4}', match[0])[0])
    return 0

In [447]:
movies['year'] = movies['title'].apply(extract_year)

In [448]:
movies['year'].unique()

array([1995, 1994, 1996, 1976, 1992, 1964, 1977, 1993, 1982, 1990, 1991,
       1989, 1937, 1940, 1970, 1955, 1959, 1972, 1952, 1951, 1961, 1958,
       1954, 1934, 1944, 1960, 1963, 1942, 1941, 1953, 1939, 1950, 1968,
       1946, 1945, 1938, 1947, 1935, 1936, 1956, 1932, 1969, 1981, 1965,
       1988, 1979, 1967, 1987, 1986, 1975, 1980, 1957, 1966, 1962, 1971,
       1983, 1949, 1985, 1974, 1984, 1973, 1948, 1931, 1978, 1922, 1997,
       1998, 1943, 1999, 2000, 1928, 1921, 1925, 1933, 2001, 1926, 1930,
       2002, 2003, 2004, 1923, 1927, 1924, 1929, 2005, 2006, 2007, 2008,
       2009, 2010, 2011, 2012, 2013, 2014, 2015, 2018, 2017, 2016,    0])

In [449]:
movies[movies['year']==0]

Unnamed: 0,movieId,title,genres,rating_count,mean_rating,movie_normed_mean_rating,num_of_tags,tags,year
9259,156605,Paterson,(no genres listed),1,4.5,-0.371801,3,quirky sweet understated,0


In [450]:
movies['genres'] = movies['genres'].str.lower().str.replace('|', ' ')

In [451]:
movies.head()

Unnamed: 0,movieId,title,genres,rating_count,mean_rating,movie_normed_mean_rating,num_of_tags,tags,year
0,1,Toy Story (1995),adventure animation children comedy fantasy,215,3.92093,6.584792,3,pixar pixar fun,1995
1,2,Jumanji (1995),adventure children fantasy,110,3.431818,2.674611,4,fantasy magic board game robin williams game,1995
2,3,Grumpier Old Men (1995),comedy romance,52,3.259615,0.977352,2,moldy old,1995
4,5,Father of the Bride Part II (1995),comedy,49,3.071429,0.748352,2,pregnancy remake,1995
6,7,Sabrina (1995),comedy romance,54,3.185185,0.947953,1,remake,1995


In [452]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [453]:
corpus_1 = movies['genres']
corpus_2 = movies['tags']
vectorizer_1 = TfidfVectorizer()
vectorizer_2 = TfidfVectorizer()
X1 = vectorizer_1.fit_transform(corpus_1)
X2 = vectorizer_2.fit_transform(corpus_2)

In [454]:
from scipy.sparse import csr_matrix, hstack

In [455]:
X3 = csr_matrix(movies.drop(['genres', 'tags'], axis=1).iloc[:,2:].values)

In [456]:
X1.shape, X2.shape, X3.shape

((1554, 24), (1554, 1742), (1554, 5))

In [457]:
X = hstack([X1, X2, X3])

In [458]:
from sklearn.preprocessing import MaxAbsScaler

In [459]:
# проведем масштабирование данных с помощью MaxAbsScaler (чтобы матрица данных осталась разреженной)
scaler = MaxAbsScaler()
X_sc = scaler.fit_transform(X)

In [460]:
movies.reset_index(drop=True, inplace=True)

In [461]:
# топ 100 фильмов без привязки к жанру (по нормированному рейтингу)
top_100 = movies.sort_values('movie_normed_mean_rating', ascending=False).head(10)
top_100.head(10)

Unnamed: 0,movieId,title,genres,rating_count,mean_rating,movie_normed_mean_rating,num_of_tags,tags,year
83,318,"Shawshank Redemption, The (1994)",crime drama,317,4.429022,11.620828,4,prison stephen king wrongful imprisonment morg...,1994
94,356,Forrest Gump (1994),comedy drama romance war,329,4.164134,11.159282,9,shrimp vietnam bubba gump shrimp lieutenant da...,1994
77,296,Pulp Fiction (1994),comedy crime drama thriller,307,4.197068,10.37557,181,good dialogue great soundtrack non-linear cult...,1994
577,2571,"Matrix, The (1999)",action sci-fi thriller,278,4.192446,9.50525,5,martial arts sci-fi alternate universe philoso...,1999
146,593,"Silence of the Lambs, The (1991)",crime horror thriller,279,4.16129,9.455111,6,hannibal lector disturbing drama gothic psycho...,1991
65,260,Star Wars: Episode IV - A New Hope (1977),action adventure sci-fi,251,4.231076,8.634377,26,classic space action action sci-fi epic great ...,1977
637,2959,Fight Club (1999),action crime drama thriller,218,4.272936,7.568651,54,dark comedy psychology thought-provoking twist...,1999
36,110,Braveheart (1995),action drama war,237,4.031646,7.554489,10,beautiful scenery epic historical inspirationa...,1995
126,527,Schindler's List (1993),drama war,220,4.225,7.489698,7,moving thought-provoking holocaust based on a ...,1993
287,1196,Star Wars: Episode V - The Empire Strikes Back...,action adventure sci-fi,211,4.21564,7.155559,10,i am your father space space opera classic geo...,1980


In [138]:
# выберем случайным образом среди топ 100 - 3 фильма и найдем наиболее близкие к ним
ids = np.random.choice(top_100.index, 3)
ids

array([ 65,  83, 116])

In [462]:
movies.loc[ids,:]

Unnamed: 0,movieId,title,genres,rating_count,mean_rating,movie_normed_mean_rating,num_of_tags,tags,year
65,260,Star Wars: Episode IV - A New Hope (1977),action adventure sci-fi,251,4.231076,8.634377,26,classic space action action sci-fi epic great ...,1977
83,318,"Shawshank Redemption, The (1994)",crime drama,317,4.429022,11.620828,4,prison stephen king wrongful imprisonment morg...,1994
116,480,Jurassic Park (1993),action adventure sci-fi thriller,238,3.75,6.860897,1,dinosaur,1993


In [463]:
from sklearn.neighbors import NearestNeighbors

In [464]:
knn = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='minkowski') 
knn.fit(X_sc)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=-1, n_neighbors=10, p=2, radius=1.0)

In [465]:
films = X_sc.tocsc()[ids,:]

In [466]:
result = []
for film in films:
    result.append(knn.kneighbors(film, return_distance=True))

In [467]:
# Рекомендации по фильму Star Wars: Episode IV - A New Hope (1977)
movies.loc[result[0][1][0][1:],:]

Unnamed: 0,movieId,title,genres,rating_count,mean_rating,movie_normed_mean_rating,num_of_tags,tags,year
1299,33493,Star Wars: Episode III - Revenge of the Sith (...,action adventure sci-fi,78,3.429487,1.909589,2,space space opera,2005
116,480,Jurassic Park (1993),action adventure sci-fi thriller,238,3.75,6.860897,1,dinosaur,1993
449,1917,Armageddon (1998),action romance sci-fi thriller,92,3.054348,1.927614,1,space,1998
171,780,Independence Day (a.k.a. ID4) (1996),action adventure sci-fi thriller,202,3.445545,5.177877,1,aliens,1996
821,4446,Final Fantasy: The Spirits Within (2001),adventure animation fantasy sci-fi,31,3.354839,0.574179,1,sci-fi,2001
290,1200,Aliens (1986),action adventure horror sci-fi,126,3.964286,3.846565,9,action aliens horror sci-fi space space craft ...,1986
892,5349,Spider-Man (2002),action adventure sci-fi thriller,122,3.540984,3.2146,1,superhero,2002
409,1580,Men in Black (a.k.a. MIB) (1997),action comedy sci-fi,165,3.487879,4.326251,1,aliens,1997
592,2662,"War of the Worlds, The (1953)",action drama sci-fi,12,3.166667,0.038998,1,classic,1953


In [468]:
# Рекомендации по фильму Shawshank Redemption, The (1994)
movies.loc[result[1][1][0][1:],:]

Unnamed: 0,movieId,title,genres,rating_count,mean_rating,movie_normed_mean_rating,num_of_tags,tags,year
669,3147,"Green Mile, The (1999)",crime drama,111,4.148649,3.555713,1,stephen king,1999
180,858,"Godfather, The (1972)",crime drama,192,4.289062,6.632262,1,mafia,1972
300,1213,Goodfellas (1990),crime drama,126,4.25,4.192264,1,mafia,1990
305,1221,"Godfather: Part II, The (1974)",crime drama,129,4.25969,4.288847,3,al pacino mafia mafia,1974
7,16,Casino (1995),crime drama,82,3.926829,2.365961,1,mafia,1995
37,111,Taxi Driver (1976),crime drama thriller,104,4.105769,3.213934,1,assassination,1976
1298,33166,Crash (2004),crime drama,50,3.89,1.30751,1,racism,2004
397,1466,Donnie Brasco (1997),crime drama,52,3.740385,1.260892,1,mafia,1997
864,4963,Ocean's Eleven (2001),crime thriller,119,3.844538,3.472857,1,heist,2001


In [469]:
# Рекомендации по фильму Jurassic Park (1993)
movies.loc[result[2][1][0][1:],:]

Unnamed: 0,movieId,title,genres,rating_count,mean_rating,movie_normed_mean_rating,num_of_tags,tags,year
171,780,Independence Day (a.k.a. ID4) (1996),action adventure sci-fi thriller,202,3.445545,5.177877,1,aliens,1996
892,5349,Spider-Man (2002),action adventure sci-fi thriller,122,3.540984,3.2146,1,superhero,2002
369,1356,Star Trek: First Contact (1996),action adventure sci-fi thriller,91,3.879121,2.548602,1,borg,1996
405,1544,"Lost World: Jurassic Park, The (1997)",action adventure sci-fi thriller,67,2.753731,1.164216,1,dinosaurs,1997
409,1580,Men in Black (a.k.a. MIB) (1997),action comedy sci-fi,165,3.487879,4.326251,1,aliens,1997
1443,91500,The Hunger Games (2012),action adventure drama sci-fi thriller,54,3.435185,1.146314,1,ending,2012
587,2640,Superman (1978),action adventure sci-fi,61,3.606557,1.479627,1,superhero,1978
841,4638,Jurassic Park III (2001),action adventure sci-fi thriller,36,2.847222,0.577122,1,dinosaurs,2001
603,2716,Ghostbusters (a.k.a. Ghost Busters) (1984),action comedy sci-fi,120,3.775,3.434125,1,ghosts,1984


Попробуем предсказать средние оценки фильмам из отложенной выборки

In [470]:
X3 = csr_matrix(movies.drop(['genres', 'tags', 'mean_rating'], axis=1).iloc[:,2:].values)
y = movies['mean_rating']

In [471]:
X1.shape, X2.shape, X3.shape, y.shape

((1554, 24), (1554, 1742), (1554, 4), (1554,))

In [472]:
X = hstack([X1, X2, X3])

In [473]:
from sklearn.model_selection import train_test_split

In [474]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [475]:
from sklearn.preprocessing import MaxAbsScaler

In [476]:
# проведем масштабирование данных с помощью MaxAbsScaler (чтобы матрица данных осталась разреженной)
scaler = MaxAbsScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

In [477]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

In [478]:
model_lasso = Lasso()
model_lasso.fit(X_train_sc, y_train)

model_rfr = RandomForestRegressor(n_estimators=100)
model_rfr.fit(X_train_sc, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [479]:
y_pred_lasso = model_lasso.predict(X_test_sc)
y_pred_rfr = model_rfr.predict(X_test_sc)

In [480]:
from sklearn.metrics import mean_squared_error

In [481]:
RMSE_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
RMSE_rfr = np.sqrt(mean_squared_error(y_test, y_pred_rfr))

In [482]:
print(f'RMSE Lasso = {RMSE_lasso:.2f}')

RMSE Lasso = 0.55


In [483]:
print(f'RMSE Random Forest = {RMSE_rfr:.2f}')

RMSE Random Forest = 0.31


In [484]:
pd.DataFrame({'y_test' : y_test, 'y_pred_rfr' : y_pred_rfr}).head(20)

Unnamed: 0,y_test,y_pred_rfr
650,3.769231,3.332431
322,3.959302,3.805648
238,3.318182,3.602619
997,3.681818,3.414084
1013,3.75,3.649209
1456,4.107143,3.64224
347,4.210526,4.045693
404,3.714286,3.821918
1151,4.0,4.0
962,4.5,4.5175


Попробуем предсказать оценки, которые пользователи поставят фильмам из отложенной выборки

In [485]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,normed_user_rating
0,1,1,4.0,964982703,0.75
1,1,3,4.0,964981247,0.75
2,1,6,4.0,964982224,0.75
3,1,47,5.0,964983815,1.0
4,1,50,5.0,964982931,1.0


In [486]:
mean_user_rate = ratings.groupby('userId')[['rating']].mean()
mean_user_rate.head()

Unnamed: 0_level_0,rating
userId,Unnamed: 1_level_1
1,4.366379
2,3.948276
3,2.435897
4,3.555556
5,3.636364


In [487]:
ratings = pd.merge(ratings, mean_user_rate.rename(columns={'rating': 'mean_user_rating'}), 
                   left_on='userId', right_index=True)
ratings = pd.merge(ratings, movies_mean_rating.rename(columns={'rating': 'mean_film_rating'}), 
                   left_on='userId', right_index=True)

In [488]:
cols = ['movieId', 'genres', 'rating_count', 'movie_normed_mean_rating', 'num_of_tags', 'tags', 'year']

In [489]:
ratings = pd.merge(ratings, movies[cols], left_on='movieId', right_on='movieId')

In [490]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp,normed_user_rating,mean_user_rating,mean_film_rating,genres,rating_count,movie_normed_mean_rating,num_of_tags,tags,year
0,1,1,4.0,964982703,0.75,4.366379,3.92093,adventure animation children comedy fantasy,215,6.584792,3,pixar pixar fun,1995
1,5,1,4.0,847434962,0.75,3.636364,3.071429,adventure animation children comedy fantasy,215,6.584792,3,pixar pixar fun,1995


In [491]:
corpus_1 = ratings['genres']
corpus_2 = ratings['tags']
vectorizer_1 = TfidfVectorizer()
vectorizer_2 = TfidfVectorizer()
X1 = vectorizer_1.fit_transform(corpus_1)
X2 = vectorizer_2.fit_transform(corpus_2)

In [492]:
X3 = csr_matrix(ratings.drop(['genres', 'tags', 'rating', 'normed_user_rating'], axis=1).iloc[:,3:].values)
y = ratings['rating']

In [493]:
X1.shape, X2.shape, X3.shape, y.shape

((42101, 24), (42101, 1738), (42101, 6), (42101,))

In [494]:
X = hstack([X1, X2, X3])

In [496]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [498]:
# проведем масштабирование данных с помощью MaxAbsScaler (чтобы матрица данных осталась разреженной)
scaler = MaxAbsScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

In [500]:
model_lasso = Lasso()
model_lasso.fit(X_train_sc, y_train)

model_rfr = RandomForestRegressor(n_estimators=100)
model_rfr.fit(X_train_sc, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [502]:
y_pred_lasso = model_lasso.predict(X_test_sc)
y_pred_rfr = model_rfr.predict(X_test_sc)

In [503]:
RMSE_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
RMSE_rfr = np.sqrt(mean_squared_error(y_test, y_pred_rfr))

In [504]:
print(f'RMSE Lasso = {RMSE_lasso:.2f}')

RMSE Lasso = 0.96


In [505]:
print(f'RMSE Random Forest = {RMSE_rfr:.2f}')

RMSE Random Forest = 0.83


In [506]:
pd.DataFrame({'y_test' : y_test, 'y_pred_rfr' : y_pred_rfr}).head(20)

Unnamed: 0,y_test,y_pred_rfr
17546,4.0,3.515
31239,4.5,3.115
34455,5.0,3.885
33867,4.0,3.625
16666,4.0,4.325
22083,5.0,3.515
30921,3.5,4.07
26491,3.5,4.26
36856,4.0,3.945
3762,5.0,3.865


Как видно из результатов, средние оценки фильмам можно предсказать довольно точно, в то время, как оценки пользователей предсказываются очень плохо.