In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from sklearn import neighbors
from sklearn.metrics import mean_squared_error 
from math import sqrt

In [4]:
links = pd.read_csv('D:\\Netology\\DS\\RS\\100к\\links.csv')
movies = pd.read_csv('D:\\Netology\\DS\\RS\\100к\\movies.csv')
ratings = pd.read_csv('D:\\Netology\\DS\\RS\\100к\\ratings.csv')
tags = pd.read_csv('D:\\Netology\\DS\\RS\\100к\\tags.csv')

In [5]:
joined_ratings = ratings.join(movies.set_index('movieId'), on='movieId')

movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

movies_with_tags_ratings = pd.merge(joined_ratings, movies_with_tags, how='inner', on=['movieId', 'userId'])

movies_with_tags_ratings = movies_with_tags_ratings[['userId', 'movieId', 'rating', 'timestamp_x', 'title_x', 'genres_x', 'tag']]

movies_with_tags_ratings['genrestags'] = movies_with_tags_ratings['genres_x'] + '|' + movies_with_tags_ratings['tag']

movies_with_tags_ratings.groupby('movieId').head()

Unnamed: 0,userId,movieId,rating,timestamp_x,title_x,genres_x,tag,genrestags
0,2,60756,5.0,1445714980,Step Brothers (2008),Comedy,funny,Comedy|funny
1,2,60756,5.0,1445714980,Step Brothers (2008),Comedy,Highly quotable,Comedy|Highly quotable
2,2,60756,5.0,1445714980,Step Brothers (2008),Comedy,will ferrell,Comedy|will ferrell
3,2,89774,5.0,1445715189,Warrior (2011),Drama,Boxing story,Drama|Boxing story
4,2,89774,5.0,1445715189,Warrior (2011),Drama,MMA,Drama|MMA
...,...,...,...,...,...,...,...,...
3470,606,5694,3.0,1172968972,Staying Alive (1983),Comedy|Drama|Musical,70mm,Comedy|Drama|Musical|70mm
3471,606,6107,4.0,1171324428,Night of the Shooting Stars (Notte di San Lore...,Drama|War,World War II,Drama|War|World War II
3472,606,7382,4.5,1171233924,I'm Not Scared (Io non ho paura) (2003),Drama|Mystery|Thriller,for katie,Drama|Mystery|Thriller|for katie
3473,610,3265,5.0,1479542010,Hard-Boiled (Lat sau san taam) (1992),Action|Crime|Drama|Thriller,gun fu,Action|Crime|Drama|Thriller|gun fu


In [6]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

movies_with_tags_ratings['genrestags2'] = [change_string(g) for g in movies_with_tags_ratings['genrestags'].values]

df = movies_with_tags_ratings[['rating' ,'genrestags2']]
df

Unnamed: 0,rating,genrestags2
0,5.0,Comedy funny
1,5.0,Comedy Highlyquotable
2,5.0,Comedy willferrell
3,5.0,Drama Boxingstory
4,5.0,Drama MMA
...,...,...
3471,4.0,Drama War WorldWarII
3472,4.5,Drama Mystery Thriller forkatie
3473,5.0,Action Crime Drama Thriller gunfu
3474,5.0,Action Crime Drama Thriller heroicbloodshed


In [7]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df['genrestags2'])

X_train_counts

y_train = df['rating']

y = y_train

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

X = X_train_tfidf

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
rmse_val = [] 
for K in range(20):
    K = K+1
    model = neighbors.KNeighborsRegressor(n_neighbors = K)

    model.fit(X_train, y_train)
    pred=model.predict(X_test)
    error = sqrt(mean_squared_error(y_test,pred)) 
    rmse_val.append(error) 
    print('RMSE value for k= ' , K , 'is:', error)

RMSE value for k=  1 is: 1.050280477043999
RMSE value for k=  2 is: 0.9578490780503154
RMSE value for k=  3 is: 0.9055843494940332
RMSE value for k=  4 is: 0.8772932796751713
RMSE value for k=  5 is: 0.8671776787007273
RMSE value for k=  6 is: 0.8532404332363317
RMSE value for k=  7 is: 0.8484222154614307
RMSE value for k=  8 is: 0.8404609425221212
RMSE value for k=  9 is: 0.8366208037307795
RMSE value for k=  10 is: 0.836133943375323
RMSE value for k=  11 is: 0.8363037209923238
RMSE value for k=  12 is: 0.837009968912407
RMSE value for k=  13 is: 0.8342977144428478
RMSE value for k=  14 is: 0.8345317444324092
RMSE value for k=  15 is: 0.8356242456684191
RMSE value for k=  16 is: 0.8393475169562045
RMSE value for k=  17 is: 0.8392829413882814
RMSE value for k=  18 is: 0.8378264620023584
RMSE value for k=  19 is: 0.837394781675658
RMSE value for k=  20 is: 0.8392111315252331


In [9]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [10]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [38]:
df = dataset.groupby(['iid']).agg({'rating': ['var', 'describe'],})
df

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating
Unnamed: 0_level_1,var,describe,describe,describe,describe,describe,describe,describe,describe
Unnamed: 0_level_2,rating,count,mean,std,min,25%,50%,75%,max
iid,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3
'71 (2014),,1.0,4.000000,,4.0,4.000,4.0,4.000,4.0
'Hellboy': The Seeds of Creation (2004),,1.0,4.000000,,4.0,4.000,4.0,4.000,4.0
'Round Midnight (1986),0.000000,2.0,3.500000,0.000000,3.5,3.500,3.5,3.500,3.5
'Salem's Lot (2004),,1.0,5.000000,,5.0,5.000,5.0,5.000,5.0
'Til There Was You (1997),2.000000,2.0,4.000000,1.414214,3.0,3.500,4.0,4.500,5.0
...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),0.647186,22.0,3.863636,0.804479,2.0,3.500,4.0,4.375,5.0
xXx (2002),0.934330,24.0,2.770833,0.966607,0.5,2.375,3.0,3.500,4.0
xXx: State of the Union (2005),0.250000,5.0,2.000000,0.500000,1.5,1.500,2.0,2.500,2.5
¡Three Amigos! (1986),0.891154,26.0,3.134615,0.944009,1.0,2.500,3.0,3.875,5.0


In [46]:
df = df['rating'][['describe', 'var']]

In [69]:
X = df['describe'][['mean', '50%']]
X

Unnamed: 0_level_0,mean,50%
iid,Unnamed: 1_level_1,Unnamed: 2_level_1
'71 (2014),4.000000,4.0
'Hellboy': The Seeds of Creation (2004),4.000000,4.0
'Round Midnight (1986),3.500000,3.5
'Salem's Lot (2004),5.000000,5.0
'Til There Was You (1997),4.000000,4.0
...,...,...
eXistenZ (1999),3.863636,4.0
xXx (2002),2.770833,3.0
xXx: State of the Union (2005),2.000000,2.0
¡Three Amigos! (1986),3.134615,3.0


In [70]:
y = df['describe']['max']
y

iid
'71 (2014)                                   4.0
'Hellboy': The Seeds of Creation (2004)      4.0
'Round Midnight (1986)                       3.5
'Salem's Lot (2004)                          5.0
'Til There Was You (1997)                    5.0
                                            ... 
eXistenZ (1999)                              5.0
xXx (2002)                                   4.0
xXx: State of the Union (2005)               2.5
¡Three Amigos! (1986)                        5.0
À nous la liberté (Freedom for Us) (1931)    1.0
Name: max, Length: 9719, dtype: float64

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [68]:
rmse_val = [] 
for K in range(20):
    K = K+1
    model = neighbors.KNeighborsRegressor(n_neighbors = K)

    model.fit(X_train, y_train)
    pred=model.predict(X_test)
    error = sqrt(mean_squared_error(y_test,pred)) 
    rmse_val.append(error) 
    print('RMSE value for k= ' , K , 'is:', error)

RMSE value for k=  1 is: 0.47978690400117097
RMSE value for k=  2 is: 0.45777182214752293
RMSE value for k=  3 is: 0.44373657620557144
RMSE value for k=  4 is: 0.4354098366172327
RMSE value for k=  5 is: 0.42666401426644696
RMSE value for k=  6 is: 0.42480898905667736
RMSE value for k=  7 is: 0.4192693967802916
RMSE value for k=  8 is: 0.4219095302893396
RMSE value for k=  9 is: 0.422850770328473
RMSE value for k=  10 is: 0.4210494800009299
RMSE value for k=  11 is: 0.4221453903641736
RMSE value for k=  12 is: 0.4225662584712058
RMSE value for k=  13 is: 0.4249961862724386
RMSE value for k=  14 is: 0.4260239913068984
RMSE value for k=  15 is: 0.4277644164446397
RMSE value for k=  16 is: 0.42786304822185034
RMSE value for k=  17 is: 0.4279054776599419
RMSE value for k=  18 is: 0.4303365558054658
RMSE value for k=  19 is: 0.4305363184375889
RMSE value for k=  20 is: 0.4316075112000251
