# Imports

In [42]:
import pandas as pd
import numpy as np
from datetime import datetime
from functools import reduce
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import coo_matrix, hstack

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

# Data downloads

In [43]:
links = pd.read_csv('/Users/user/Desktop/netology/rs/ml-latest-small/links.csv')
movies = pd.read_csv('/Users/user/Desktop/netology/rs/ml-latest-small/movies.csv')
ratings = pd.read_csv('/Users/user/Desktop/netology/rs/ml-latest-small/ratings.csv')
tags = pd.read_csv('/Users/user/Desktop/netology/rs/ml-latest-small/tags.csv')

# Data transforming

In [44]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [45]:
movies_with_tags_ratings = pd.merge(movies_with_tags, ratings,  how='left', 
                                    left_on=['movieId','userId'], right_on = ['movieId','userId'])

In [46]:
movies_with_tags_ratings.drop(['timestamp_x','timestamp_y'], axis = 1, inplace = True)
movies_with_tags_ratings.dropna(inplace=True)
movies_with_tags_ratings

Unnamed: 0,movieId,title,genres,userId,tag,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,3.5
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,4.0
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,4.0
...,...,...,...,...,...,...
11818,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62.0,star wars,4.0
11840,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,anime,3.5
11841,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,comedy,3.5
11842,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,gintama,3.5


In [47]:
dataset = pd.DataFrame({
    'uid': movies_with_tags_ratings.userId,
    'iid': movies_with_tags_ratings.title,
    'rating': movies_with_tags_ratings.rating
})
dataset.head()

Unnamed: 0,uid,iid,rating
0,336.0,Toy Story (1995),4.0
1,474.0,Toy Story (1995),4.0
2,567.0,Toy Story (1995),3.5
3,62.0,Jumanji (1995),4.0
4,62.0,Jumanji (1995),4.0


In [48]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [49]:
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.5682  0.6232  0.4567  0.5484  0.5580  0.5509  0.0538  
MAE (testset)     0.2224  0.2372  0.1808  0.2088  0.2216  0.2142  0.0190  
Fit time          0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Test time         0.02    0.01    0.01    0.01    0.

{'test_rmse': array([0.5681876 , 0.62324077, 0.45674014, 0.54843656, 0.55804086]),
 'test_mae': array([0.22244758, 0.23716513, 0.18076386, 0.2087854 , 0.2216105 ]),
 'fit_time': (0.01191091537475586,
  0.007492780685424805,
  0.008120059967041016,
  0.006279945373535156,
  0.007419109344482422),
 'test_time': (0.01630115509033203,
  0.013386964797973633,
  0.013036012649536133,
  0.01224970817565918,
  0.01186990737915039)}