# Imports

In [30]:
import pandas as pd
import numpy as np
from datetime import datetime
from functools import reduce
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import coo_matrix, hstack

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Data downloads

In [2]:
links = pd.read_csv('/Users/user/Desktop/netology/rs/ml-latest-small/links.csv')
movies = pd.read_csv('/Users/user/Desktop/netology/rs/ml-latest-small/movies.csv')
ratings = pd.read_csv('/Users/user/Desktop/netology/rs/ml-latest-small/ratings.csv')
tags = pd.read_csv('/Users/user/Desktop/netology/rs/ml-latest-small/tags.csv')

In [3]:
# movies.head()
# ratings.head()
# links.head()
# tags.head()

# Data understanding

In [4]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [5]:
movie_genres = [change_string(g) for g in movies.genres.values]
movie_genres[:5]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy']

In [6]:
all_genres = reduce(lambda x, y: x + y, [i.split(' ') for i in movie_genres])
all_uniqe_genres = {x:all_genres.count(x) for x in all_tags}

NameError: name 'all_tags' is not defined

In [None]:
all_uniqe_genres

In [None]:
sorted(all_uniqe_genres.items(), key=lambda x: x[1])

In [7]:
tags['tag'].value_counts()

In Netflix queue     131
atmospheric           36
superhero             24
thought-provoking     24
surreal               23
                    ... 
Anthony Hopkins        1
Homeless               1
Backwards. memory      1
Western                1
pulp                   1
Name: tag, Length: 1589, dtype: int64

In [8]:
tags['tag'].apply(lambda x: x.lower()).value_counts()

in netflix queue     131
atmospheric           41
superhero             24
surreal               24
funny                 24
                    ... 
mountain climbing      1
freedom                1
way too long           1
e. m. forster          1
istanbul               1
Name: tag, Length: 1475, dtype: int64

# Data transforming

In [9]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [10]:
movies_with_tags_ratings = pd.merge(movies_with_tags, ratings,  how='left', 
                                    left_on=['movieId','userId'], right_on = ['movieId','userId'])


In [11]:
movies_with_tags_ratings.drop(['timestamp_x','timestamp_y'], axis = 1, inplace = True)
movies_with_tags_ratings.dropna(inplace=True)
movies_with_tags_ratings

Unnamed: 0,movieId,title,genres,userId,tag,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,3.5
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,4.0
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,4.0
...,...,...,...,...,...,...
11818,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62.0,star wars,4.0
11840,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,anime,3.5
11841,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,comedy,3.5
11842,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,gintama,3.5


# Feature engineering

In [12]:
df = movies_with_tags_ratings[['rating','userId']].groupby('userId').describe().reset_index(col_level=1)
df.columns = ['userId', 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
df.drop(['25%', '50%', '75%','std'], axis = 1, inplace = True)

In [13]:
df = pd.merge(movies_with_tags_ratings, df,  how='left', 
                                    left_on=['userId'], right_on = ['userId'])

df

Unnamed: 0,movieId,title,genres,userId,tag,rating,count,mean,min,max
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,4.0,9.0,3.777778,2.5,4.5
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,4.0,1414.0,3.701909,0.5,5.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,3.5,432.0,3.917824,0.5,5.0
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,4.0,370.0,3.937838,1.0,5.0
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,4.0,370.0,3.937838,1.0,5.0
...,...,...,...,...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62.0,star wars,4.0,370.0,3.937838,1.0,5.0
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,anime,3.5,35.0,3.985714,3.5,5.0
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,comedy,3.5,35.0,3.985714,3.5,5.0
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,gintama,3.5,35.0,3.985714,3.5,5.0


In [14]:
df_with_metrics = df[['title','userId','rating','count','mean', 'min', 'max']].drop_duplicates().\
    reset_index()
df_with_metrics.drop('index', axis = 1, inplace = True)
df_with_metrics

Unnamed: 0,title,userId,rating,count,mean,min,max
0,Toy Story (1995),336.0,4.0,9.0,3.777778,2.5,4.5
1,Toy Story (1995),474.0,4.0,1414.0,3.701909,0.5,5.0
2,Toy Story (1995),567.0,3.5,432.0,3.917824,0.5,5.0
3,Jumanji (1995),62.0,4.0,370.0,3.937838,1.0,5.0
4,Jumanji (1995),474.0,3.0,1414.0,3.701909,0.5,5.0
...,...,...,...,...,...,...,...
1630,Game Night (2018),62.0,4.0,370.0,3.937838,1.0,5.0
1631,Tomb Raider (2018),62.0,3.5,370.0,3.937838,1.0,5.0
1632,Deadpool 2 (2018),62.0,4.0,370.0,3.937838,1.0,5.0
1633,Solo: A Star Wars Story (2018),62.0,4.0,370.0,3.937838,1.0,5.0


In [15]:
tag_strings = []
movies = []
genres_strings = []


for movie, group in tqdm(df.groupby(['title','userId'])):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '').lower() for s in group.tag.values]))
    genres_strings.append(' '.join([str(s).replace(' ', '').replace('-', '').replace('|', ' ').lower() for s in group.genres.values]))
    movies.append(movie)

HBox(children=(IntProgress(value=0, max=1635), HTML(value='')))




In [16]:
tag_strings[:5]

['artistic funny humorous inspiring intelligent quirky romance zooeydeschanel',
 'lawyers',
 'creepy suspense',
 'shakespearesortof',
 'dogs remake']

# Modeling on tag_strings, genres_strings and num_metrics

In [25]:
vect_word = TfidfVectorizer(max_features=25, lowercase=True, analyzer='word',
                        stop_words= 'english',ngram_range=(1,3),dtype=np.float32)

tr_vect = vect_word.fit_transform(tag_strings,genres_strings)
tr_vect


<1635x25 sparse matrix of type '<class 'numpy.float32'>'
	with 540 stored elements in Compressed Sparse Row format>

In [26]:
res_df = pd.DataFrame.sparse.from_spmatrix(tr_vect)
res_df = pd.concat([res_df, df_with_metrics[['rating','count','mean', 'min', 'max']]], axis = 1)
res_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,rating,count,mean,min,max
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.701440,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,9.0,3.777778,2.5,4.5
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,1414.0,3.701909,0.5,5.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,3.5,432.0,3.917824,0.5,5.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,370.0,3.937838,1.0,5.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,1414.0,3.701909,0.5,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1630,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,370.0,3.937838,1.0,5.0
1631,0.0,0.0,0.0,0.0,0.0,0.718709,0.0,0.695311,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.5,370.0,3.937838,1.0,5.0
1632,0.0,0.0,1.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,370.0,3.937838,1.0,5.0
1633,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,370.0,3.937838,1.0,5.0


In [27]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(res_df.drop(columns = ['rating']),  
                                                    res_df[['rating']], 
                                                    test_size=0.15, random_state = 10)

In [31]:
lr = Lasso(alpha=10, random_state=0)
lr.fit(X_train, y_train)

rf = RandomForestRegressor(random_state=0)
rf.fit(X_train, y_train)

print(mean_squared_error(y_test, lr.predict(X_test)))
print(mean_squared_error(y_test, rf.predict(X_test)))


0.5998246677732608
0.5108562171315599


# Modeling on genres_strings and num_metrics

In [32]:
vect_word = TfidfVectorizer(max_features=2000, lowercase=True, analyzer='word',
                        stop_words= 'english',ngram_range=(1,3),dtype=np.float32)



tr_vect = vect_word.fit_transform(df['genres'])
tr_vect


<3476x358 sparse matrix of type '<class 'numpy.float32'>'
	with 20648 stored elements in Compressed Sparse Row format>

In [33]:
res_df = pd.DataFrame.sparse.from_spmatrix(tr_vect)
res_df = pd.concat([res_df, df[['rating','count','mean', 'min', 'max']]], axis = 1)
res_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,353,354,355,356,357,rating,count,mean,min,max
0,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,9.0,3.777778,2.5,4.5
1,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,1414.0,3.701909,0.5,5.0
2,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.5,432.0,3.917824,0.5,5.0
3,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,370.0,3.937838,1.0,5.0
4,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,370.0,3.937838,1.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3471,0.135768,0.187199,0.0,0.38036,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,370.0,3.937838,1.0,5.0
3472,0.149057,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.5,35.0,3.985714,3.5,5.0
3473,0.149057,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.5,35.0,3.985714,3.5,5.0
3474,0.149057,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.5,35.0,3.985714,3.5,5.0


In [34]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(res_df.drop(columns = ['rating']),  
                                                    res_df[['rating']], 
                                                    test_size=0.2, random_state = 10)

In [35]:
lr = Lasso(alpha=10, random_state=0)
lr.fit(X_train, y_train)

rf = RandomForestRegressor(random_state=0)
rf.fit(X_train, y_train)

print(mean_squared_error(y_test, lr.predict(X_test)))
print(mean_squared_error(y_test, rf.predict(X_test)))

0.607483826791404
0.30691289495050394
