In [262]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [263]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [264]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [265]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [266]:
tags.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [267]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


In [268]:
movies_with_tags['movieId_mean'] = [ratings.loc[ratings['movieId']==tweet, 'rating'].mean() for tweet in movies_with_tags.movieId]

In [269]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,movieId_mean
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0,3.92093
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0,3.92093
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0,3.92093
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0,3.431818
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0,3.431818


In [270]:
movies_with_tags['userId_mean'] = [ratings.loc[ratings['userId']==tweet, 'rating'].mean() for tweet in movies_with_tags.userId]

In [271]:
movies_with_tags.head(5)

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,movieId_mean,userId_mean
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0,3.92093,4.321429
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0,3.92093,3.398956
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0,3.92093,2.245455
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0,3.431818,4.081967
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0,3.431818,4.081967


In [272]:
del ratings['timestamp']

In [273]:
movies_with_tags = movies_with_tags.join(ratings.set_index(['movieId', 'userId']), on=['movieId', 'userId'])
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,movieId_mean,userId_mean,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0,3.92093,4.321429,4.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0,3.92093,3.398956,4.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0,3.92093,2.245455,3.5
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0,3.431818,4.081967,4.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0,3.431818,4.081967,4.0


In [274]:
movies_with_tags.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11853 entries, 0 to 9741
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movieId       11853 non-null  int64  
 1   title         11853 non-null  object 
 2   genres        11853 non-null  object 
 3   userId        3683 non-null   float64
 4   tag           3683 non-null   object 
 5   timestamp     3683 non-null   float64
 6   movieId_mean  11832 non-null  float64
 7   userId_mean   3683 non-null   float64
 8   rating        3476 non-null   float64
dtypes: float64(5), int64(1), object(3)
memory usage: 926.0+ KB


In [275]:
movies_with_tags = movies_with_tags[pd.isnull(movies_with_tags['rating']) == 0 ]

In [276]:
movies_with_tags.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3476 entries, 0 to 9732
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movieId       3476 non-null   int64  
 1   title         3476 non-null   object 
 2   genres        3476 non-null   object 
 3   userId        3476 non-null   float64
 4   tag           3476 non-null   object 
 5   timestamp     3476 non-null   float64
 6   movieId_mean  3476 non-null   float64
 7   userId_mean   3476 non-null   float64
 8   rating        3476 non-null   float64
dtypes: float64(5), int64(1), object(3)
memory usage: 271.6+ KB


In [277]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [278]:
movie_genres = [change_string(g) for g in movies_with_tags.genres.values]

In [279]:
movie_genres[:5]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Adventure Children Fantasy']

In [280]:
tfidf = TfidfVectorizer()
movie_genres_tfidf = tfidf.fit_transform(movie_genres)

In [281]:
movie_genres_tfidf

<3476x20 sparse matrix of type '<class 'numpy.float64'>'
	with 9463 stored elements in Compressed Sparse Row format>

In [282]:
def change_string(s):
    return str(s).replace(' ', ' ').replace('-', '').lower()

In [283]:
tags_tag = [change_string(g) for g in movies_with_tags.tag.values]

In [284]:
tags_tag[:5]

['pixar', 'pixar', 'fun', 'fantasy', 'magic board game']

In [285]:
tfidf_1 = TfidfVectorizer()
tags_tag_tfidf = tfidf_1.fit_transform(tags_tag)

In [286]:
tags_tag_tfidf

<3476x1712 sparse matrix of type '<class 'numpy.float64'>'
	with 5067 stored elements in Compressed Sparse Row format>

In [287]:
df_tfidf = pd.DataFrame(data = movie_genres_tfidf.toarray(),columns = tfidf.get_feature_names_out())

In [288]:
df_tfidf

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,0.000000,0.379419,0.504921,0.536492,0.292207,0.0,0.0,0.0,0.477376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.000000,0.379419,0.504921,0.536492,0.292207,0.0,0.0,0.0,0.477376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.000000,0.379419,0.504921,0.536492,0.292207,0.0,0.0,0.0,0.477376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.000000,0.467149,0.000000,0.660541,0.000000,0.0,0.0,0.0,0.587756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.000000,0.467149,0.000000,0.660541,0.000000,0.0,0.0,0.0,0.587756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3471,0.410036,0.460000,0.000000,0.650432,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.444081,0.0,0.0,0.0
3472,0.440726,0.000000,0.657975,0.000000,0.380782,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.477320,0.0,0.0,0.0
3473,0.440726,0.000000,0.657975,0.000000,0.380782,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.477320,0.0,0.0,0.0
3474,0.440726,0.000000,0.657975,0.000000,0.380782,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.477320,0.0,0.0,0.0


In [289]:
df_tfidf_1 = pd.DataFrame(data = tags_tag_tfidf.toarray(),columns = tfidf_1.get_feature_names_out())

In [290]:
df_tfidf_1

Unnamed: 0,06,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,250,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [291]:
del movies_with_tags['title']

In [292]:
del movies_with_tags['genres']

In [293]:
del movies_with_tags['tag']

In [294]:
del movies_with_tags['timestamp']

In [295]:
movies_with_tags = movies_with_tags.reset_index()

In [296]:
movies_with_tags

Unnamed: 0,index,movieId,userId,movieId_mean,userId_mean,rating
0,0,1,336.0,3.920930,4.321429,4.0
1,0,1,474.0,3.920930,3.398956,4.0
2,0,1,567.0,3.920930,2.245455,3.5
3,1,2,62.0,3.431818,4.081967,4.0
4,1,2,62.0,3.431818,4.081967,4.0
...,...,...,...,...,...,...
3471,9710,187595,62.0,3.900000,4.081967,4.0
3472,9732,193565,184.0,3.500000,3.705224,3.5
3473,9732,193565,184.0,3.500000,3.705224,3.5
3474,9732,193565,184.0,3.500000,3.705224,3.5


In [297]:
del movies_with_tags['index']

In [298]:
result = pd.concat([movies_with_tags, df_tfidf, df_tfidf_1], axis=1, join='inner')

In [299]:
result

Unnamed: 0,movieId,userId,movieId_mean,userId_mean,rating,action,adventure,animation,children,comedy,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,1,336.0,3.920930,4.321429,4.0,0.000000,0.379419,0.504921,0.536492,0.292207,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,474.0,3.920930,3.398956,4.0,0.000000,0.379419,0.504921,0.536492,0.292207,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,567.0,3.920930,2.245455,3.5,0.000000,0.379419,0.504921,0.536492,0.292207,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,62.0,3.431818,4.081967,4.0,0.000000,0.467149,0.000000,0.660541,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,62.0,3.431818,4.081967,4.0,0.000000,0.467149,0.000000,0.660541,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3471,187595,62.0,3.900000,4.081967,4.0,0.410036,0.460000,0.000000,0.650432,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3472,193565,184.0,3.500000,3.705224,3.5,0.440726,0.000000,0.657975,0.000000,0.380782,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3473,193565,184.0,3.500000,3.705224,3.5,0.440726,0.000000,0.657975,0.000000,0.380782,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3474,193565,184.0,3.500000,3.705224,3.5,0.440726,0.000000,0.657975,0.000000,0.380782,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [300]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression # линейная регрессия

In [302]:
Y=result['rating']
Y

0       4.0
1       4.0
2       3.5
3       4.0
4       4.0
       ... 
3471    4.0
3472    3.5
3473    3.5
3474    3.5
3475    3.5
Name: rating, Length: 3476, dtype: float64

In [303]:
del result['rating']

In [304]:
X_train, X_test, y_train, y_test = train_test_split(result, Y, test_size=0.3, random_state=42)

In [305]:
from sklearn.metrics import mean_squared_error

In [307]:
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)

LinearRegression()

In [309]:
y_train_predict = lin_model.predict(X_train)
np.sqrt(mean_squared_error(y_train, y_train_predict))

0.4739304492899495

In [310]:
y_test_predict = lin_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_test_predict))

0.7812059668655607