<a href="https://colab.research.google.com/github/StetskoSergey/pida5_stetsko/blob/master/aml6%20dz%202.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
from datetime import datetime

from tqdm import tqdm_notebook

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score

%matplotlib inline

In [0]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

### делаем Построить рекомендации (регрессия, предсказываем оценку) на фичах:
TF-IDF на тегах и жанрах
Средние оценки (+ median, variance, etc.) пользователя и фильма
Оценить RMSE на тестовой выборке ###

In [0]:
movies_with_tags = movies[['movieId','genres']].join(tags[['movieId','tag']].set_index('movieId'), on='movieId').drop_duplicates()

In [13]:
movies_with_tags.head()

Unnamed: 0,movieId,genres,tag
0,1,Adventure|Animation|Children|Comedy|Fantasy,pixar
0,1,Adventure|Animation|Children|Comedy|Fantasy,fun
1,2,Adventure|Children|Fantasy,fantasy
1,2,Adventure|Children|Fantasy,magic board game
1,2,Adventure|Children|Fantasy,Robin Williams


In [6]:
movies_with_tags.shape

(11749, 3)

In [0]:
# формируем tfidf по жанрам
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

movie_genres = [change_string(g) for g in movies_with_tags.genres.values]

tfidf_vectorizer_genre = TfidfVectorizer()
X_train_tfidf_genre = tfidf_vectorizer_genre.fit_transform(movie_genres)    

In [0]:
# Формируем tfidf по тэгам
def change_string_tag(s):
    return str(s).replace(' ', '').replace('-', '')

movie_tags = [change_string_tag(g) for g in movies_with_tags.tag.values]

tfidf_vectorizer_tag = TfidfVectorizer()
X_train_tfidf_tag = tfidf_vectorizer_tag.fit_transform(movie_tags)    

In [0]:
# Все сливаем в одну матрицу
genre = X_train_tfidf_genre.toarray()
tag = X_train_tfidf_tag.toarray()
for x in range(genre.shape[1]):
    col_name = 'g{}'.format(x)
    movies_with_tags[col_name] = pd.Series(genre[:, x])
for x in range(tag.shape[1]):
    col_name = 'g{}'.format(x)
    movies_with_tags[col_name] = pd.Series(tag[:, x])

In [8]:
# Добавляем оценки
movies_with_tags = movies_with_tags.merge(ratings, left_on='movieId', right_on='movieId')
# Удаляем не нужные столцы
to_train = movies_with_tags.drop(['movieId','genres','tag','userId', 'timestamp'], axis=1)
to_train.dropna(inplace = True)
to_train.head()

Unnamed: 0,g0,g1,g2,g3,g4,g5,g6,g7,g8,g9,g10,g11,g12,g13,g14,g15,g16,g17,g18,g19,g20,g21,g22,g23,g24,g25,g26,g27,g28,g29,g30,g31,g32,g33,g34,g35,g36,g37,g38,g39,...,g1434,g1435,g1436,g1437,g1438,g1439,g1440,g1441,g1442,g1443,g1444,g1445,g1446,g1447,g1448,g1449,g1450,g1451,g1452,g1453,g1454,g1455,g1456,g1457,g1458,g1459,g1460,g1461,g1462,g1463,g1464,g1465,g1466,g1467,g1468,g1469,g1470,g1471,g1472,rating
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5


Предсказываем рейтинг по фичам фильмов

In [0]:
train, test = train_test_split(to_train, test_size =0.2)
x_train = train.drop(['rating'], axis = 1)
y_train = train['rating']
x_test = test.drop(['rating'], axis = 1)
y_test = test['rating']

In [10]:
algo = LinearRegression().fit(x_train, y_train)
algo.predict(x_test)

array([3.88901927, 3.88901927, 3.88901927, ..., 2.98608496, 3.88901927,
       3.36426212])

In [11]:
algo.score(x_test,y_test)

-4.39791515621278e+16

Средние оценки (+ median, variance, etc.) пользователя и фильма

In [25]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [27]:
# Средние оценки Пользователей
ratings.groupby('userId').rating.mean()

userId
1      4.366379
2      3.948276
3      2.435897
4      3.555556
5      3.636364
         ...   
606    3.657399
607    3.786096
608    3.134176
609    3.270270
610    3.688556
Name: rating, Length: 610, dtype: float64

In [28]:
# Средние оценки Фильмов
ratings.groupby('movieId').rating.mean()

movieId
1         3.920930
2         3.431818
3         3.259615
4         2.357143
5         3.071429
            ...   
193581    4.000000
193583    3.500000
193585    3.500000
193587    3.500000
193609    4.000000
Name: rating, Length: 9724, dtype: float64

In [29]:
# + median
ratings.groupby('movieId').rating.median()

movieId
1         4.0
2         3.5
3         3.0
4         3.0
5         3.0
         ... 
193581    4.0
193583    3.5
193585    3.5
193587    3.5
193609    4.0
Name: rating, Length: 9724, dtype: float64

In [32]:
# variance
ratings.groupby('userId').rating.var()

userId
1      0.640077
2      0.649015
3      4.370783
4      1.727132
5      0.980973
         ...   
606    0.524351
607    0.932494
608    1.164807
609    0.202703
610    0.735173
Name: rating, Length: 610, dtype: float64

Оценить RMSE на тестовой выборке

In [0]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy

In [14]:
reader = Reader(rating_scale=(0.5, 5))
dataset = Dataset.load_from_df(ratings[['movieId','userId','rating']], reader)
trainset, testset = train_test_split(dataset, test_size=0.2)
algo = KNNBasic(k=40, sim_options={'name': 'cosine', 'user_based': True})
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9844


0.9844436518137036