In [142]:
import pandas as pd
import numpy as np

from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics import mean_squared_error

import scipy.sparse as sparse

from tqdm import tqdm_notebook

from lightfm import LightFM

from implicit.als import AlternatingLeastSquares

from surprise import KNNWithMeans, Reader, Dataset, accuracy, SVD,KNNBaseline, SVDpp

## Соберу Бэггинг из всех рекомендательных алгоритмов ( а вдруг прокатит):
Будут использованы: 
    
- NearestNeighbors
- LogisticRegression
- AlternatingLeastSquares
- surprise.KNNWithMeans, 
- surprise.SVD
- surprise.KNNBaseline
- surprise.SVDpp

Разобъем выборку на трейн и валидационную. На валидационной проверим результат Бэггинга по **MSE**.  
 
Самый лучший результат одного алгоритма (SVD) = .87.  

Посмотрим сможем ли мы его обойти

In [3]:
ratings = pd.read_csv('ratings.csv')
ratings.drop(columns='timestamp', inplace=True)
ratings.head(2)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0


In [4]:
movies = pd.read_csv('movies.csv')

In [5]:
ratings_movies = ratings.join(movies.set_index('movieId'), on='movieId')

In [6]:
ratings_movies.head(2)

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,Grumpier Old Men (1995),Comedy|Romance


In [7]:
genre = []
for i in ratings_movies.genres:
    genre += i.split('|')

genre = set(genre)
count_vect = CountVectorizer()
count_vect.fit(genre)

CountVectorizer()

In [8]:
X_train, X_test, y_train, y_test = train_test_split( ratings_movies[[ 'userId','movieId','genres'] ], ratings_movies.rating, 
                                                    test_size =.3, random_state = 255 )

#### Logit

In [43]:
#Train
genres_grouped_by_user = []
rating_grouped_by_user = []

for user_id,dataset in X_train.groupby(by='userId'):
    genres_grouped_by_user.append( [i.replace('|',' ') for i in dataset.genres.values ]   )
    rating_grouped_by_user.append( y_train[dataset.index.values].values)


In [44]:
set_for_each_user = dict()

for i in range(len(genres_grouped_by_user)):
    tfidf_transformer = TfidfTransformer()
    
    X_genres_sparse = count_vect.transform(genres_grouped_by_user[i])
    
    X_genres_sparse_tfidf = tfidf_transformer.fit_transform(X_genres_sparse)

    if len( set( rating_grouped_by_user[i] ) ) == 1: # пользователь всегда ставил одну и ту же оценку 
        neigh = NearestNeighbors(n_neighbors=5, n_jobs=-1, metric='manhattan') 
        neigh.fit(X_genres_sparse_tfidf.toarray())
        algo = neigh
        algo_type = 'knn'
    else:
        # Логит 
        logit = LogisticRegression(solver = 'saga', multi_class='multinomial')
        
        logit.fit(X_genres_sparse_tfidf.toarray() ,(rating_grouped_by_user[i]*2).astype(int) )
        algo = logit
        algo_type = 'logit'

    set_for_each_user[i+1] = {'tf_idf_transformer':tfidf_transformer,
                           'algoritm': algo,
                           'algoritm_type':algo_type }


In [45]:
#Test
genres_grouped_by_user_test = []
rating_grouped_by_user_test = []

for user_id,dataset in X_test.groupby(by='userId'):
    genres_grouped_by_user_test.append( [i.replace('|',' ') for i in dataset.genres.values ]   )
    rating_grouped_by_user_test.append( y_test[dataset.index.values].values)


In [46]:
# проверим результативность только одного логита
MSE_logit = 0.

for i in range(len(genres_grouped_by_user_test)):
    if set_for_each_user[i+1]['algoritm_type'] != 'logit': continue
    
    tfidf_transformer = set_for_each_user[i+1]['tf_idf_transformer']

    X_genres_sparse = count_vect.transform(genres_grouped_by_user_test[i])

    X_genres_sparse_tfidf = tfidf_transformer.fit_transform(X_genres_sparse)
    
    logit = set_for_each_user[i+1]['algoritm']
    y_pred = logit.predict(X_genres_sparse_tfidf.toarray())
    y_pred = y_pred/2

    MSE_logit += mean_squared_error(rating_grouped_by_user_test[i],y_pred )

print('root_MSE for logit = %.2f ' %(MSE_logit**.5))

root_MSE for logit = 28.23 


In [47]:
movie_genres = dict()

for mv,gn in movies[['movieId','genres' ]].itertuples(index=False) :
    movie_genres[mv] = gn.replace('|',' ')

In [48]:
def logit_predict(uid,iid):
    if set_for_each_user[uid]['algoritm_type'] != 'logit': return 0
    
    tfidf_transformer = set_for_each_user[uid]['tf_idf_transformer']

    X_genres_sparse = count_vect.transform([movie_genres[iid]])

    X_genres_sparse_tfidf = tfidf_transformer.fit_transform(X_genres_sparse)
    
    logit = set_for_each_user[uid]['algoritm']
    y_pred = logit.predict(X_genres_sparse_tfidf.toarray())
    
    return y_pred/2

In [49]:
logit_predict(1,1)

array([5.])

#### SVD

In [33]:
df = pd.DataFrame( {'uid': X_train.userId, 'iid':X_train.movieId,'rating':y_train})

In [34]:
reader = Reader( rating_scale=(.5,5))

In [35]:
surprise_df = Dataset.load_from_df(df,reader)

In [36]:
surprise_df = surprise_df.build_full_trainset()

In [37]:
svd = SVD(n_factors=20, n_epochs=30)
svd.fit(surprise_df)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0xf23e310>

In [38]:
svd.predict(1,1)

Prediction(uid=1, iid=1, r_ui=None, est=4.771670758762139, details={'was_impossible': False})

#### ALS

In [None]:
als = AlternatingLeastSquares(factors=20, regularization=.5)

In [79]:
users = list( X_train.userId.astype(int) )
movi = list( X_train.movieId.astype(int) )

#a[row_ind[k], col_ind[k]] = data[k]
sparse_user_movie = sparse.csr_matrix( ( list(y_train) , (users , movi)  )  )
sparse_movie_user = sparse.csr_matrix( ( list(y_train) , (movi, users )  )  )

In [80]:
alpha = 10
sparse_dataset = sparse_movie_user * alpha
als.fit(sparse_dataset)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [81]:
user_vector = als.user_factors
movie_vector = als.item_factors

In [82]:
movie_vector[1].shape

(20,)

In [83]:
user_vector.shape

(611, 20)

In [84]:
movie_vector[3441]@user_vector[1]

0.85795945

In [85]:
als.rank_items(1,sparse_user_movie,[3441])

[(3441, 0.85795945)]

In [31]:
ids_scores = als.recommend(1,sparse_user_movie)
ids = [i[0] for i in ids_scores]

In [78]:
als.rank_items(1,sparse_user_movie,ids)

[(1059, 1.4945408),
 (1805, 1.4942952),
 (1918, 1.4907887),
 (1047, 1.40338),
 (1673, 1.3968397),
 (4571, 1.3612853),
 (3256, 1.301303),
 (1407, 1.2929931),
 (494, 1.2883558),
 (708, 1.2742217)]

#### PS . Похоже Единственное что умеет implicit ALS - это найти рекомендации похожих. Но если нужно предсказать оценку тут не  справляется

#### ALS surprise

In [113]:
svd_implicit = SVDpp(n_factors=20, n_epochs=30, lr_all = 0.007, reg_all = .02)

In [114]:
svd_implicit.fit(surprise_df)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x120a2490>

In [119]:
svd_implicit.predict(1,1).est

4.3600181788570636

#### KNN Surprise

In [104]:
bsl_options = {'method': 'als',
                'reg': 2,
                'reg_i':2,
                'reg_u':2,
                'n_epochs':15}

sim_options = {'name': 'pearson_baseline',
                'user_based': False}

In [109]:
knn = KNNBaseline(k=30 , min_k=1 , sim_options=sim_options, bsl_options=bsl_options )

In [110]:
knn.fit(surprise_df)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x164a0ac0>

In [111]:
knn.predict(1,1).est

4.227795098062775

#### СОберем Беггинг из Алгоритмов

In [139]:
def bag_predict(uid,iid):
    results = np.array([])
    results = np.append(results, round(svd.predict(uid,iid).est,0) )
    results = np.append(results, logit_predict(uid,iid) )
    results = np.append(results, round(knn.predict(uid,iid).est,0) )
    results = np.append(results, round(svd_implicit.predict(uid,iid).est,0) )
    scores, counts = np.unique(results,return_counts=True)
    return scores[np.argsort(-counts)][0]

#### Предскажем на тестовой выборке и посчитаем MSE

In [137]:
test_df = X_test.join(y_test).drop(columns='genres')
test_df.head(2)

Unnamed: 0,userId,movieId,rating
9622,64,1968,3.5
62388,414,210,3.0


In [143]:
y_pred = []
for user,movie in tqdm_notebook (test_df[['userId','movieId']].itertuples(index=False)):
    y_pred.append(bag_predict(user,movie))

test_df['rating_pred_bagging'] = y_pred

test_df.head(2)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for user,movie in tqdm_notebook (test_df[['userId','movieId']].itertuples(index=False)):


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




Unnamed: 0,userId,movieId,rating,rating_pred_bagging
9622,64,1968,3.5,4.0
62388,414,210,3.0,3.0


In [145]:
mean_squared_error(test_df.rating , test_df.rating_pred_bagging)**.5

0.8163366500280983

### Вывод:

- Самый лучший результат на 100Кб датасете: SVD выдал rootMSE = **.87**
- Если собрать несколько алгоритмов в Бэггинг, то результат rootMSE = **.81**


#### Собрем рекомендательный алгоритм на основе Бэггинга:

In [162]:
def get_recomendation(user, k =5):
    
    recommendation = []

    for movie in tqdm_notebook( movies.movieId.unique() ):
        y_pred = bag_predict(user,movie)
        if y_pred == 5: recommendation.append(movie)
        if len(recommendation) == k: break

    titles = [movies[movies.movieId==i].title.iloc[0] for i in recommendation]
    return recommendation,titles

In [164]:
get_recomendation(10,5)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for movie in tqdm_notebook( movies.movieId.unique() ):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9742.0), HTML(value='')))




([136020, 140110, 140725],
 ['Spectre (2015)', 'The Intern (2015)', 'Cop Car (2015)'])