<a href="https://colab.research.google.com/github/PFedorov7/CtCI-6th-Edition/blob/master/recsys3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

from surprise import SVD, Dataset, Reader, accuracy, KNNBasic, SVDpp, NMF, SlopeOne, CoClustering, BaselineOnly, KNNWithMeans
from surprise.model_selection import cross_validate

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from lenskit.algorithms.bias import Bias
from lenskit.algorithms.basic import Fallback
from lenskit.algorithms.user_knn import UserUser
from lenskit.algorithms.item_knn import ItemItem
from lenskit.algorithms.mf_common import MFPredictor
from lenskit.algorithms.als import BiasedMF
from lenskit.algorithms.als import ImplicitMF 
from lenskit.algorithms.tf import IntegratedBiasMF
from lenskit.algorithms.svd import BiasedSVD
from lenskit.algorithms.funksvd import FunkSVD
from lenskit.algorithms.tf import BPR
from lenskit.algorithms import Recommender
from lenskit.metrics.predict import rmse
from lenskit.batch import predict
from lenskit import datasets
from lenskit import batch

In [None]:
df_links = pd.read_csv('data/links.csv')
df_movies = pd.read_csv('data/movies.csv')
df_rs_train = pd.read_csv('data/ratings_train.csv')
df_rs_test = pd.read_csv('data/ratings_test.csv')
df_tags = pd.read_csv('data/tags.csv')

### Data prepararion

In [None]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
df_tags = df_tags.rename(columns = {'userId': 'user', 'movieId': 'item'}, inplace = False)
df_movies = df_movies.rename(columns = {'movieId': 'item'}, inplace = False)

In [None]:
le = LabelEncoder()
le.fit(df_movies['title'])

scaler = StandardScaler()

In [None]:
### Train

In [None]:
df_train_tags = df_rs_train.merge(df_tags, on=['user', 'item'], how='left')
df_train_tags_titles = df_train_tags.merge(df_movies, on=['item'], how='left')
df_train = df_train_tags_titles.rename(
    columns = {'timestamp_x': 'timestamp', 
               'timestamp_y': 'timestamp_tags'
              }, inplace = False
)

df_train_encoded = df_train['genres'].str.get_dummies()
df_train = pd.concat([df_train, df_train_encoded], axis = 1)
df_train['title'] = le.transform(df_train['title'])

df_train['datetime'] = pd.to_datetime(df_train['timestamp'], unit='s')

df_train['year'] = df_train['datetime'].dt.year
df_train['month'] = df_train['datetime'].dt.month
df_train['day'] = df_train['datetime'].dt.day

df_train = df_train.drop(['genres', 'tag', 'timestamp_tags', 'datetime'], axis = 1)

In [None]:
df_train.head()

Unnamed: 0,user,item,rating,timestamp,title,(no genres listed),Action,Adventure,Animation,Children,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year,month,day
0,139,5464,2.5,1453924404,7191,0,0,0,0,0,...,0,0,0,0,0,0,0,2016,1,27
1,359,3499,3.5,1198114431,5702,0,0,0,0,0,...,0,0,0,0,1,0,0,2007,12,20
2,417,2329,5.0,1530156612,420,0,0,0,0,0,...,0,0,0,0,0,0,0,2018,6,28
3,600,1707,3.0,1237851724,4004,0,0,0,0,1,...,0,0,0,0,0,0,0,2009,3,23
4,229,509,3.0,838144451,6622,0,0,0,0,0,...,0,0,1,0,0,0,0,1996,7,23


In [None]:
### Test

In [None]:
df_test_tags = df_rs_test.merge(df_tags, on=['user', 'item'], how='left')
df_test_tags_titles = df_test_tags.merge(df_movies, on=['item'], how='left')
df_test = df_test_tags_titles.rename(
    columns = {'timestamp_x': 'timestamp', 
               'timestamp_y': 'timestamp_tags'
              }, inplace = False
)

df_test_encoded = df_test['genres'].str.get_dummies()
df_test = pd.concat([df_test, df_test_encoded], axis = 1)
df_test['title'] = le.transform(df_test['title'])

df_test['datetime'] = pd.to_datetime(df_test['timestamp'], unit='s')
df_test['year'] = df_test['datetime'].dt.year
df_test['month'] = df_test['datetime'].dt.month
df_test['day'] = df_test['datetime'].dt.day

df_test = df_test.drop(['genres', 'tag', 'timestamp_tags', 'datetime'], axis = 1)

In [None]:
df_test.head()

Unnamed: 0,user,item,rating,timestamp,title,(no genres listed),Action,Adventure,Animation,Children,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year,month,day
0,177,82167,3.0,1435837757,5277,0,0,0,0,0,...,0,0,1,0,0,0,0,2015,7,2
1,369,1500,4.5,1237081565,3661,0,0,0,0,0,...,0,0,1,0,0,0,0,2009,3,15
2,381,2140,3.5,1164876960,2164,0,0,1,0,0,...,0,0,0,0,0,0,0,2006,11,30
3,484,48780,1.5,1342229033,6782,0,0,0,0,0,...,0,1,0,1,1,0,0,2012,7,14
4,200,4022,3.5,1229886197,1608,0,0,0,0,0,...,0,0,0,0,0,0,0,2008,12,21


### Build Models

### SVD

In [None]:
reader = Reader(rating_scale=(0, 5))

data = Dataset.load_from_df(df_train[['user', 'item', 'rating']], reader)
trainset = data.construct_trainset(raw_trainset=data.raw_ratings)

data = Dataset.load_from_df(df_test[['user', 'item', 'rating']], reader)
test_set = data.construct_testset(raw_testset=data.raw_ratings)

In [None]:
#0.843736049276373 best
svd = SVDpp(
    n_epochs=36, 
    init_std_dev=0.021,
    init_mean=0.006,
    verbose=True
)
svd.fit(trainset)
predictions = svd.test(test_set)
print(accuracy.rmse(predictions))

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 20
 processing epoch 21
 processing epoch 22
 processing epoch 23
 processing epoch 24
 processing epoch 25
 processing epoch 26
 processing epoch 27
 processing epoch 28
 processing epoch 29
 processing epoch 30
 processing epoch 31
 processing epoch 32
 processing epoch 33
 processing epoch 34
 processing epoch 35
RMSE: 0.8417
0.8417448880122917


### Item-based k-NN

In [None]:
sim_options = {
    "name": "cosine",
    "min_support": 3,
    "user_based": False,
}

algo = KNNWithMeans(k=20, min_k=10, sim_options=sim_options)
algo.fit(trainset)
predictions = algo.test(test_set)
print(accuracy.rmse(predictions))

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8779
0.8778866864637181


### User-based k-NN

In [None]:
algo = UserUser(nnbrs=14, 
                min_nbrs=9, 
                min_sim=0.096131, 
                center=True, 
                aggregate='weighted-average'
               )
algo.fit(df_train[['user','item', 'rating']])
preds = predict(algo, df_test)
print(rmse(preds['prediction'], preds['rating']))

0.8022955067876776


### Tensorflow Biased MF

In [None]:
algo = BiasedMF(0)
algo =  Recommender.adapt(algo)
algo = batch.train_isolated(algo, df_train)
preds = batch.predict(algo, df_test)

In [None]:
rmse(preds['prediction'], preds['rating'])

0.9686798795383659

#### Links
1. http://surpriselib.com/
https://surprise.readthedocs.io/en/stable/matrix_factorization.html
2. https://www.kaggle.com/ibtesama/getting-started-with-a-movie-recommendation-system
3. https://www.kaggle.com/ashishpatel26/movie-recommendation-of-movie-lens-data-set
4. https://www.kaggle.com/shubhammehta21/movie-lens-small-latest-dataset/notebooks
5. https://lkpy.readthedocs.io/_/downloads/en/latest/pdf/
6. https://realpython.com/build-recommendation-engine-collaborative-filtering/
7. https://towardsdatascience.com/movie-recommendation-system-based-on-movielens-ef0df580cd0e

In [None]:
# cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5)
# df_train[df_train['user'] == 1]
# svd.predict(1, 260, 3)

ToDo
1. ADD Train test split on train
2. Import linear model(xgboost?)
3. Learn it on the predictions and validate on train_test part
4. check stacking score on test