# Library import

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import seaborn as sns
import os

%matplotlib inline

from sklearn.ensemble import RandomForestRegressor
import sklearn
from sklearn.metrics import mean_squared_error

# IDK why I used this library that time 
import surprise
from surprise import Dataset, Reader
from surprise import evaluate, accuracy
from surprise import SVD, NMF, SlopeOne, KNNBasic
from surprise import model_selection # import cross_validate, KFold

# Data loading and previewing

In [2]:
data_path = "D:\Data\ml-20m"

## Ratings.csv

In [3]:
# load ratings
ratings = pd.read_csv(os.path.join(data_path, "ratings.csv"))

print('ratings shape:', ratings.shape)
ratings.head()

ratings shape: (20000263, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [4]:
print('unique users:', ratings['userId'].unique().shape[0])

unique users: 138493


In [5]:
print('unique movies:', ratings['movieId'].unique().shape[0])

unique movies: 26744


In [6]:
# check if there are missing values and na's
display('na in ratings\n', ratings.isna().sum())
display('\nnull values in ratings\n', ratings.isnull().sum())

'na in ratings\n'

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

'\nnull values in ratings\n'

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [7]:
# users who watched the most movies
ratings["userId"].value_counts().head(10)

118205    9254
8405      7515
82418     5646
121535    5520
125794    5491
74142     5447
34576     5356
131904    5330
83090     5169
59477     4988
Name: userId, dtype: int64

In [8]:
# users who watched the least number of movies
ratings["userId"].value_counts().tail(10)

138322    20
89146     20
92146     20
105507    20
18290     20
59390     20
23558     20
34668     20
80291     20
58028     20
Name: userId, dtype: int64

As declared, there are only users with at least 20 reviews.

## Movies.csv

In [9]:
# load movies
movies = pd.read_csv(os.path.join(data_path, "movies.csv"))

print('movies shape:', movies.shape)
movies.head()

movies shape: (27278, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


As we see from the number of movies in the movies table, not all movies are presented in the ratings table (probably because of filtering)

In [10]:
# check if there are missing values and na's
display('na in movies\n', movies.isna().sum())
display('\nnull values in movies\n', movies.isnull().sum())

'na in movies\n'

movieId    0
title      0
genres     0
dtype: int64

'\nnull values in movies\n'

movieId    0
title      0
genres     0
dtype: int64

## EDA

In [11]:
%%time

ratings_titles = pd.merge(ratings, movies, on='movieId')
ratings_titles.head(3)

Wall time: 4.52 s


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,1112486027,Jumanji (1995),Adventure|Children|Fantasy
1,5,2,3.0,851527569,Jumanji (1995),Adventure|Children|Fantasy
2,13,2,3.0,849082742,Jumanji (1995),Adventure|Children|Fantasy


In [12]:
ratings_titles.title.value_counts()[:25]

Pulp Fiction (1994)                                                               67310
Forrest Gump (1994)                                                               66172
Shawshank Redemption, The (1994)                                                  63366
Silence of the Lambs, The (1991)                                                  63299
Jurassic Park (1993)                                                              59715
Star Wars: Episode IV - A New Hope (1977)                                         54502
Braveheart (1995)                                                                 53769
Terminator 2: Judgment Day (1991)                                                 52244
Matrix, The (1999)                                                                51334
Schindler's List (1993)                                                           50054
Toy Story (1995)                                                                  49695
Fugitive, The (1993)            

In [13]:
# top-15 movies with the highest average ratings
# in case if film has at least 1000 ratings
movie_stats = ratings_titles.groupby('title').agg({'rating': [np.size, np.mean]})
atleast_1000 = movie_stats['rating']['size'] >= 1000
movie_stats[atleast_1000].sort_values([('rating', 'mean')], ascending=False)[:15]

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"Shawshank Redemption, The (1994)",63366.0,4.44699
"Godfather, The (1972)",41355.0,4.364732
"Usual Suspects, The (1995)",47006.0,4.334372
Schindler's List (1993),50054.0,4.310175
"Godfather: Part II, The (1974)",27398.0,4.275641
Seven Samurai (Shichinin no samurai) (1954),11611.0,4.27418
Rear Window (1954),17449.0,4.271334
Band of Brothers (2001),4305.0,4.263182
Casablanca (1942),24349.0,4.258327
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),6525.0,4.256935


If the coldstart problem will exist, we could recommend users to watch the most popular movies 

## Preprocessing

In [26]:
# sort ratings by time to make smt like TimeSeriesSplit
sorted_ratings = ratings.sort_values(by='timestamp')

train_ratings, test_ratings = sklearn.model_selection.train_test_split(sorted_ratings, 
                                                                       test_size=0.2, 
                                                                       shuffle=False)

train_dataset = Dataset.load_from_df(
    train_ratings[['userId', 'movieId', 'rating']],
    reader=Reader(line_format='user item rating'))

test_dataset = Dataset.load_from_df(
    test_ratings[['userId', 'movieId', 'rating']],
    reader=Reader(line_format='user item rating'))

In [27]:
# raw_ratings = sorted_ratings_data.raw_ratings



# # sorted_ratings_data.raw_ratings = train_raw_ratings  # data is now the set A

## Evaluation

In [28]:
def algo_evaluating(algorithm, train_dataset, test_dataset, n_splits=5):
    kf = model_selection.KFold(n_splits=n_splits, shuffle=False)

#     rmse_list = []
#     mae_list = []
#     for trainset, testset in tqdm(kf.split(train_dataset), total=n_splits):
#         algorithm.fit(trainset)
#         predictions = algorithm.test(testset)
#         rmse_list.append(accuracy.rmse(predictions, verbose=True))
#         mae_list.append(accuracy.mae(predictions, verbose=True))

#     print("\nRMSE mean:{:0.3f}\n      std:{:0.3f}".format(
#         np.mean(rmse_list), np.std(rmse_list)))
#     print("\nMAE mean:{:0.3f}\n      std:{:0.3f}".format(
#         np.mean(mae_list), np.std(mae_list)))

    model_selection.cross_validate(algorithm, 
                                   train_dataset,
                                   measures=['RMSE', 'MAE'], 
                                   cv=kf, 
                                   verbose=True,
                                   n_jobs=-1)

    # Compute unbiased accuracy on test
#     testset = train_dataset.construct_testset(test_dataset) 
#     predictions = algorithm.test(testset)
    predictions = algorithm.test(test_dataset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae

# Collaborative filtering 

## Model based approach

### SVD

In [None]:
%%time
algo = SVD(verbose=True)
SVD_rmse, SVD_mae = algo_evaluating(algo,
                                    train_dataset,
                                    test_dataset)

### NMF

In [0]:
%%time
algo = NMF()
NMF_rmse, NMF_mae = algo_evaluating(algo)

1 -fold
RMSE: 1.0229
MAE:  0.8517
2 -fold
RMSE: 1.0790
MAE:  0.8932
3 -fold
RMSE: 1.0144
MAE:  0.8062
4 -fold
RMSE: 0.9793
MAE:  0.7453
5 -fold
RMSE: 1.0059
MAE:  0.7722

RMSE mean:1.020
      std:0.033

MAE mean:0.814
      std:0.053

Unbiased scores:
RMSE: 1.0158
MAE:  0.7865
Wall time: 1h 8min 51s


### SlopeOne

In [0]:
%%time
algo = SlopeOne()
Slope_rmse, Slope_mae = algo_evaluating(algo)

1 -fold
RMSE: 1.0216
MAE:  0.8503
2 -fold
RMSE: 1.0747
MAE:  0.8887
3 -fold
RMSE: 1.0099
MAE:  0.8011
4 -fold
RMSE: 0.9757
MAE:  0.7407
5 -fold
RMSE: 1.0049
MAE:  0.7713

RMSE mean:1.017
      std:0.032

MAE mean:0.810
      std:0.053

Unbiased scores:
RMSE: 1.0156
MAE:  0.7863
Wall time: 49min 40s


In [0]:
model_based_results = pd.DataFrame({
    'Slope One': [Slope_rmse, Slope_rmse],
    'NMF': [NMF_rmse, NMF_mae],
    'SVD': [SVD_rmse, SVD_mae]
})
model_based_results.rename({0: "RMSE"}, axis='index', inplace=True)
model_based_results.rename({1: "MAE"}, axis='index', inplace=True)

os.chdir(HW_path)
model_based_results.to_csv("model_based_results", sep='\t')

In [0]:
#os.chdir(HW_path)
#model_based_results = pd.read_csv('model_based_results', sep='\t', index_col=0)
model_based_results

Unnamed: 0,Slope One,NMF,SVD
RMSE,1.015577,1.015824,0.951603
MAE,1.015577,0.78655,0.731323


## Memory based approach

### Item-based

In [0]:
%%time
sim_options = {
    'name': 'cosine',
    'user_based': False
}
algo = KNNBasic(k=50, sim_options=sim_options, verbose=False)
Item_KNN_rmse, Item_KNN_mae = algo_evaluating(algo)

1 -fold
RMSE: 1.0338
MAE:  0.8593
2 -fold
RMSE: 1.1074
MAE:  0.9153
3 -fold
RMSE: 1.0565
MAE:  0.8373
4 -fold
RMSE: 1.0186
MAE:  0.7749
5 -fold
RMSE: 1.0154
MAE:  0.7803

RMSE mean:1.046
      std:0.034

MAE mean:0.833
      std:0.052

Unbiased scores:
RMSE: 1.0179
MAE:  0.7882
Wall time: 1h 39min 22s


In [0]:
model_based_results = model_based_results.assign(Item_KNN = [Item_KNN_rmse, Item_KNN_mae])

os.chdir(HW_path)
model_based_results.to_csv("model_based_results", sep='\t')

In [0]:
#os.chdir(HW_path)
#model_based_results = pd.read_csv('model_based_results', sep='\t', index_col=0)
model_based_results

Unnamed: 0,Slope One,NMF,SVD,Item_KNN
RMSE,1.015577,1.015824,0.951603,1.017933
MAE,1.015577,0.78655,0.731323,0.788244


### User-based

Due to problems with memory we had to leave only 5% of dataset

In [0]:
%%time
sorted_ratings = sorted_ratings[:1000000]  # small subsample for testing
reader = Reader()
sorted_ratings_data = Dataset.load_from_df(
    sorted_ratings[['userId', 'movieId', 'rating']], reader)
raw_ratings = sorted_ratings_data.raw_ratings
# A = 80% of the data, B = 20% of the data
threshold = int(0.8 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]
sorted_ratings_data.raw_ratings = train_raw_ratings  # data is now the set A

sim_options = {'name': 'cosine', 'user_based': True}
algo = KNNBasic(k=50, sim_options=sim_options, verbose=False)
User_KNN_rmse, User_KNN_mae = algo_evaluating(algo)

1 -fold
RMSE: 1.0219
MAE:  0.8517
2 -fold
RMSE: 0.9974
MAE:  0.8221
3 -fold
RMSE: 0.9897
MAE:  0.8172
4 -fold
RMSE: 0.9788
MAE:  0.8112
5 -fold
RMSE: 0.9799
MAE:  0.8169

RMSE mean:0.994
      std:0.016

MAE mean:0.824
      std:0.014

Unbiased scores:
RMSE: 0.9791
MAE:  0.8158
Wall time: 34min 18s


In [0]:
model_based_results = model_based_results.assign(User_KNN = [User_KNN_rmse, User_KNN_mae])

os.chdir(HW_path)
model_based_results.to_csv("model_based_results", sep='\t')

In [0]:
#os.chdir(HW_path)
#model_based_results = pd.read_csv('model_based_results', sep='\t', index_col=0)
model_based_results

Unnamed: 0,Slope One,NMF,SVD,Item_KNN,User_KNN
RMSE,1.015577,1.015824,0.951603,1.017933,0.979101
MAE,1.015577,0.78655,0.731323,0.788244,0.815786


# Content-based recommender

## Preprocessing

In [0]:
def explode(df, lst_cols, fill_value=''):
    # make sure `lst_cols` is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()

    if (lens > 0).all():
        # ALL lists in cells aren't empty
        return pd.DataFrame({
            col:np.repeat(df[col].values, df[lst_cols[0]].str.len())
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .loc[:, df.columns]
    else:
        # at least one list in cells is empty
        return pd.DataFrame({
            col:np.repeat(df[col].values, df[lst_cols[0]].str.len())
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .append(df.loc[lens==0, idx_cols]).fillna(fill_value) \
          .loc[:, df.columns]

In [0]:
# split genres
movies_genres = movies
movies_genres.genres = movies_genres.genres.str.split('|')
movies_genres = explode(movies_genres, ['genres'])
movies_genres.head(8)

# drop movie's title
movies_genres = movies_genres.drop('title', axis=1)
movies_genres.head()

Unnamed: 0,movieId,genres
0,1,Adventure
1,1,Animation
2,1,Children
3,1,Comedy
4,1,Fantasy


In [0]:
# merge for genres estimation
genre_ratings = ratings.merge(movies_genres, left_on='movieId', right_on='movieId', how='inner')
genre_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres
0,1,2,3.5,1112486027,Adventure
1,1,2,3.5,1112486027,Children
2,1,2,3.5,1112486027,Fantasy
3,5,2,3.0,851527569,Adventure
4,5,2,3.0,851527569,Children


In [0]:
# group ranks for concrete user and evaluate
# mean rating for every genre
ratings_user = genre_ratings.groupby(['userId', 'genres'], as_index=False)['rating'].aggregate(np.mean)
ratings_user.head()

Unnamed: 0,userId,genres,rating
0,1,Action,3.727273
1,1,Adventure,3.787671
2,1,Animation,3.65
3,1,Children,3.605263
4,1,Comedy,3.731707


In [0]:
%%time
# find user representation
user_repr = {}
genres = ratings_user["genres"].unique()
genres_map = {}
for i,g in enumerate(genres):
    genres_map[g] = i 
for u in ratings_user["userId"].unique():
    user_repr[u] = np.zeros(len(genres))
    
def update_user_dict(s):
    user_repr[s["userId"]][genres_map[s["genres"]]] += s["rating"]

# each user (row) is represented by
# len(genres) columns with mean rating
# for the corresponding genre
ratings_user.apply(lambda s : update_user_dict(s),axis=1)

Wall time: 55.8 s


In [0]:
# merging movie and ratings
merged_df = pd.merge(movies, ratings,on="movieId",how="inner")
# adding indexes
merged_df["index"] = merged_df.index
merged_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,index
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",3,4.0,944919407,0
1,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",6,5.0,858275452,1
2,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",8,4.0,833981871,2
3,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",10,4.0,943497887,3
4,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",11,4.5,1230858821,4


In [0]:
%%time
#if RAM is finite
merged_df = merged_df[:10000000]

features = np.zeros((merged_df.shape[0],2*len(genres)))
y = merged_df["rating"]
def update_features(s):
    for g in s["genres"]:
        features[s["index"],genres_map[g]] = 1.0
    features[s["index"],len(genres):] = user_repr[s["userId"]]
    
merged_df.apply(lambda s : update_features(s),axis=1)

Wall time: 8min 3s


In [0]:
#train test split
train_feats = features[:int(features.shape[0]*0.8)]
test_feats = features[int(features.shape[0]*0.8):]
train_y = y[:int(features.shape[0]*0.8)]
test_y = y[int(features.shape[0]*0.8):]

print('train size:', train_feats.shape[0])
print('test size:', test_feats.shape[0])

train size: 8000000
test size: 2000000


## Random Forest

In [0]:
%%time
scores = []
iteration = 1
kfold = sklearn.model_selection.KFold(5, random_state=42)
for train_idx, test_idx in kfold.split(train_feats):
    X_train, X_test = features[train_idx], features[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    reg = RandomForestRegressor(n_estimators=2)
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)
    RMSE = (mean_squared_error(y_test,y_pred))**0.5
    scores.append(RMSE)
    print(iteration,"-fold\nRMSE:", RMSE)
    iteration+=1

1 -fold
RMSE: 1.1222924964078278
2 -fold
RMSE: 1.0820446869750728
3 -fold
RMSE: 1.1184627000043836
4 -fold
RMSE: 1.1048369556419373
5 -fold
RMSE: 1.122316418962252
Wall time: 26min 27s


In [0]:
print('Kfold RMSE mean:', np.mean(scores))

Kfold RMSE mean: 1.1099906515982947


In [0]:
y_pred = reg.predict(test_feats)
RMSE = (mean_squared_error(test_y,y_pred))**0.5
print('\nUnbiased RMSE:', RMSE)


Unbiased RMSE: 1.13949724244758
