In [97]:
import os
path = os.getcwd()
os.chdir(path)

# magic to print version
%reload_ext autoreload
# magic so that the notebook will reload external python modules
%autoreload 2

import warnings
warnings.filterwarnings('ignore')


import bpr, metrics, content
import numpy as np
import pandas as pd
from subprocess import call
from sklearn.utils import shuffle
from common.constants import (
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
    DEFAULT_TIMESTAMP_COL,
    DEFAULT_TEST_SIZE,
    DEFAULT_MIN_SIMILARITY,
    DEFAULT_SIMILARITY_COL
)
from common.enums import SimilarityMethod

In [98]:
# constants
ds_dir = "datasets/"
ds_name = "ml-1m"
ds_rel_path = os.path.join(ds_dir, ds_name)

col_genres = "genres"
similar_movies_rate_count = 10
new_rates_per_movie_count = 50

# parameters were randomly chosen
bpr_params = {
    "reg": 0.01,
    "learning_rate": 0.1,
    "n_iters": 160,
    "n_factors": 15,
    "batch_size": 100,
}

In [99]:
# dataset loader function
def load_ds(data_name, cols):
    os.makedirs(ds_dir, exist_ok=True)
    ds_path = os.path.join(ds_rel_path, data_name + ".dat")
    zip_file_name = ds_rel_path + ".zip"
    if not os.path.isdir(ds_rel_path):
        call(
            [
                "curl",
                "-o",
                zip_file_name,
                "http://files.grouplens.org/datasets/movielens/" + ds_name + ".zip",
            ]
        )
        call(["unzip", zip_file_name, "-d", ds_dir])

    return pd.read_csv(ds_path, sep="::", names=cols, encoding="latin-1")

## Ratings

Load ratings dataset.


In [100]:
rating_cols = [
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
    DEFAULT_TIMESTAMP_COL,
]
ratings = load_ds("ratings", rating_cols)
print("ratings shape: ", ratings.shape)
ratings.head()

ratings shape:  (1000209, 4)


Unnamed: 0,userID,itemID,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## Movies

Load movies datasets.


In [101]:
movies_cols = [DEFAULT_ITEM_COL, "movie_title", col_genres]
movies = load_ds("movies", movies_cols)
print("movies shape: ", movies.shape)
movies.head()

movies shape:  (3883, 3)


Unnamed: 0,itemID,movie_title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


### Genres

Extract genres and put them in a map for future processing.


In [102]:
movies_genres = {}
for _, movie in movies.iterrows():
    movies_genres[movie[DEFAULT_ITEM_COL]] = movie[col_genres].split("|")

print("first movie genres: ", movies_genres[1])

first movie genres:  ['Animation', "Children's", 'Comedy']


## Users

Load users dataset.


In [103]:
user_cols = [DEFAULT_USER_COL, "gender", "age", "ratings", "zip code"]
users = load_ds("users", user_cols)
print("users shape: ", users.shape)
users.head()

users shape:  (6040, 5)


Unnamed: 0,userID,gender,age,ratings,zip code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


## Content Based Modification


### Similar movies finder

This function finds similar movies to a certain movie and calculates Overlap Coefficient similarity for it.


In [104]:
movies_light = movies.drop(["movie_title"], axis=1)
cb = content.ContentBased(movies_light)

In [105]:
movies_sim = cb.get_items_sim_merged(
    SimilarityMethod.OverlapCoefficient, col_genres, DEFAULT_SIMILARITY_COL, "|"
)
movies_sim

Unnamed: 0,itemID_x,itemID_y,sim
0,1,1,1.000000
1,1,2,0.333333
2,1,3,0.500000
3,1,4,0.500000
4,1,5,1.000000
...,...,...,...
15077684,3952,3948,0.000000
15077685,3952,3949,1.000000
15077686,3952,3950,1.000000
15077687,3952,3951,1.000000


Filter out the movies with similarity higher than DEFAULT_MIN_SIMILARITY

In [106]:
movies_sim_reduced = movies_sim[movies_sim["sim"] > DEFAULT_MIN_SIMILARITY]
movies_sim_reduced

Unnamed: 0,itemID_x,itemID_y,sim
0,1,1,1.0
4,1,5,1.0
12,1,13,1.0
18,1,19,1.0
37,1,38,1.0
...,...,...,...
15077683,3952,3947,1.0
15077685,3952,3949,1.0
15077686,3952,3950,1.0
15077687,3952,3951,1.0


Extract the similar movies for each movie equal to similar_movies_rate_count

In [107]:
movies_sim_reduced_top = movies_sim.groupby(DEFAULT_ITEM_COL + "_x").head(
    similar_movies_rate_count
)
movies_sim_reduced_top

Unnamed: 0,itemID_x,itemID_y,sim
0,1,1,1.000000
1,1,2,0.333333
2,1,3,0.500000
3,1,4,0.500000
4,1,5,1.000000
...,...,...,...
15073811,3952,6,0.500000
15073812,3952,7,0.000000
15073813,3952,8,0.000000
15073814,3952,9,0.000000


Cleanup extra fields and create user-movie mapping for all of the users-items

In [108]:
light_users = users.drop(["gender", "age"], axis=1)
light_users["key"] = 1
movies_users = movies_light.merge(light_users, on=["key"])
movies_users_ratings = (
    movies_users.merge(ratings, how="left")
    .drop(["zip code", "genres", "key", "ratings", "timestamp"], axis=1)
    .fillna(0)
)
movies_users_ratings

Unnamed: 0,itemID,userID,rating
0,1,1,5.0
1,1,2,0.0
2,1,3,0.0
3,1,4,0.0
4,1,5,0.0
...,...,...,...
23453315,3952,6036,0.0
23453316,3952,6037,0.0
23453317,3952,6038,0.0
23453318,3952,6039,0.0


Filter each user's ratings higher than min_rate_value

In [109]:
min_rate_value = 3.0
movies_users_ratings_high = movies_users_ratings[
    movies_users_ratings[DEFAULT_RATING_COL] > min_rate_value
]
movies_users_ratings_high

Unnamed: 0,itemID,userID,rating
0,1,1,5.0
5,1,6,4.0
7,1,8,4.0
8,1,9,5.0
9,1,10,5.0
...,...,...,...
23452684,3952,5405,4.0
23452754,3952,5475,5.0
23453091,3952,5812,4.0
23453116,3952,5837,4.0


Call content-based methods of the corresponding class to create new rankings according to the similar movies.

In [110]:
new_ratings = cb.get_new_ratings(
    movies_sim_reduced_top, movies_users_ratings_high, new_rates_per_movie_count
)
new_ratings

Unnamed: 0,userID,rating,itemID
0,1,5.0,2
1,1,5.0,3
2,1,5.0,4
3,1,5.0,5
4,1,5.0,6
...,...,...,...
5721428,1615,4.0,6
5721429,1615,4.0,7
5721430,1615,4.0,8
5721431,1615,4.0,9


### New Ranks addition

Add new ranking items to the dataset according to the current similar movie ranks for each user.


In [111]:
print("added ratings: ", new_ratings.shape[0])
extended_ratings = pd.concat([ratings, new_ratings])
extended_ratings

added ratings:  301790


Unnamed: 0,userID,itemID,rating,timestamp
0,1,1193,5.0,978300760.0
1,1,661,3.0,978302109.0
2,1,914,3.0,978301968.0
3,1,3408,4.0,978300275.0
4,1,2355,5.0,978824291.0
...,...,...,...,...
5721428,1615,6,4.0,
5721429,1615,7,4.0,
5721430,1615,8,4.0,
5721431,1615,9,4.0,


## BPR


## Create Interaction Matrix

Create user-movie interaction matrix and create train and test sets for the next steps.

> Note: Over untouched ratings list


In [112]:
threshold = 3

X, _ = bpr.create_matrix(
    ratings, DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_RATING_COL, threshold
)

X_extended, _ = bpr.create_matrix(
    extended_ratings, DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_RATING_COL, threshold
)

In [113]:
X_train, X_test = bpr.create_train_test(X, test_size=DEFAULT_TEST_SIZE, seed=1234)
X_extended_train, X_extended_test = bpr.create_train_test(X_extended, test_size=DEFAULT_TEST_SIZE, seed=1234)

### Fit the model

Fit the module with untouched ratings.


In [114]:
np.int = int

bpr_original = bpr.BPR(**bpr_params)
bpr_original.fit(X_train)

bpr_extended = bpr.BPR(**bpr_params)
bpr_extended.fit(X_extended_train)

BPR:   0%|          | 0/160 [00:00<?, ?it/s]

BPR: 100%|██████████| 160/160 [00:31<00:00,  5.11it/s]
BPR: 100%|██████████| 160/160 [00:31<00:00,  5.16it/s]


<bpr.BPR at 0x7f77246a1e70>

### Get AUC Score

Calculate the AUC score of the predicted ranks for untouched ratings.


In [115]:
train_score = metrics.auc_score(bpr_original, X_train, min_rate_value)
test_score = metrics.auc_score(bpr_original, X_test, min_rate_value)
print("Train AUC: %f, NDCG: %f" % train_score)
print("Test  AUC: %f, NDCG: %f" % test_score)

train_extended_score = metrics.auc_score(bpr_extended, X_extended_train, min_rate_value)
test_extended_score = metrics.auc_score(bpr_extended, X_extended_test, min_rate_value)
print("Extended Train AUC: %f, NDCG: %f" % train_extended_score)
print("Extended Test  AUC: %f, NDCG: %f" % test_extended_score)

Train AUC: 0.886287, NDCG: 0.534955
Test  AUC: 0.852563, NDCG: 0.342198
Extended Train AUC: 0.891349, NDCG: 0.587118
Extended Test  AUC: 0.864642, NDCG: 0.377536
