In [45]:
import os
path = os.getcwd()
os.chdir(path)

# magic to print version
%reload_ext autoreload
# magic so that the notebook will reload external python modules
%autoreload 2

import warnings
warnings.filterwarnings('ignore')


import bpr, metrics, content
import numpy as np
import pandas as pd
from subprocess import call
from sklearn.utils import shuffle
from common.constants import (
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
    DEFAULT_TIMESTAMP_COL,
    DEFAULT_TEST_SIZE,
    DEFAULT_MIN_SIMILARITY,
    DEFAULT_SIMILARITY_COL
)
from common.enums import SimilarityMethod

In [46]:
# constants
ds_dir = "datasets/"
ds_name = "ml-1m"
ds_rel_path = os.path.join(ds_dir, ds_name)

col_genres = "genres"
new_movies_rate_count = 10
new_users_rate_count = 50

# parameters were randomly chosen
bpr_params = {
    "reg": 0.01,
    "learning_rate": 0.1,
    "n_iters": 160,
    "n_factors": 15,
    "batch_size": 100,
}

In [47]:
def load_ds(data_name, cols):
    os.makedirs(ds_dir, exist_ok=True)
    ds_path = os.path.join(ds_rel_path, data_name + ".dat")
    zip_file_name = ds_rel_path + ".zip"
    if not os.path.isdir(ds_rel_path):
        call(
            [
                "curl",
                "-o",
                zip_file_name,
                "http://files.grouplens.org/datasets/movielens/" + ds_name + ".zip",
            ]
        )
        call(["unzip", zip_file_name, "-d", ds_dir])

    return pd.read_csv(ds_path, sep="::", names=cols, encoding="latin-1")

## Ratings

Load ratings dataset.


In [48]:
rating_cols = [
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
    DEFAULT_TIMESTAMP_COL,
]
ratings = load_ds("ratings", rating_cols)
print("ratings shape: ", ratings.shape)
ratings.head()

ratings shape:  (1000209, 4)


Unnamed: 0,userID,itemID,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## Movies

Load movies datasets.


In [49]:
movies_cols = [DEFAULT_ITEM_COL, "movie_title", col_genres]
movies = load_ds("movies", movies_cols)
print("movies shape: ", movies.shape)
movies.head()

movies shape:  (3883, 3)


Unnamed: 0,itemID,movie_title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


### Genres

Extract genres and put them in a map for future processing.


In [50]:
movies_genres = {}
for _, movie in movies.iterrows():
    movies_genres[movie[DEFAULT_ITEM_COL]] = movie[col_genres].split("|")

print("first movie genres: ", movies_genres[1])

first movie genres:  ['Animation', "Children's", 'Comedy']


## Users

Load users dataset.


In [51]:
user_cols = [DEFAULT_USER_COL, "gender", "age", "ratings", "zip code"]
users = load_ds("users", user_cols)
print("users shape: ", users.shape)
users.head()

users shape:  (6040, 5)


Unnamed: 0,userID,gender,age,ratings,zip code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


## Content Based Modification


### Similar movies finder

This function finds similar movies to a certain movie and calculates Jaccard similarity for it.


In [52]:
cb = content.ContentBased(movies_genres)

In [53]:
movies_light = movies.drop(["movie_title"], axis=1)
cb = content.ContentBased(movies_light)
movies_sim = cb.get_items_sim_merged(
    SimilarityMethod.Jaccard, col_genres, DEFAULT_SIMILARITY_COL, "|"
)
movies_sim

Unnamed: 0,itemID_x,itemID_y,sim
0,1,1,1.000000
1,1,2,0.200000
2,1,3,0.250000
3,1,4,0.250000
4,1,5,0.333333
...,...,...,...
15077684,3952,3948,0.000000
15077685,3952,3949,0.500000
15077686,3952,3950,0.500000
15077687,3952,3951,0.500000


In [54]:
movies_sim_reduced = movies_sim[movies_sim["sim"] > DEFAULT_MIN_SIMILARITY]
movies_sim_reduced

Unnamed: 0,itemID_x,itemID_y,sim
0,1,1,1.0
1050,1,1064,1.0
2072,1,2141,1.0
2073,1,2142,1.0
2285,1,2354,1.0
...,...,...,...
15077468,3952,3731,1.0
15077547,3952,3810,1.0
15077584,3952,3848,1.0
15077672,3952,3936,1.0


In [55]:
movies_sim_reduced_top = movies_sim.groupby(DEFAULT_ITEM_COL + "_x").head(
    new_movies_rate_count
)
movies_sim_reduced_top

Unnamed: 0,itemID_x,itemID_y,sim
0,1,1,1.000000
1,1,2,0.200000
2,1,3,0.250000
3,1,4,0.250000
4,1,5,0.333333
...,...,...,...
15073811,3952,6,0.250000
15073812,3952,7,0.000000
15073813,3952,8,0.000000
15073814,3952,9,0.000000


In [56]:
light_users = users.drop(["gender", "age"], axis=1)
light_users["key"] = 1
movies_users = movies_light.merge(light_users, on=["key"])
movies_users_ratings = (
    movies_users.merge(ratings, how="left")
    .drop(["zip code", "genres", "key", "ratings", "timestamp"], axis=1)
    .fillna(0)
)
movies_users_ratings

Unnamed: 0,itemID,userID,rating
0,1,1,5.0
1,1,2,0.0
2,1,3,0.0
3,1,4,0.0
4,1,5,0.0
...,...,...,...
23453315,3952,6036,0.0
23453316,3952,6037,0.0
23453317,3952,6038,0.0
23453318,3952,6039,0.0


In [57]:
min_rate_value = 3.0
movies_users_ratings_high = movies_users_ratings[
    movies_users_ratings[DEFAULT_RATING_COL] > min_rate_value
]
movies_users_ratings_high

Unnamed: 0,itemID,userID,rating
0,1,1,5.0
5,1,6,4.0
7,1,8,4.0
8,1,9,5.0
9,1,10,5.0
...,...,...,...
23452684,3952,5405,4.0
23452754,3952,5475,5.0
23453091,3952,5812,4.0
23453116,3952,5837,4.0


In [58]:
new_ratings = cb.get_new_ratings(
    movies_sim_reduced_top, movies_users_ratings_high, new_users_rate_count
)
new_ratings

Unnamed: 0,userID,rating,itemID
0,1,5.0,2
1,1,5.0,3
2,1,5.0,4
3,1,5.0,5
4,1,5.0,6
...,...,...,...
5721428,1615,4.0,6
5721429,1615,4.0,7
5721430,1615,4.0,8
5721431,1615,4.0,9


### New Ranks addition

Add new ranking items to the dataset according to the current similar movie ranks for each user.


In [59]:
print("added ratings: ", new_ratings.shape[0])
extended_ratings = pd.concat([ratings, new_ratings])
extended_ratings

added ratings:  301790


Unnamed: 0,userID,itemID,rating,timestamp
0,1,1193,5.0,978300760.0
1,1,661,3.0,978302109.0
2,1,914,3.0,978301968.0
3,1,3408,4.0,978300275.0
4,1,2355,5.0,978824291.0
...,...,...,...,...
5721428,1615,6,4.0,
5721429,1615,7,4.0,
5721430,1615,8,4.0,
5721431,1615,9,4.0,


## BPR


## Create Interaction Matrix

Create user-movie interaction matrix and create train and test sets for the next steps.

> Note: Over untouched ratings list


In [60]:
threshold = 3

X, _ = bpr.create_matrix(
    ratings, DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_RATING_COL, threshold
)

X_extended, _ = bpr.create_matrix(
    extended_ratings, DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_RATING_COL, threshold
)

In [61]:
X_train, X_test = bpr.create_train_test(X, test_size=DEFAULT_TEST_SIZE, seed=1234)
X_extended_train, X_extended_test = bpr.create_train_test(X_extended, test_size=DEFAULT_TEST_SIZE, seed=1234)

### Fit the model

Fit the module with untouched ratings.


In [62]:
np.int = int

bpr_original = bpr.BPR(**bpr_params)
bpr_original.fit(X_train)

bpr_extended = bpr.BPR(**bpr_params)
bpr_extended.fit(X_extended_train)

BPR:   0%|          | 0/160 [00:00<?, ?it/s]

BPR: 100%|██████████| 160/160 [00:48<00:00,  3.33it/s]
BPR: 100%|██████████| 160/160 [00:47<00:00,  3.37it/s]


<bpr.BPR at 0x7f4d516fefb0>

### Get AUC Score

Calculate the AUC score of the predicted ranks for untouched ratings.


In [63]:
train_auc, train_trues, train_preds = metrics.auc_score(bpr_original, X_train)
print("train auc", train_auc)

test_auc, test_trues, test_preds = metrics.auc_score(bpr_original, X_test)
print("test auc", test_auc)

train_extended_auc, train_extended_trues, train_extended_preds = metrics.auc_score(bpr_extended, X_extended_train)
print("train extended auc", train_extended_auc)

test_extended_auc, test_extended_trues, test_extended_preds = metrics.auc_score(bpr_extended, X_extended_test)
print("test extended auc", test_extended_auc)

train auc 0.8848643764145597
test auc 0.8510006478895107
train extended auc 0.8908093505106474
test extended auc 0.8637797311648515
