In [25]:
import os
# Read the current working directory and chdir into that same directory.
# Since the target is where the process already is, the working directory
# is left unchanged by these two lines.
path = os.getcwd()
os.chdir(path)

# magic to load (or reload) the autoreload extension
%reload_ext autoreload
# magic so that the notebook will reload external python modules
%autoreload 2

import warnings
# Install a blanket 'ignore' filter: with no category/module arguments it
# applies to every warning raised from here on, including deprecation notices.
warnings.filterwarnings('ignore')


import bpr, metrics, content
import numpy as np
import pandas as pd
from subprocess import call
from sklearn.utils import shuffle
from common.constants import (
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
    DEFAULT_TIMESTAMP_COL,
    DEFAULT_TEST_SIZE,
    DEFAULT_MIN_SIMILARITY
)
from common.enums import SimilarityMethod

In [2]:
# --- dataset location -------------------------------------------------------
# Archive/directory name of the dataset and the folder it is stored under.
ds_dir, ds_name = "datasets/", "ml-1m"
ds_rel_path = os.path.join(ds_dir, ds_name)

# Name given to the genres column of the movies table.
col_genres = "genres"

# --- BPR hyper-parameters ---------------------------------------------------
# These values were picked arbitrarily; they are not the result of tuning.
bpr_params = dict(
    reg=0.01,
    learning_rate=0.1,
    n_iters=160,
    n_factors=15,
    batch_size=100,
)

In [3]:
def load_ds(data_name, cols):
    """Load one table of the dataset into a DataFrame, downloading it if needed.

    When the dataset directory ``ds_rel_path`` does not exist, the ``ds_name``
    zip archive is downloaded with ``curl`` and extracted into ``ds_dir`` with
    ``unzip``. Afterwards ``<ds_rel_path>/<data_name>.dat`` is parsed as a
    ``::``-separated, latin-1 encoded text file.

    Args:
        data_name: Base name (without extension) of the ``.dat`` file to read.
        cols: Column names assigned to the parsed fields, in file order.

    Returns:
        pandas.DataFrame with one column per entry of ``cols``.

    Raises:
        RuntimeError: If the download or the extraction reports a failure.
    """
    os.makedirs(ds_dir, exist_ok=True)
    ds_path = os.path.join(ds_rel_path, data_name + ".dat")
    zip_file_name = ds_rel_path + ".zip"
    if not os.path.isdir(ds_rel_path):
        url = "http://files.grouplens.org/datasets/movielens/" + ds_name + ".zip"
        # --fail: exit non-zero on HTTP errors instead of saving the error page
        #         as if it were the archive.
        # --location: follow redirects (e.g. http -> https).
        download_rc = call(["curl", "--fail", "--location", "-o", zip_file_name, url])
        if download_rc != 0:
            raise RuntimeError(
                "downloading {} failed (curl exit code {})".format(url, download_rc)
            )
        # unzip exits with 1 when it only emitted warnings but still extracted
        # everything, so only other non-zero codes are treated as failures.
        unzip_rc = call(["unzip", zip_file_name, "-d", ds_dir])
        if unzip_rc not in (0, 1):
            raise RuntimeError(
                "extracting {} failed (unzip exit code {})".format(zip_file_name, unzip_rc)
            )

    # A multi-character separator is only supported by the python parser engine;
    # asking for it explicitly avoids the implicit fallback and its ParserWarning.
    return pd.read_csv(
        ds_path, sep="::", names=cols, encoding="latin-1", engine="python"
    )

## Ratings

Load ratings dataset.


In [4]:
# Names assigned to the fields of ratings.dat, in file order.
rating_cols = [DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_RATING_COL, DEFAULT_TIMESTAMP_COL]

ratings = load_ds("ratings", rating_cols)
print("ratings shape: ", ratings.shape)

# Last expression of the cell: rendered as a preview of the first rows.
ratings.head()

ratings shape:  (1000209, 4)


Unnamed: 0,userID,itemID,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## Movies

Load the movies dataset.


In [5]:
# Names assigned to the fields of movies.dat, in file order.
movies_cols = [
    DEFAULT_ITEM_COL,
    "movie_title",
    col_genres,
]

movies = load_ds("movies", movies_cols)
print("movies shape: ", movies.shape)

# Last expression of the cell: rendered as a preview of the first rows.
movies.head()

movies shape:  (3883, 3)


Unnamed: 0,itemID,movie_title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


### Genres

Extract genres and put them in a map for future processing.


In [6]:
# Map every item id to the list of its genre names. Each movie stores its
# genres as one "|"-separated string, which is split into a list here.
# Walking the two columns in lockstep avoids DataFrame.iterrows(), which has to
# build a full Series object for every single row.
movies_genres = {
    item_id: genres.split("|")
    for item_id, genres in zip(movies[DEFAULT_ITEM_COL], movies[col_genres])
}

print("first movie genres: ", movies_genres[1])

first movie genres:  ['Animation', "Children's", 'Comedy']


## Users

Load users dataset.


In [7]:
# Names assigned to the fields of users.dat, in file order.
# NOTE(review): the fourth label is "ratings", but the preview shows small
# integer codes (10, 16, 15, 7, 20); in the MovieLens 1M README this field is
# the occupation code. Confirm nothing downstream (e.g. the content module)
# keys on "ratings" before renaming it.
user_cols = [
    DEFAULT_USER_COL,
    "gender",
    "age",
    "ratings",
    "zip code",
]

users = load_ds("users", user_cols)
print("users shape: ", users.shape)

# Last expression of the cell: rendered as a preview of the first rows.
users.head()

users shape:  (6040, 5)


Unnamed: 0,userID,gender,age,ratings,zip code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


## Content Based Modification


### Similar movies finder

This function finds the movies that are similar to a given movie and computes the Jaccard similarity of each of them to it.


In [8]:
# Build the content-based helper from the item-id -> genre-list map.
cb = content.ContentBased(movies_genres)

In [9]:
# Look up items similar to one movie using the Jaccard method.
# NOTE(review): the positional 9 is presumably the id of the query movie and
# DEFAULT_MIN_SIMILARITY a lower bound on the similarity score — confirm
# against the signature in the content module.
similar_movies = cb.content_based_similar_items(
    SimilarityMethod.Jaccard, ratings, 9, DEFAULT_MIN_SIMILARITY
)
# Last expression of the cell: rendered as a preview of the first rows.
similar_movies.head()

Unnamed: 0,itemID,sim
0,459,1.0
1,1385,1.0
2,2947,1.0
3,1429,1.0
4,3624,1.0


### New Ranks addition

Add new rating entries to the dataset for each user, based on that user's existing ratings of similar movies.


In [10]:
# Build the extended ratings table from the original ratings, movies and users
# using the Jaccard method.
# NOTE(review): the meaning of the two trailing 100 arguments is not visible
# from this notebook — check the add_similar_ranking signature in the content
# module and consider naming them as constants.
extended_ratings = cb.add_similar_ranking(
    SimilarityMethod.Jaccard, ratings, movies, users, 100, 100
)
# Last expression of the cell: rendered as a preview of the first rows.
extended_ratings.head()

Unnamed: 0,userID,itemID,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## BPR


### Create Interaction Matrix

Create user-movie interaction matrix and create train and test sets for the next steps.

> Note: The matrices are built for both the untouched ratings and the extended ratings.


In [11]:
# Rating cutoff handed to bpr.create_matrix.
# NOTE(review): presumably ratings below this value are not counted as
# positive interactions — confirm against the bpr module.
threshold = 3
# NOTE: `ratings` is rebound to the second return value, so this cell is not
# idempotent: running it again feeds the already-processed frame back in.
X, ratings = bpr.create_matrix(
    ratings, DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_RATING_COL, threshold
)

# Same construction for the extended ratings; `extended_ratings` is likewise
# rebound to the second return value.
X_extended, extended_ratings = bpr.create_matrix(
    extended_ratings, DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_RATING_COL, threshold
)

In [12]:
# Split both interaction matrices with the same test size and the same seed.
X_train, X_test = bpr.create_train_test(X, test_size=DEFAULT_TEST_SIZE, seed=1234)
X_extended_train, X_extended_test = bpr.create_train_test(X_extended, test_size=DEFAULT_TEST_SIZE, seed=1234)

### Fit the model

Fit one model on the untouched ratings and a second one on the extended ratings.


In [16]:
# Compatibility shim: re-create the `np.int` alias, which NumPy removed in
# release 1.24. Presumably the local bpr module still references `np.int` —
# verify, and drop this line once bpr uses the builtin `int`.
np.int = int

# Model trained on the untouched ratings.
bpr_original = bpr.BPR(**bpr_params)
bpr_original.fit(X_train)

# Model with identical hyper-parameters trained on the extended ratings.
# As the last expression of the cell, the result of fit() is echoed as output.
bpr_extended = bpr.BPR(**bpr_params)
bpr_extended.fit(X_extended_train)

BPR:   0%|          | 0/160 [00:00<?, ?it/s]

BPR: 100%|██████████| 160/160 [00:32<00:00,  4.86it/s]
BPR: 100%|██████████| 160/160 [00:36<00:00,  4.43it/s]


<bpr.BPR at 0x7f1e3f82f820>

### Get AUC Score

Calculate the AUC score of the predicted ranks on the train and test sets, for both the untouched and the extended ratings.


In [29]:
# Each metrics.auc_score call returns three values; only the first (the AUC)
# is printed, the other two are kept under *_trues / *_preds names.

# Model fitted on the untouched ratings, scored on its own train/test split.
train_auc, train_trues, train_preds = metrics.auc_score(bpr_original, X_train)
print("train auc", train_auc)

test_auc, test_trues, test_preds = metrics.auc_score(bpr_original, X_test)
print("test auc", test_auc)

# Model fitted on the extended ratings, scored on the extended train/test split.
# NOTE: the two models are evaluated on different test matrices (X_test vs
# X_extended_test), so the test AUCs are not measured on identical data.
train_extended_auc, train_extended_trues, train_extended_preds = metrics.auc_score(bpr_extended, X_extended_train)
print("train extended auc", train_extended_auc)

test_extended_auc, test_extended_trues, test_extended_preds = metrics.auc_score(bpr_extended, X_extended_test)
print("test extended auc", test_extended_auc)

train auc 0.8845325124298504
test auc 0.8508539552090765
train extended auc 0.8921885084709678
test extended auc 0.8594685617047872
