In [19]:
import os
# NOTE(review): changing directory to the current working directory is a no-op,
# so these two lines have no effect beyond binding `path`.
path = os.getcwd()
os.chdir(path)

# magic to (re)load the autoreload extension
%reload_ext autoreload
# magic so that the notebook will reload external python modules
%autoreload 2

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and parser warnings that may point at real problems.
warnings.filterwarnings('ignore')


import content, metrics
import numpy as np
import pandas as pd
from subprocess import call
from sklearn.utils import shuffle
from data_split import split_data
from WRMF.wrmf import *
from WRMF import wrmf_rec
from common.constants import (
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
    DEFAULT_TIMESTAMP_COL,
    DEFAULT_TEST_SIZE,
    DEFAULT_MIN_SIMILARITY
)
from common.enums import SimilarityMethod

In [20]:
# Dataset location: the archive named ds_name is unpacked under ds_dir.
ds_dir, ds_name = "datasets/", "ml-1m"
ds_rel_path = os.path.join(ds_dir, ds_name)

# Name given to the genres column of the movies table.
col_genres = "genres"

# Hyper-parameters; the values are arbitrary picks, not the result of tuning.
bpr_params = dict(
    reg=0.01,
    learning_rate=0.1,
    n_iters=160,
    n_factors=15,
    batch_size=100,
)

In [21]:
def load_ds(data_name, cols):
    """Load one ``.dat`` table of the dataset into a DataFrame.

    If the extracted dataset directory (``ds_rel_path``) does not exist yet,
    the archive is first downloaded with ``curl`` and unpacked with ``unzip``.

    Parameters
    ----------
    data_name : str
        Base name of the ``.dat`` file inside the dataset directory,
        e.g. ``"ratings"``.
    cols : list of str
        Column names assigned to the fields; the file is read without a
        header row.

    Returns
    -------
    pandas.DataFrame
        The parsed table with columns named after ``cols``.

    Raises
    ------
    RuntimeError
        If the download or the extraction command exits with a non-zero status.
    """
    os.makedirs(ds_dir, exist_ok=True)
    ds_path = os.path.join(ds_rel_path, data_name + ".dat")
    zip_file_name = ds_rel_path + ".zip"
    if not os.path.isdir(ds_rel_path):
        url = "http://files.grouplens.org/datasets/movielens/" + ds_name + ".zip"
        commands = (
            # -f: exit non-zero on an HTTP error instead of saving the error page
            #     as the archive; -L: follow redirects.
            ["curl", "-f", "-L", "-o", zip_file_name, url],
            ["unzip", zip_file_name, "-d", ds_dir],
        )
        for command in commands:
            # Stop at the first failing step so the error names the real cause
            # rather than surfacing later as a missing-file error in read_csv.
            if call(command) != 0:
                raise RuntimeError("command failed: " + " ".join(command))

    # "::" is a multi-character separator, which only the python parser engine
    # supports; naming the engine explicitly avoids the implicit fallback warning.
    return pd.read_csv(
        ds_path, sep="::", names=cols, encoding="latin-1", engine="python"
    )

## Ratings 
Load ratings dataset.

In [22]:
# Column names for the ratings table, in the order the fields appear in the file.
rating_cols = [
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
    DEFAULT_TIMESTAMP_COL,
]
ratings = load_ds(data_name="ratings", cols=rating_cols)
print("ratings shape: ", ratings.shape)
ratings.head()

ratings shape:  (1000209, 4)


Unnamed: 0,userID,itemID,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## Movies
Load the movies dataset.

In [23]:
# Column names for the movies table: item id, title, and the raw genres field.
movies_cols = [
    DEFAULT_ITEM_COL,
    "movie_title",
    col_genres,
]
movies = load_ds(data_name="movies", cols=movies_cols)
print("movies shape: ", movies.shape)
movies.head()

movies shape:  (3883, 3)


Unnamed: 0,itemID,movie_title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


### Genres
Extract genres and put them in a map for future processing.

In [24]:
# Map each item id to its list of genre names; the raw genres field joins them with "|".
movies_genres = {
    item_id: genre_field.split("|")
    for item_id, genre_field in zip(movies[DEFAULT_ITEM_COL], movies[col_genres])
}

print("first movie genres: ", movies_genres[1])

first movie genres:  ['Animation', "Children's", 'Comedy']


## Users
Load users dataset.

In [25]:
# Column names for the users table, in the order the fields appear in the file.
# NOTE(review): per the MovieLens 1M README the fourth field of users.dat is the
# occupation code, so the name "ratings" looks like a mislabel. Confirm that nothing in
# content.ContentBased reads this column by name before renaming it.
user_cols = [
    DEFAULT_USER_COL,
    "gender",
    "age",
    "ratings",
    "zip code",
]
users = load_ds(data_name="users", cols=user_cols)
print("users shape: ", users.shape)
users.head()

users shape:  (6040, 5)


Unnamed: 0,userID,gender,age,ratings,zip code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


## Content Based Modification

### Similar movies finder
Find movies similar to a given movie and compute the Jaccard similarity of each one to it.

In [26]:
# Build the content-based helper from the item id -> genre list map created above.
cb = content.ContentBased(movies_genres)

In [27]:
# Look up similar items under Jaccard similarity and preview the result.
# NOTE(review): the literal 9 is presumably the id of the reference item and
# DEFAULT_MIN_SIMILARITY a lower bound on the similarity score. Confirm against
# content.ContentBased.content_based_similar_items and give the literal a name.
similar_movies = cb.content_based_similar_items(
    SimilarityMethod.Jaccard, ratings, 9, DEFAULT_MIN_SIMILARITY
)
similar_movies.head()

Unnamed: 0,itemID,sim
0,459,1.0
1,1385,1.0
2,2947,1.0
3,1429,1.0
4,3624,1.0


### New Ranks addition
Add new rating entries to the dataset for each user, based on that user's existing ratings of similar movies.

In [31]:
# Build the extended ratings table via ContentBased.add_similar_ranking and preview it.
# NOTE(review): this call uses cosine similarity, whereas the similar-movies lookup
# earlier in the notebook uses Jaccard; confirm the difference is intentional.
# NOTE(review): the two 100 literals are unexplained. Check their meaning in
# content.ContentBased.add_similar_ranking and lift them into named constants.
extended_ratings = cb.add_similar_ranking(
    SimilarityMethod.Cosine, ratings, movies, users, 100, 100
)
extended_ratings.head()

Unnamed: 0,userID,itemID,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## WRMF

## Create Interaction Matrix and Fit
Create user-movie interaction matrix and create train and test sets for the next steps.
> Note: This run uses the original, unextended ratings.

In [29]:
# Baseline run: split the original ratings with the "random_by_user" strategy
# (fixed random_state), then fit the model on the training part.
# WRMF and train_cornac are presumably provided by `from WRMF.wrmf import *`.
train, test = split_data(ratings, split_strategy="random_by_user", random_state=0)
wrmf = WRMF(train, weight_strategy="uniform_pos")
model = train_cornac(wrmf, train)

# Produce top-k recommendations and score them against the held-out test split.
k = 10
top_k = wrmf_rec.recommend_top_k(model, train, k)
metrics.ranking_metrics(top_k, test)

  1%|          | 36/6040 [00:00<00:16, 357.94it/s]

100%|██████████| 6040/6040 [00:14<00:00, 429.70it/s]


maximum of weights=1.0, minimum=1.0


  0%|          | 0/100 [00:00<?, ?it/s]

Learning completed!


{'Precision@k': 0.2963741721854305,
 'Recall@k': 0.13376981939449878,
 'NDCG@k': 0.33429797891976465}

## Create Interaction Matrix
Create user-movie interaction matrix and create train and test sets for the next steps.
> Note: This run uses the extended ratings.

In [32]:
# Same pipeline as the baseline run, fed with extended_ratings instead of ratings.
# NOTE(review): train, test, wrmf, model and top_k from the baseline run are overwritten
# here, so the baseline objects are no longer available after this cell.
# NOTE(review): the test split is drawn from extended_ratings, so it is not the same
# held-out set as in the baseline run. If add_similar_ranking appended rows, some of them
# can land in test and the two metric dictionaries are not directly comparable. Confirm.
train, test = split_data(extended_ratings, split_strategy="random_by_user", random_state=0)
wrmf = WRMF(train, weight_strategy="uniform_pos")
model = train_cornac(wrmf, train)

# Produce top-k recommendations and score them against this run's test split.
k = 10
top_k = wrmf_rec.recommend_top_k(model, train, k)
metrics.ranking_metrics(top_k, test)

100%|██████████| 6040/6040 [00:14<00:00, 406.08it/s]


maximum of weights=1.0, minimum=1.0


  0%|          | 0/100 [00:00<?, ?it/s]

Learning completed!


{'Precision@k': 0.306158940397351,
 'Recall@k': 0.13417043588647853,
 'NDCG@k': 0.3435059091085476}