In [47]:
import os
path = os.getcwd()
os.chdir(path)

# magic to print version
%reload_ext autoreload
# magic so that the notebook will reload external python modules
%autoreload 2

import warnings
warnings.filterwarnings('ignore')


import sim, bpr, metrics
import numpy as np
import pandas as pd
from subprocess import call
from sklearn.utils import shuffle

In [24]:
# constants
ds_dir = "datasets/"
ds_name = "ml-1m"
ds_rel_path = os.path.join(ds_dir, ds_name)

col_user_id = "user_id"
col_movie_id = "movie_id"
col_rating = "rating"
col_ts = "timestamp"
col_genres = "genres"

min_similarity = 0.7

In [25]:
# init libs
similarity = sim.Similarity()

In [26]:
def load_ds(data_name, cols):
    os.makedirs(ds_dir, exist_ok=True)
    ds_path = os.path.join(ds_rel_path, data_name + ".dat")
    zip_file_name = ds_rel_path + ".zip"
    if not os.path.isdir(ds_rel_path):
        call(
            [
                "curl",
                "-o",
                zip_file_name,
                "http://files.grouplens.org/datasets/movielens/" + ds_name + ".zip",
            ]
        )
        call(["unzip", zip_file_name, "-d", ds_dir])

    return pd.read_csv(ds_path, sep="::", names=cols, encoding="latin-1")

## Ratings 
Load ratings dataset.

In [27]:
rating_cols = [col_user_id, col_movie_id, col_rating, col_ts]
ratings = load_ds("ratings", rating_cols)
print("ratings shape: ", ratings.shape)
ratings.head()

ratings shape:  (1000209, 4)


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## Movies
Load movies datasets.

In [28]:
movies_cols = [col_movie_id, "movie_title", col_genres]
movies = load_ds("movies", movies_cols)
print("movies shape: ", movies.shape)
movies.head()

movies shape:  (3883, 3)


Unnamed: 0,movie_id,movie_title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


### Genres
Extract genres and put them in a map for future processing.

In [29]:
movies_genres = {}
for _, movie in movies.iterrows():
    movies_genres[movie[col_movie_id]] = movie[col_genres].split("|")

print("first movie genres: ", movies_genres[1])

first movie genres:  ['Animation', "Children's", 'Comedy']


## Users
Load users dataset.

In [30]:
user_cols = [col_user_id, "gender", "age", "ratings", "zip code"]
users = load_ds("users", user_cols)
print("users shape: ", users.shape)
users.head()

users shape:  (6040, 5)


Unnamed: 0,user_id,gender,age,ratings,zip code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


## Content Based Modification

### Similar movies finder
This function finds similar movies to a certain movie and calculates Jaccard similarity for it.

In [31]:
def content_based_similar_movies(i, similarity_percent):
    sim_movies = []
    for _, movie in movies.iterrows():
        j = int(movie["movie_id"])
        if i == j:
            continue
        genre_sim = similarity.Jaccard(movies_genres, i, j)
        if genre_sim > similarity_percent:
            sim_movies.append({"movie_id": j, "sim": genre_sim})
    sim_movies.sort(key=lambda sim_movies: sim_movies["sim"], reverse=True)
    df = pd.DataFrame(sim_movies)
    # df['movie_id'] = df['movie_id'].astype('int')
    return df


similar_movies = content_based_similar_movies(9, min_similarity)
similar_movies.head()

Unnamed: 0,movie_id,sim
0,20,1.0
1,71,1.0
2,145,1.0
3,204,1.0
4,227,1.0


### New Ranks addition
Add new ranking items to the dataset according to the current similar movie ranks for each user.

In [32]:
def add_similar_ranking(movies_to_check, users_to_check):
    new_ratings = ratings.copy()
    shuffle_users = shuffle(users)
    i = 0
    for m_id, mg in movies_genres.items():
        if i > movies_to_check:
            break
        i += 1
        for u_i, u in shuffle_users.iterrows():
            if u_i < users_to_check:
                curr_rating = ratings[
                    (ratings[col_user_id] == u[col_user_id])
                    & (ratings[col_movie_id] == m_id)
                ]

                if not curr_rating.empty:
                    curr_rating_val = curr_rating.iloc[0][col_rating]
                    if curr_rating_val >= 4.0 or curr_rating_val <= 3.0:
                        sim_movies = content_based_similar_movies(m_id, min_similarity)
                        for _, sm in sim_movies.iterrows():
                            new_rating = new_ratings[
                                (new_ratings[col_user_id] == u[col_user_id])
                                & (new_ratings[col_movie_id] == sm[col_movie_id])
                            ]
                            if new_rating.empty:
                                new_row = {
                                    col_user_id: u[col_user_id],
                                    col_movie_id: int(sm[col_movie_id]),
                                    col_rating: curr_rating_val,
                                    col_ts: 1704397300,
                                }
                                loc = int(len(new_ratings))
                                new_ratings.loc[loc] = new_row
    
    return new_ratings


new_ratings = add_similar_ranking(100, 100)
new_ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## BPR

## Create Interaction Matrix
Create user-movie interaction matrix and create train and test sets for the next steps.
> Note: Over untouched ratings list

In [33]:
threshold = 3
X, ratings = bpr.create_matrix(
    ratings, col_user_id, col_movie_id, col_rating, threshold
)

In [34]:
X_train, X_test = bpr.create_train_test(X, test_size=0.2, seed=1234)

### Fit the model
Fit the module with untouched ratings.

In [35]:
# parameters were randomly chosen
bpr_params = {
    "reg": 0.01,
    "learning_rate": 0.1,
    "n_iters": 160,
    "n_factors": 15,
    "batch_size": 100,
}
bpr = bpr.BPR(**bpr_params)
np.int = int

bpr.fit(X_train)

BPR:   0%|          | 0/160 [00:00<?, ?it/s]

BPR: 100%|██████████| 160/160 [00:33<00:00,  4.83it/s]


<bpr.BPR at 0x7f9fb46c2410>

### Get AUC Score
Calculate the AUC score of the predicted ranks for untouched ratings.

In [36]:
train_auc, train_trues, train_preds = metrics.mae_auc_score(bpr, X_train)
print("train auc", train_auc)

test_auc, test_trues, test_preds = metrics.mae_auc_score(bpr, X_test)
print("train auc", test_auc)

train auc 0.8862870266122028
train auc 0.8525653101060211


## Create Interaction Matrix
Create user-movie interaction matrix and create train and test sets for the next steps.
> Note: Over extended ratings list

In [40]:
X, new_ratings = bpr.create_matrix(
    new_ratings, col_user_id, col_movie_id, col_rating, threshold
)

In [41]:
X_train, X_test = bpr.create_train_test(X, test_size=0.2, seed=1234)
# parameters were randomly chosen
bpr_params = {
    "reg": 0.01,
    "learning_rate": 0.1,
    "n_iters": 160,
    "n_factors": 15,
    "batch_size": 100,
}
bpr = bpr.BPR(**bpr_params)
np.int = int

bpr.fit(X_train)

BPR: 100%|██████████| 160/160 [00:32<00:00,  4.98it/s]


<bpr.BPR at 0x7f9f614ed300>

In [42]:
train_auc, train_trues, train_preds = metrics.mae_auc_score(bpr, X_train)
print("train auc", train_auc)

test_auc, test_trues, test_preds = metrics.mae_auc_score(bpr, X_test)
print("train auc", test_auc)

train auc 0.8916725171527122
train auc 0.8592517632566062
