# 0. Configuration

In [218]:
# links to shared data MovieLens
# source on kaggle: https://www.kaggle.com/code/quangnhatbui/movie-recommender/data
RATINGS_SMALL_URL = 'https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link'
MOVIES_METADATA_URL = 'https://drive.google.com/file/d/19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb/view?usp=share_link'

# 1. Modules and functions

In [219]:
# just to make it available to download w/o SSL verification
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import numpy as np
import pandas as pd
import datetime as dt
import scipy.sparse as sp
from sklearn.model_selection import train_test_split
from math import log2

from itertools import islice, cycle, product

from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')


## 1. 1. Helper functions to avoid copy paste

In [220]:
def read_csv_from_gdrive(url):
    """
    gets csv data from a given url (taken from file -> share -> copy link)
    :url: example https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
    """
    file_id = url.split('/')[-2]
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data

# 2. Main

## 2.1. Load Data

`interactions` dataset shows list of movies that users watched, along with given ratings:

In [221]:
# interactions data
interactions = read_csv_from_gdrive(RATINGS_SMALL_URL)
interactions.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


`movies_metadata` dataset shows the list of movies existing on OKKO platform:

In [222]:
# information about films etc
movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [223]:
movies_metadata['id'] = movies_metadata['id'].astype(str)
interactions['movieId'] = interactions['movieId'].astype(str)

In [224]:
# leave only those films that intersect with each other
interactions_filtered = interactions.loc[interactions['movieId'].isin(movies_metadata['id'])]
print(interactions.shape, interactions_filtered.shape)

(100004, 4) (44989, 4)


## 2.2 Data preparation using LightFM Dataset

To use implicit kNN method `fit` we need a sparse matrix in COOrdinate format. To achieve that we will use `scipy.sparse.coo_matrix` from scipy;


In [225]:
def get_coo_matrix(
        df: pd.DataFrame, 
        user_col: str,
        item_col: str, 
        users_mapping: dict, 
        movies_mapping: dict,
        weight_col: str = None
        ):
    if weight_col is None:
        weights = np.ones(len(df), dtype=np.float32)
    else:
        weights = df[weight_col].astype(np.float32)
    interaction_matrix = sp.coo_matrix((
        weights, 
        (
            df[user_col].map(users_mapping.get), 
            df[item_col].map(movies_mapping.get)
        )
    ))
    return interaction_matrix


In [226]:
# define users mapping
users_inv_mapping = dict(enumerate(interactions_filtered['userId'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}
len(users_mapping)


671

In [227]:
# define movies mapping
movies_inv_mapping = dict(enumerate(interactions_filtered['movieId'].unique()))
movies_mapping = {v: k for k, v in movies_inv_mapping.items()}
len(movies_mapping)


2830

In [228]:
# defining train set on the whole interactions dataset (as HW you will have to split into test and train for evaluation)
train_mat = get_coo_matrix(
    interactions_filtered,
    user_col = 'userId',
    item_col = 'movieId',
    users_mapping = users_mapping,
    movies_mapping = movies_mapping
    ).tocsr()


In [229]:
train_mat

<671x2830 sparse matrix of type '<class 'numpy.float32'>'
	with 44989 stored elements in Compressed Sparse Row format>

## 2.3. Model Training & Evaluation

In [`implicit`](https://pypi.org/project/implicit/), there are various models and can be groupped into:
- Item-to-Item: KNN based on various similarities - CosineRecommender, BM25Recommender, TFIDFRecommender
- implicit ALS;
- Logistic Matrix Factorization;
- Bayesian Personalized Ranking (BPR)


### 2.3.1. Train Model

In [230]:
from implicit.nearest_neighbours import (
    CosineRecommender,
    BM25Recommender,
    TFIDFRecommender
    )


Note that in item-to-item models we need to provide matrix in the form of item-user by transposing initial COO matrix user-item


In [231]:
# fit the model
cosine_model = CosineRecommender(K = 20)
cosine_model.fit(train_mat)


  0%|          | 0/671 [00:00<?, ?it/s]

### 2.3.2. Evaluate the Model

In [232]:
# let's make sense-check
top_N = 10
user_id = interactions_filtered['userId'].iloc[0]
row_id = users_mapping[user_id]
print(f'Rekko for user {user_id}, row number in matrix - {row_id}')

Rekko for user 1, row number in matrix - 0


In [233]:
# create mapper for movieId and title names
movie_name_mapper = dict(zip(movies_metadata['id'], movies_metadata['original_title']))

In [234]:
recs = cosine_model.recommend(
    row_id,
    train_mat,
    N = top_N,
    filter_already_liked_items = True
    )
recs = pd.DataFrame(recs).T.rename(columns = {0: 'col_id', 1: 'similarity'})
recs['inv_movie_id'] = recs['col_id'].astype(int)
recs['movieId'] = recs['inv_movie_id'].map(movies_inv_mapping.get)
recs['title'] = recs['movieId'].map(movie_name_mapper)
recs


Unnamed: 0,col_id,similarity,inv_movie_id,movieId,title
0,653.0,0.861587,653,74458,Mere Brother Ki Dulhan
1,129.0,0.844531,129,1994,The Most Dangerous Game
2,606.0,0.654064,606,8011,Highlander III: The Sorcerer
3,294.0,0.625141,294,70,Million Dollar Baby
4,337.0,0.593856,337,170,28 Days Later
5,648.0,0.577499,648,68954,Longitude
6,579.0,0.571681,579,5956,Joshua
7,399.0,0.561442,399,1088,Whale Rider
8,278.0,0.561442,278,1584,School of Rock
9,150.0,0.557086,150,2100,The Last Castle


# TODO
- Make global train/ global test split -- train the model appropiately and predict on test set;  <b>done!</b>
- Wrap up in function recommendations - implicit_recommend(); <b>done!</b>
- Calculate `NDCG@10` on test set <b>done!</b>

## Train Test split

In [235]:
interactions_filtered["dttm"] = pd.to_datetime(interactions_filtered["timestamp"], unit="s")
interactions_filtered["dttm"] = interactions_filtered["dttm"].dt.strftime("%d-%m-%Y")
interactions_filtered.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,dttm
10,1,1371,2.5,1260759135,14-12-2009
11,1,1405,1.0,1260759203,14-12-2009
13,1,2105,4.0,1260759139,14-12-2009
15,1,2193,2.0,1260759198,14-12-2009
16,1,2294,2.0,1260759108,14-12-2009


Write divider functions

In [236]:
#finding the border on which date is changing
def train_test(df: pd.DataFrame, train_size: float, column: str = 'random'):
    '''
    df: pd.DataFrame that is needed to split;
    train_size: size of the train sample (from 0 to 1);
    column: column the sorting will be based on
    '''
    if column == 'random':
        global_train, global_test = train_test_split(df, train_size=train_size)

    else:
        df = df.sort_values(by=column)
        border_default = int(df.shape[0]*train_size)
        day_change_border = int(border_default - 1)
        while df[column].iloc[day_change_border] ==  df[column].iloc[border_default]:
            day_change_border-=1


        global_train = df.iloc[:day_change_border+1]
        global_test = df.iloc[day_change_border+1:]

    return global_train, global_test

In [237]:
global_train, global_test = train_test(interactions_filtered, 0.8, 'dttm')

print(global_train['dttm'].max(), global_test['dttm'].min())


24-06-2006 24-06-2009


In [238]:
print('global train size: {:.2f}, global test size: {:.2f}'\
      .format(global_train.shape[0]/interactions_filtered.shape[0]\
              , global_test.shape[0]/interactions_filtered.shape[0]))

global train size: 0.80, global test size: 0.20


In [239]:
# check if slicing is done right
print(global_train.shape[0] + global_test.shape[0] == interactions_filtered.shape[0])
print(global_train.shape[0]/interactions_filtered.shape[0] + global_test.shape[0]/interactions_filtered.shape[0])

True
1.0


In [240]:
#global train division
train_1lvl, validation = train_test(global_train, 0.8, 'dttm')
print(train_1lvl.shape[0], validation.shape[0])

28782 7202


In [241]:
train_mat = get_coo_matrix(
    train_1lvl,
    user_col = 'userId',
    item_col = 'movieId',
    users_mapping = users_mapping,
    movies_mapping = movies_mapping
    ).tocsr()

In [242]:
validation_mat = get_coo_matrix(
    validation,
    user_col = 'userId',
    item_col = 'movieId',
    users_mapping = users_mapping,
    movies_mapping = movies_mapping
    ).T.tocsr()

In [243]:
test_mat = get_coo_matrix(
    global_test,
    user_col = 'userId',
    item_col = 'movieId',
    users_mapping = users_mapping,
    movies_mapping = movies_mapping
    ).T.tocsr()

In [266]:
cosine_model = CosineRecommender(K = 20)
cosine_model.fit(train_mat.T)

  0%|          | 0/671 [00:00<?, ?it/s]

In [245]:
top_N = 10
user_id = interactions_filtered['userId'].iloc[0]
row_id = users_mapping[user_id]
print(f'Rekko for user {user_id}, row number in matrix - {row_id}')

Rekko for user 1, row number in matrix - 0


In [268]:
recs = cosine_model.recommend(
    row_id,
    train_mat,
    N = top_N,
    filter_already_liked_items = True
    )
recs = pd.DataFrame(recs).T.rename(columns = {0: 'col_id', 1: 'similarity'})
recs['inv_movie_id'] = recs['col_id'].astype(int)
recs['movieId'] = recs['inv_movie_id'].map(movies_inv_mapping.get)
recs['title'] = recs['movieId'].map(movie_name_mapper)
recs

Unnamed: 0,col_id,similarity,inv_movie_id,movieId,title
0,653.0,0.920074,653,74458,Mere Brother Ki Dulhan
1,129.0,0.91947,129,1994,The Most Dangerous Game
2,267.0,0.604371,267,4973,Sous le Sable
3,87.0,0.499033,87,329,Jurassic Park
4,211.0,0.490865,211,41566,New York Doll
5,579.0,0.48593,579,5956,Joshua
6,606.0,0.464227,606,8011,Highlander III: The Sorcerer
7,133.0,0.453383,133,2005,Sister Act
8,362.0,0.45039,362,471,Bandyta
9,302.0,0.435123,302,1732,The Prisoner of Zenda


In [247]:
recs = cosine_model.recommend(
    row_id,
    validation_mat,
    N = top_N,
    filter_already_liked_items = True
    )
recs = pd.DataFrame(recs).T.rename(columns = {0: 'col_id', 1: 'similarity'})
recs['inv_movie_id'] = recs['col_id'].astype(int)
recs['movieId'] = recs['inv_movie_id'].map(movies_inv_mapping.get)
recs['title'] = recs['movieId'].map(movie_name_mapper)
recs

Unnamed: 0,col_id,similarity,inv_movie_id,movieId,title
0,412.0,0.5,412,1267,Meet the Robinsons
1,86.0,0.447214,86,289,Casablanca
2,111.0,0.416025,111,1278,The Dreamers
3,87.0,0.401601,87,329,Jurassic Park
4,54.0,0.377964,54,552,Pane e Tulipani
5,653.0,0.363137,653,74458,Mere Brother Ki Dulhan
6,270.0,0.346718,270,8873,Wayne's World 2
7,21.0,0.342547,21,248,Pocketful of Miracles
8,43.0,0.33541,43,468,My Own Private Idaho
9,133.0,0.334286,133,2005,Sister Act


In [248]:
recs = cosine_model.recommend(
    row_id,
    test_mat,
    N = top_N,
    filter_already_liked_items = True
    )
recs = pd.DataFrame(recs).T.rename(columns = {0: 'col_id', 1: 'similarity'})
recs['inv_movie_id'] = recs['col_id'].astype(int)
recs['movieId'] = recs['inv_movie_id'].map(movies_inv_mapping.get)
recs['title'] = recs['movieId'].map(movie_name_mapper)
recs

Unnamed: 0,col_id,similarity,inv_movie_id,movieId,title
0,451.0,0.370853,451,1810,Viva Zapata!
1,29.0,0.322072,29,296,Terminator 3: Rise of the Machines
2,305.0,0.294059,305,2668,Sleepy Hollow
3,211.0,0.28833,211,41566,New York Doll
4,508.0,0.272286,508,2993,Una sull'altra
5,357.0,0.265743,357,335,C'era una volta il West
6,101.0,0.264061,101,913,The Thomas Crown Affair
7,267.0,0.262397,267,4973,Sous le Sable
8,129.0,0.258221,129,1994,The Most Dangerous Game
9,231.0,0.258181,231,4641,Read It and Weep


In [249]:
def implicit_recommend(data: pd.DataFrame, model, N: int, userid: str, liked_items=True):
    '''
    data: pd.DataFrame needed for predict on;
    model: trained model;
    N: number of items;
    userid: user for whom we recommend;
    liked_items: filter already liked items
    '''

    mat = get_coo_matrix(
        data,
        user_col = 'userId',
        item_col = 'movieId',
        users_mapping = users_mapping,
        movies_mapping = movies_mapping
        ).T.tocsr()
    

    row_id = users_mapping[userid]
    recs = model.recommend(row_id, mat, N=N, filter_already_liked_items=liked_items)

    recs = pd.DataFrame(recs).T.rename(columns = {0: 'col_id', 1: 'similarity'})
    recs['inv_movie_id'] = recs['col_id'].astype(int)
    recs['movieId'] = recs['inv_movie_id'].map(movies_inv_mapping.get)
    recs['title'] = recs['movieId'].map(movie_name_mapper)
    
    return recs
    

In [250]:
a = implicit_recommend(validation, cosine_model, 10, 1, True)
a

Unnamed: 0,col_id,similarity,inv_movie_id,movieId,title
0,412.0,0.5,412,1267,Meet the Robinsons
1,86.0,0.447214,86,289,Casablanca
2,111.0,0.416025,111,1278,The Dreamers
3,87.0,0.401601,87,329,Jurassic Park
4,54.0,0.377964,54,552,Pane e Tulipani
5,653.0,0.363137,653,74458,Mere Brother Ki Dulhan
6,270.0,0.346718,270,8873,Wayne's World 2
7,21.0,0.342547,21,248,Pocketful of Miracles
8,43.0,0.33541,43,468,My Own Private Idaho
9,133.0,0.334286,133,2005,Sister Act


## NDCG

In [251]:
def compute_gain(y_value: float, gain_scheme: str) -> float:
    
    gain = {'exp2': 2 ** y_value - 1,
            'const': y_value}

    return float(gain[gain_scheme])

In [285]:
def dcg(y_true: np.array, y_pred: np.array, gain_scheme: str) -> float:
    
    dcg = 0
    argsort = np.argsort(y_pred)[::-1]
    y_true_sorted = y_true[argsort]

    for idx, val in enumerate(y_true_sorted, 1):
        gain = compute_gain(val, gain_scheme)
        dcg += gain / log2(idx + 1)
    

    return dcg

In [286]:
def ndcg(y_true: np.array, ys_pred: np.array, k: int, gain_scheme: str = 'const')  -> float:
    
    #cutoff condition
    if len(ys_pred)>k:
        ys_pred_k = ys_pred[:k]
        y_true_k = y_true[:k]
    else:
        ys_pred_k = ys_pred
        y_true_k = y_true
    
    # pred dcg then we calc the same to find max possible
    preds_dcg = dcg(y_true_k, ys_pred_k, gain_scheme)

    # to find IDCG we need to create cutoff after sorting because we can miss values that are relevant, but at the end of the list:
    y_true_sorted = np.sort(y_true)[::-1]
    y_true_sorted_k = y_true_sorted[:k]
    max_possible_dcg = dcg(y_true_sorted_k, y_true_sorted_k, gain_scheme)

    if max_possible_dcg == 0:
        return preds_dcg / 1
    else:   
        return preds_dcg / max_possible_dcg

In [254]:
map_films = {}
map_score = {}

for num, usr in enumerate(global_test['userId']):
    pred = implicit_recommend(global_test, cosine_model, 10, usr, True)
    map_films[usr] = np.array(pred['movieId'])
    #Use similarity score as predicted relevance
    map_score[usr] = np.array([int(i*10) for n, i in enumerate(pred['similarity'])])
    


In [255]:
# we get all interacted items for each user and save it in dictionary {'userId': [items list]}
known_items = interactions_filtered.groupby('userId')['movieId'].apply(list).to_dict()
len(known_items)

rating_dict = {}
for user_id, movies in known_items.items():
    rating_dict[user_id] = [interactions_filtered[(interactions_filtered['userId'] == user_id)\
                                                    & (interactions_filtered['movieId'] == movie)]['rating'].iloc[0] for movie in movies]




In [256]:
global_test['recfilms'] = global_test['userId'].map(map_films)
global_test['relevance'] = global_test['userId'].map(map_score)
global_test['watched'] = global_test['userId'].map(known_items)
global_test['watched_films_ratings'] = global_test['userId'].map(rating_dict)

In [257]:
def get_y_true(row):
    watched_films = row['watched']
    watched_films_ratings = row['watched_films_ratings']
    recfilms = row['recfilms']
    y_true = []
    for recfilm in recfilms:
        if recfilm in watched_films:
            y_true.append(int(watched_films_ratings[watched_films.index(recfilm)]))
        else:
            y_true.append(0)
    return y_true

global_test['y_true'] = global_test.apply(get_y_true, axis=1)


In [287]:
global_test[['userId', 'y_true', 'relevance']].head(5)

Unnamed: 0,userId,y_true,relevance
41996,301,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"
41988,301,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"
80243,547,"[4, 5, 0, 4, 3, 0, 5, 4, 4, 3]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"
41986,301,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"
41985,301,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"


In [288]:
global_test['NDCG'] = global_test.apply(
    lambda row: ndcg(np.array(row['y_true']), np.array(row['relevance']), k=10, gain_scheme='exp2'), 
    axis=1)

In [289]:
results = global_test[['userId', 'NDCG', 'y_true', 'relevance']]
results.head(5)

Unnamed: 0,userId,NDCG,y_true,relevance
41996,301,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"
41988,301,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"
80243,547,0.922532,"[4, 5, 0, 4, 3, 0, 5, 4, 4, 3]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"
41986,301,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"
41985,301,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"


In [278]:
results[results['NDCG']<1]

Unnamed: 0,userId,NDCG,y_true,relevance
80243,547,0.922532,"[4, 5, 0, 4, 3, 0, 5, 4, 4, 3]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"
80264,547,0.922532,"[4, 5, 0, 4, 3, 0, 5, 4, 4, 3]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"
51882,380,0.930759,"[3, 5, 0, 2, 0, 0, 0, 4, 0, 0]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"
94807,624,0.860687,"[0, 5, 0, 3, 4, 0, 0, 0, 0, 0]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"
94842,624,0.860687,"[0, 5, 0, 3, 4, 0, 0, 0, 0, 0]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"
...,...,...,...,...
45521,324,0.386853,"[0, 0, 0, 0, 0, 0, 0, 3, 0, 0]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"
45519,324,0.386853,"[0, 0, 0, 0, 0, 0, 0, 3, 0, 0]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"
45518,324,0.386853,"[0, 0, 0, 0, 0, 0, 0, 3, 0, 0]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"
45515,324,0.386853,"[0, 0, 0, 0, 0, 0, 0, 3, 0, 0]","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"
