# Imports

In [1]:
import os
import nltk
import math
import re
import unicodedata
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from numpy import dot
from numpy.linalg import norm
from scipy import spatial
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from transformers import BertTokenizer
from nltk.stem.porter import PorterStemmer

  from .autonotebook import tqdm as notebook_tqdm


# Constants

In [2]:
train_test_ratio = 0.80
k=10

# Load dataframes and Train-test split

In [3]:
# Read and load ratings dataframe.
data = pd.read_csv(os.path.join(os.getcwd(), 'data', 'ml-latest-small', 'ratings.csv'))
data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [4]:
# create list for train and test
train_list = list()
test_list = list()

In [5]:
# Update the above two list
def apply_function(df: pd.DataFrame):
    global train_test_ratio, train_list, test_list
    df = df.sample(frac=1, random_state=124)
    train_size = math.ceil(len(df) * train_test_ratio)
    train_list.append(df.head(train_size))
    test_list.append(df.tail(len(df) - train_size))
    
data.groupby(by=['userId']).apply(apply_function)

In [6]:
# Convert train list to dataframe
train_df = pd.concat(train_list)
train_df

Unnamed: 0,userId,movieId,rating,timestamp
120,1,2018,5.0,964980523
163,1,2528,3.0,964982328
56,1,1032,5.0,964982791
74,1,1213,5.0,964982951
224,1,3703,5.0,964981909
...,...,...,...,...
99992,610,6873,3.0,1479544373
100287,610,55995,3.0,1493846528
100728,610,117867,4.0,1493849881
100662,610,107348,3.5,1479544713


In [7]:
# Convert test list to dataframe
test_df = pd.concat(test_list)
test_df

Unnamed: 0,userId,movieId,rating,timestamp
67,1,1136,5.0,964981327
226,1,3740,4.0,964982417
180,1,2716,5.0,964983414
168,1,2596,5.0,964981144
112,1,1805,4.0,964983056
...,...,...,...,...
100496,610,85510,3.0,1493847872
99669,610,2028,5.0,1479545856
99551,610,293,5.0,1479542783
100202,610,46578,5.0,1479542918


In [8]:
# Unique userid in train data
train_df.userId.unique().size

610

In [9]:
# Unique movie ids in train data
train_df.movieId.unique().size

9024

In [10]:
# Check where train data contains all movies from the test set
set(train_df.movieId.unique()) >= set(test_df.movieId.unique())

False

In [11]:
# Number of movies which is in test dataframe but not in train dataframe
len(set(test_df.movieId.unique()).difference(set(train_df.movieId.unique())))

700

In [12]:
# Read and load movies df
movies_df = pd.read_csv(os.path.join(os.getcwd(), 'data', 'ml-latest-small', 'movies.csv'))
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [13]:
# Calculate and print unique genres
unique_genres = set()
for genres in movies_df.genres:
    for genre in genres.split("|"):
        unique_genres.add(genre)

unique_genres

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

# Tfidf content based recommendation

In [14]:
#Term Frequency - Inverse Document Frequency (TF-IDF) Recommender

class TfidfRecommender:

    def __init__(self, id_col):
        self.id_col = id_col
        self.tokenization_method = "scibert"

        # Initialize other variables used in this class
        self.tf = TfidfVectorizer()
        self.tfidf_matrix = dict()
        self.tokens = dict()
        self.stop_words = frozenset()
        self.recommendations = dict()
        self.top_k_recommendations = pd.DataFrame()

    def __clean_text(self, text, for_BERT=False, verbose=False):
        try:
            # Normalize unicode
            text_norm = unicodedata.normalize("NFC", text)

            # Remove HTML tags, new line, tabs and all punctuation and special characters
            clean = re.sub("<.*?>", "", text_norm)
            clean = clean.replace("\n", " ")
            clean = clean.replace("\t", " ")
            clean = clean.replace("\r", " ")
            clean = clean.replace("Â\xa0", "")  # non-breaking space
            clean = re.sub(r"([^\s\w]|_)+", "", clean)
            
            if for_BERT is False:
                # Lower case
                clean = clean.lower()
        except Exception:
            if verbose is True:
                print("Cannot clean non-existent text")
            clean = ""
        return clean

    def clean_dataframe(self, df, cols_to_clean, new_col_name="cleaned_text"):
        
        # Collapse the table such that all descriptive text is just in a single column
        df = df.replace(np.nan, "", regex=True)
        df[new_col_name] = df[cols_to_clean].apply(lambda cols: " ".join(cols), axis=1)
        for_BERT = True
        
        # Clean the text in the dataframe
        df[new_col_name] = df[new_col_name].map(lambda x: self.__clean_text(x, for_BERT))
        return df

    def tokenize_text(self, df_clean, text_col="cleaned_text", ngram_range=(1, 3), min_df=0):
        vectors = df_clean[text_col]

        tf = TfidfVectorizer(analyzer="word", ngram_range=ngram_range, min_df=min_df, stop_words="english",)
        bert_method = "allenai/scibert_scivocab_cased"

        # Load pre-trained model tokenizer (vocabulary)
        tokenizer = BertTokenizer.from_pretrained(bert_method)

        # Loop through each item
        vectors_tokenized = vectors.copy()
        for i in range(0, len(vectors)):
            vectors_tokenized[i] = " ".join(tokenizer.tokenize(vectors[i]))

        # Save to class variable
        self.tf = tf

        return tf, vectors_tokenized

    def fit(self, tf, vectors_tokenized):
        self.tfidf_matrix = tf.fit_transform(vectors_tokenized)

    def get_tokens(self):
        try:
            self.tokens = self.tf.vocabulary_
        except Exception:
            self.tokens = "Run .tokenize_text() and .fit_tfidf() first"
        return self.tokens

    def get_stop_words(self):
        try:
            self.stop_words = self.tf.get_stop_words()
        except Exception:
            self.stop_words = "Run .tokenize_text() and .fit_tfidf() first"
        return self.stop_words

    def __create_full_recommendation_dictionary(self, df_clean):

        # Similarity measure
        cosine_sim = linear_kernel(self.tfidf_matrix, self.tfidf_matrix)

        # sorted_idx has the indices that would sort the array.
        sorted_idx = np.argsort(cosine_sim, axis=1)

        data = list(df_clean[self.id_col].values)
        len_df_clean = len(df_clean)

        results = {}
        for idx, row in zip(range(0, len_df_clean), data):
            similar_indices = sorted_idx[idx][: -(len_df_clean + 1) : -1]
            similar_items = [(cosine_sim[idx][i], data[i]) for i in similar_indices]
            results[row] = similar_items[1:]

        # Save to class
        self.recommendations = results

    def __organize_results_as_tabular(self, df_clean, k):
        
        # Initialize new dataframe to hold recommendation output
        item_id = list()
        rec_rank = list()
        rec_score = list()
        rec_item_id = list()

        # For each item
        for _item_id in self.recommendations:
            # Information about the item we are basing recommendations off of
            rec_based_on = tmp_item_id = _item_id

            # Get all scores and IDs for items recommended for this current item
            rec_array = self.recommendations.get(rec_based_on)
            tmp_rec_score = list(map(lambda x: x[0], rec_array))
            tmp_rec_id = list(map(lambda x: x[1], rec_array))

            # Append multiple values at a time to list
            item_id.extend([tmp_item_id] * k)
            rec_rank.extend(list(range(1, k + 1)))
            rec_score.extend(tmp_rec_score[:k])
            rec_item_id.extend(tmp_rec_id[:k])

        # Save the output
        output_dict = {
            self.id_col: item_id,
            "rec_rank": rec_rank,
            "rec_score": rec_score,
            "rec_" + self.id_col: rec_item_id,
        }

        # Convert to dataframe
        self.top_k_recommendations = pd.DataFrame(output_dict)

    def recommend_top_k_items(self, df_clean, k=5):
        
        self.__create_full_recommendation_dictionary(df_clean)
        self.__organize_results_as_tabular(df_clean, k)

        return self.top_k_recommendations

    def __get_single_item_info(self, metadata, rec_id):

        # Return row
        rec_info = metadata.iloc[int(np.where(metadata[self.id_col] == rec_id)[0])]

        return rec_info

    def __make_clickable(self, address):
        return '<a href="{0}">{0}</a>'.format(address)

    def get_top_k_recommendations(self, metadata, query_id, cols_to_keep=[]):

        # Create subset of dataframe with just recommendations for the item of interest
        df = self.top_k_recommendations.loc[
            self.top_k_recommendations[self.id_col] == query_id
        ].reset_index()

        # Remove id_col of query item
        df.drop([self.id_col], axis=1, inplace=True)

        # Add metadata for each recommended item (rec_<id_col>)
        metadata_cols = metadata.columns.values
        df[metadata_cols] = df.apply(lambda row: self.__get_single_item_info(metadata, row["rec_" + self.id_col]),
            axis=1,)

        # Remove id col added from metadata (already present from self.top_k_recommendations)
        df.drop([self.id_col], axis=1, inplace=True)

        # Rename columns such that rec_ is no longer appended, for simplicity
        df = df.rename(columns={"rec_rank": "rank", "rec_score": "similarity_score"})

        # Only keep columns of interest
        if len(cols_to_keep) > 0:
            # Insert our recommendation scoring/ranking columns
            cols_to_keep.insert(0, "similarity_score")
            cols_to_keep.insert(0, "rank")
            df = df[cols_to_keep]

        # Make URLs clickable if they exist
        if "url" in list(map(lambda x: x.lower(), metadata_cols)):
            format_ = {"url": self.__make_clickable}
            df = df.head().style.format(format_)

        return df

In [15]:
# Create object of TfidfRecommender class
tfidf_recommender = TfidfRecommender('movieId')

In [16]:
# clean dataframe
clean_df = tfidf_recommender.clean_dataframe(movies_df, ['genres'])
clean_df

Unnamed: 0,movieId,title,genres,cleaned_text
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,AdventureAnimationChildrenComedyFantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy,AdventureChildrenFantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,ComedyRomance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,ComedyDramaRomance
4,5,Father of the Bride Part II (1995),Comedy,Comedy
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,ActionAnimationComedyFantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,AnimationComedyFantasy
9739,193585,Flint (2017),Drama,Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,ActionAnimation


In [17]:
# tokenized dataframe using scibert tokenizer
tfidf_vectorizer, tokenized_df = tfidf_recommender.tokenize_text(clean_df)
tokenized_df

0       advent ##urea ##ni ##mat ##ion ##child ##ren #...
1              advent ##ure ##child ##ren ##fan ##ta ##sy
2                                  come ##dy ##roma ##nce
3                       come ##dy ##dr ##ama ##roma ##nce
4                                               come ##dy
                              ...                        
9737    action ##ani ##mat ##ion ##come ##dy ##fan ##t...
9738                animation ##come ##dy ##fan ##ta ##sy
9739                                             dram ##a
9740                             action ##ani ##mat ##ion
9741                                            come ##dy
Name: cleaned_text, Length: 9742, dtype: object

In [18]:
# Firt model for tokenized data
tfidf_recommender.fit(tfidf_vectorizer, tokenized_df)

In [19]:
# Generate tfidf matrix
tfidf_recommender.tfidf_matrix

<9742x898 sparse matrix of type '<class 'numpy.float64'>'
	with 95629 stored elements in Compressed Sparse Row format>

In [20]:
# Generate top k recommendation
recommended_df = tfidf_recommender.recommend_top_k_items(clean_df, k=10)
recommended_df

Unnamed: 0,movieId,rec_rank,rec_score,rec_movieId
0,1,1,1.0,4886
1,1,2,1.0,65577
2,1,3,1.0,2294
3,1,4,1.0,53121
4,1,5,1.0,3114
...,...,...,...,...
97415,193609,6,1.0,101283
97416,193609,7,1.0,3596
97417,193609,8,1.0,3581
97418,193609,9,1.0,3552


In [21]:
# Top 10 recommendation for each movie
recommended_df

Unnamed: 0,movieId,rec_rank,rec_score,rec_movieId
0,1,1,1.0,4886
1,1,2,1.0,65577
2,1,3,1.0,2294
3,1,4,1.0,53121
4,1,5,1.0,3114
...,...,...,...,...
97415,193609,6,1.0,101283
97416,193609,7,1.0,3596
97417,193609,8,1.0,3581
97418,193609,9,1.0,3552


In [22]:
# Calculate user average rating dictionary
tmp_list = list()
user_avg_rating_dict = dict()
user_to_train_movies_dict = dict()

def apply_function(df: pd.DataFrame):
    df = df.drop(['timestamp'], axis=1)
    user_avg_rating_dict[df.iloc[0, 0]] = df.rating.mean()
    user_to_train_movies_dict[df.iloc[0, 0]] = set(df['movieId'].tolist())
    df.rating = df.rating - df.rating.mean()
    tmp_list.append(df)
    
train_df.groupby(by=['userId']).apply(apply_function)

In [23]:
# Calculate user similarity on test data
tmp_list = list()
# user_avg_rating_dict = dict()
user_similarity_on_test_dict = dict()

def apply_function(df: pd.DataFrame):
    df = df.drop(['timestamp'], axis=1)
    df = df.sort_values(by=['rating'], ascending=False)
    
    movie_id_list = df.loc[:, 'movieId'].tolist()
    movie_id_list = movie_id_list[: min(10, len(movie_id_list))]
    
    unique_movies = set()
    final_list = list()
    
    for movie_id in movie_id_list:
        recommendations = recommended_df[recommended_df['movieId'] == movie_id].loc[:, 'rec_movieId'].tolist()
        similarities = recommended_df[recommended_df['movieId'] == movie_id].loc[:, 'rec_score'].tolist()
        
        for similarity, m_id in zip(similarities, recommendations):
            if (m_id in unique_movies) or (m_id in user_to_train_movies_dict[df.iloc[0, 0]]):
                continue
            final_list.append([similarity, m_id])
            unique_movies.add(m_id)
    
    user_similarity_on_test_dict[df.iloc[0, 0]] = final_list
    
train_df.groupby(by=['userId']).apply(apply_function)

In [24]:
# Calculate number of user in the dict
len(user_similarity_on_test_dict)

610

In [25]:
# Calculate actual rating
actual_rating_on_test_dict = dict()
for index, row in test_df.iterrows():
    if row['userId'] in actual_rating_on_test_dict:
        actual_rating_on_test_dict[row['userId']].append((row['rating'], row['movieId']))
    else:
        tmp_lst = list()
        tmp_lst.append((row['rating'], row['movieId']))
        actual_rating_on_test_dict[row['userId']] = tmp_lst

In [26]:
# sort user and actual ratings based on similarity
for key in user_similarity_on_test_dict.keys():
    user_similarity_on_test_dict[key].sort(reverse=True)

for key in actual_rating_on_test_dict.keys():
    actual_rating_on_test_dict[key].sort(reverse=True)

In [27]:
# Compute top k user recommendation and all test set ratings
user_top_k_recommendation = dict()
actual_top_k_rating = dict()

for key in user_similarity_on_test_dict.keys():
    for idx in range(min(k, len(user_similarity_on_test_dict[key]))):
        if key in user_top_k_recommendation:
            user_top_k_recommendation[key].append(user_similarity_on_test_dict[key][idx][1])
        else:
            tmp_lst = list()
            tmp_lst.append(user_similarity_on_test_dict[key][idx][1])
            user_top_k_recommendation[key] = tmp_lst

for key in actual_rating_on_test_dict.keys():
    for idx in range(len(actual_rating_on_test_dict[key])):
        if key in actual_top_k_rating:
            actual_top_k_rating[key].append(actual_rating_on_test_dict[key][idx][1])
        else:
            tmp_lst = list()
            tmp_lst.append(actual_rating_on_test_dict[key][idx][1])
            actual_top_k_rating[key] = tmp_lst

In [28]:
# Calculate True Positive, False Positive and False Negative

TP = dict()
FP = dict()
FN = dict()

is_first = True

for key in user_top_k_recommendation.keys():
    
    tp = 0
    fp = 0
    fn = 0
    
    for val in user_top_k_recommendation[key]:
        if val in set(actual_top_k_rating[key]):
            tp += 1
        else:
            fp += 1
    
    for val in actual_top_k_rating[key]:
        if val not in set(user_top_k_recommendation[key]):
            fn += 1
    
    TP[key] = tp
    FP[key] = fp
    FN[key] = fn
    

In [29]:
# Calculate precision, recall and f1score
precision = 0
recall = 0
cnt = 0

for key in TP.keys():
    precision += TP[key] / (TP[key] + FP[key])
    recall += TP[key] / (TP[key] + FN[key])
    cnt += 1

precision = precision / cnt
recall = recall / cnt
f1_score = 2 * precision * recall / (precision + recall)

In [30]:
# Print Precision, Recall and F1score
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1_score)

Precision: 0.005901639344262298
Recall: 0.001988074583555684
F1 score: 0.0029742267689254523
