"**This set of codes looks at a recommendation that is based on random selection.**

In [1]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter
from textblob import TextBlob
import re
from spellchecker import SpellChecker
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sys
import warnings
import timeit
import os
import random
import pickle

In [2]:
if not sys.warnoptions:
    warnings.simplefilter('ignore')
    warnings.filterwarnings('ignore')

pd.set_option('display.max_colwidth',-1)

## 1. Import Processed Anime Data

In [3]:
df = pd.read_csv("processed_data/processed_anime.csv")
print('Data has {} rows and {} columns'.format(df.shape[0], df.shape[1]))

Data has 4808 rows and 92 columns


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4808 entries, 0 to 4807
Data columns (total 92 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   index                4808 non-null   int64  
 1   Unnamed: 0           4808 non-null   int64  
 2   Unnamed: 0.1         4808 non-null   int64  
 3   Title                4808 non-null   object 
 4   URL                  4808 non-null   object 
 5   English              2862 non-null   object 
 6   Synonyms             3075 non-null   object 
 7   Japanese             4801 non-null   object 
 8   Type                 4808 non-null   object 
 9   Episodes             4585 non-null   float64
 10  Status               4808 non-null   object 
 11  Aired                4808 non-null   object 
 12  Premiered            4808 non-null   object 
 13  Broadcast            4808 non-null   object 
 14  Producers            4808 non-null   object 
 15  Licensors            4808 non-null   o

In [5]:
# duplicate dataset
df_n = df.copy()

## 2. Calculate Hit Rate @ 10 based on Random Selection of Animes to be Recommended


**import dataset**

In [6]:
df_review = pd.read_csv('processed_data/processed_reviews.csv')
df_review.shape

(75921, 7)

In [7]:
df_review.columns

Index(['uid', 'profile', 'anime_uid', 'text', 'score', 'scores', 'link'], dtype='object')

In [8]:
# check the distribution of ratings
df_review['score'].describe()

count    75921.000000
mean     7.353038    
std      2.219829    
min      0.000000    
25%      6.000000    
50%      8.000000    
75%      9.000000    
max      10.000000   
Name: score, dtype: float64

**a. Train Test Split**

Apply the leave-one-out methodology to do train-test split. For each user, the most recent review is used as the test set. The most recent review is indicated by the larger uid. The reamining ratings would be used in the train dataset. This will help to ensure no data leakage.

In [9]:
# create test dataset
test = df_review.loc[df_review.groupby('profile')['uid'].idxmax()]
test.shape

(15363, 7)

In [10]:
# create train dataset
train = df_review[~(df_review['uid'].isin(test['uid'].tolist()))]
train.shape

(60558, 7)

**b. Identify animes which have been watched by users as well as those which have not been watched by users**

In [11]:
# create a unique list of anime_uid
anime_uid = list(set(df['anime_uid']))
len(anime_uid)

4808

In [12]:
# create a unique list of profiles from train dataset
profile_list = list(set(train['profile'].tolist()))
len(profile_list)

15363

In [13]:
# identify list of animes watched 
# identify the list of animes not watched. As the not watched list is huge, 
# we randomly sample 99 animes not watched items + the one watched item in the test data to form the not watched list.
# we repeat this random sampling 10 times to ensure robustness while accounting for limited computational resource.

watched_list = []
not_watched_list_1 = []
not_watched_list_2 = []
not_watched_list_3 = []
not_watched_list_4 = []
not_watched_list_5 = []

for user in profile_list:
    
    # subset watched animes - i.e. those with ratings
    u_watched = train[train['profile']==user]['anime_uid'].tolist()
    watched_list.append(u_watched)
    
    # identify animes not watched i.e. those without ratings
    u_not_watched = list(set(anime_uid) - set(u_watched) - set(test[test['profile']==user]['anime_uid'].tolist()))
    
    # set seed to control reproducibility of sampling
    random.seed(2345)
    u_not_watched_1 = random.sample(u_not_watched, 99) + test[test['profile']==user]['anime_uid'].tolist()
    not_watched_list_1.append(u_not_watched_1)
    
    random.seed(2346)
    u_not_watched_2 = random.sample(u_not_watched, 99) + test[test['profile']==user]['anime_uid'].tolist()
    not_watched_list_2.append(u_not_watched_2)
    
    random.seed(2347)
    u_not_watched_3 = random.sample(u_not_watched, 99) + test[test['profile']==user]['anime_uid'].tolist()
    not_watched_list_3.append(u_not_watched_3)
    
    random.seed(2348)
    u_not_watched_4 = random.sample(u_not_watched, 99) + test[test['profile']==user]['anime_uid'].tolist()
    not_watched_list_4.append(u_not_watched_4)
    
    random.seed(2349)
    u_not_watched_5 = random.sample(u_not_watched, 99) + test[test['profile']==user]['anime_uid'].tolist()
    not_watched_list_5.append(u_not_watched_5)


In [14]:
# check len of watched_list and not watched list
print(len(watched_list))
print(len(not_watched_list_1))
print(len(not_watched_list_5))

15363
15363
15363


In [15]:
# check 
print(len(watched_list[0]))
print(len(not_watched_list_1[0]))
print(len(not_watched_list_5[0]))

2
100
100


**c. Randomly Select Animes to be Recommended to each user**

In [18]:
# Create a function to compute the predicted ratings for not watched list for each user

def get_recommends(profile, not_watched, train_dataset):
    
    '''
    Args:
        profile - list of unique profiles
        not_watched - list of not watched animes for all users
        train_dataset - train data
    
    Returns:
        recommend - list of 10 recommended animes

    '''
    
    # initialise list to store predictions
    recommend = []
    
    for i in range(len(profile)):
        
        anime_list = not_watched[i]
        
        recommend_idx = np.random.randint(100, size=10)
        recommended_animes = [anime_list[idx] for idx in recommend_idx]
        recommend.append(recommended_animes)
        
    return recommend


In [19]:
# Generate recommended items for each user

start_time = timeit.default_timer()

recommended_1 = get_recommends(profile_list, not_watched_list_1, train)
print('done')
recommended_2 = get_recommends(profile_list, not_watched_list_2, train)
print('done')
recommended_3 = get_recommends(profile_list, not_watched_list_3, train)
print('done')
recommended_4 = get_recommends(profile_list, not_watched_list_4, train)
print('done')
recommended_5 = get_recommends(profile_list, not_watched_list_5, train)
print('done')

elapsed = timeit.default_timer() - start_time
print('Time taken:', elapsed)

done
done
done
done
done
Time taken: 1.7177855999999565


In [22]:
# check
recommended_1[1]

[40357, 40357, 10578, 33245, 148, 36740, 39960, 38161, 2921, 33850]

**d. Evaluate Hit Rate @ 10 for Proposed Recommender**

In [23]:
# create function to evaluate hit rate @ 10

def hit_rate(animes_rec, test_dataset, profile):
    
    '''
    Args:
        animes_rec - the nest list of top 10 recommended animes for each user 
        test_dataset - test data
        profile - list of unique profiles
    
    Returns:
        hit_rate - hit rate @ 10
    '''
    
    hit = 0
    
    for i in range(len(animes_rec)):
    
        user = profile[i]
        
        recommended_animes = animes_rec[i]
    
        # check if the test data is found in the randomly selected top 10 recommended animes
        anime_watched = test_dataset[test_dataset['profile']==user]['anime_uid']

        if anime_watched.values in recommended_animes:
            hit += 1
        else:
            hit = hit
    
    hit_rate = round((hit/len(profile))*100,2)
    
    return hit_rate

In [25]:
# Generate hit rate @ 10 for the different samples

start_time = timeit.default_timer()

hit_rate_1 = hit_rate(recommended_1, test, profile_list)
hit_rate_2 = hit_rate(recommended_2, test, profile_list)
hit_rate_3 = hit_rate(recommended_3, test, profile_list)
hit_rate_4 = hit_rate(recommended_4, test, profile_list)
hit_rate_5 = hit_rate(recommended_5, test, profile_list)

elapsed = timeit.default_timer() - start_time
print('Time taken:', elapsed)


Time taken: 85.18538860000001


In [26]:
# print hit rates
print('Hit Rate @ 10 based on not_watched_list_1: {}%'.format(hit_rate_1))
print('Hit Rate @ 10 based on not_watched_list_2: {}%'.format(hit_rate_2))
print('Hit Rate @ 10 based on not_watched_list_3: {}%'.format(hit_rate_3))
print('Hit Rate @ 10 based on not_watched_list_4: {}%'.format(hit_rate_4))
print('Hit Rate @ 10 based on not_watched_list_5: {}%'.format(hit_rate_5))



Hit Rate @ 10 based on not_watched_list_1: 9.71%
Hit Rate @ 10 based on not_watched_list_2: 9.54%
Hit Rate @ 10 based on not_watched_list_3: 9.55%
Hit Rate @ 10 based on not_watched_list_4: 9.67%
Hit Rate @ 10 based on not_watched_list_5: 9.59%


In [27]:
# print avg hit rates
avg_hit_rate = (hit_rate_1 + hit_rate_2 + hit_rate_3 + hit_rate_4 + hit_rate_5)/5 
print('Average Hit Rate @ 10 based on the 5 different sets of randomly sampled not watched list: {}%'.format(avg_hit_rate))


Average Hit Rate @ 10 based on the 5 different sets of randomly sampled not watched list: 9.612%
