In [None]:
from bs4 import BeautifulSoup
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score
import numpy as np
from operator import itemgetter
import pandas as pd
import requests
from sklearn.model_selection import KFold
import time

def getData(filepath):
    """Load the CSV file at ``filepath`` and return it as a pandas DataFrame."""
    return pd.read_csv(filepath)

In [None]:
def filterData(data):
    """Keep only the rows whose ``status`` equals 2 and whose ``score`` is positive.

    Other filters that were tried here (status codes are described in
    MAL-Scraper):

    * every row with a non-zero score:   ``data['score'] > 0``
    * status 1, 2 or 6, zero scores kept: ``data['status'].isin({1, 2, 6})``
    * status 1 or 2, zero scores kept:    ``data['status'] <= 2``

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``status`` and ``score`` columns.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    status_is_two = data['status'] == 2
    score_is_positive = data['score'] > 0
    return data.loc[status_is_two & score_is_positive]

In [None]:
def buildDataset(data):
    """Create a LightFM ``Dataset`` fitted on the users and anime ids of ``data``.

    ``data`` must provide ``user`` and ``anime_id`` columns; their values are
    passed to ``Dataset.fit`` as the users and the items respectively.
    """
    user_names = data['user'].values
    anime_ids = data['anime_id'].values

    lightfm_dataset = Dataset()
    lightfm_dataset.fit(user_names, anime_ids)
    return lightfm_dataset

In [None]:
def buildModel(data, dataset):
    """Fit a LightFM model with BPR loss on every row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``user``, ``anime_id`` and ``score`` columns.
    dataset : lightfm.data.Dataset
        Dataset used to turn the rows into matrices via
        ``build_interactions``.

    Returns
    -------
    lightfm.LightFM
        The model after 10 training epochs.
    """
    # Each row becomes a (user, item, weight) triple; the score is the weight.
    triples = zip(data['user'].values,
                  data['anime_id'].values,
                  data['score'].values)
    interaction_matrix, weight_matrix = dataset.build_interactions(triples)

    recommender = LightFM(loss='bpr')
    recommender.fit(interaction_matrix, sample_weight=weight_matrix, epochs=10)
    return recommender

In [None]:
def kFold(data, dataset, k):
    """Run k-fold cross-validation of a BPR LightFM model over the rows of ``data``.

    Rows are shuffled with a fixed seed (``random_state=1``) and split into
    ``k`` folds. For every fold a fresh model is trained for 10 epochs on the
    training rows, then its mean AUC is measured on the training rows and on
    the held-out rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``user``, ``anime_id`` and ``score`` columns.
    dataset : lightfm.data.Dataset
        Dataset used to build the interaction and weight matrices.
    k : int
        Number of folds.

    Returns
    -------
    tuple of (list, list)
        ``(train_auc_list, test_auc_list)`` with one mean AUC per fold.
    """
    def to_matrices(frame):
        # (interactions, weights) for the rows of ``frame``; score is the weight.
        return dataset.build_interactions(zip(frame['user'].values,
                                              frame['anime_id'].values,
                                              frame['score'].values))

    splitter = KFold(n_splits=k, shuffle=True, random_state=1)
    train_scores, test_scores = [], []

    for train_rows, test_rows in splitter.split(data):
        train_interactions, train_weights = to_matrices(data.iloc[train_rows])
        # Only the held-out interactions are needed; their weights are unused.
        test_interactions, _ = to_matrices(data.iloc[test_rows])

        fold_model = LightFM(loss='bpr')
        fold_model.fit(train_interactions, sample_weight=train_weights, epochs=10)

        train_scores.append(auc_score(fold_model, train_interactions).mean())
        # Passing train_interactions lets auc_score leave the training pairs
        # out of the held-out ranking.
        test_scores.append(auc_score(fold_model,
                                     test_interactions,
                                     train_interactions=train_interactions).mean())

    return (train_scores, test_scores)

In [None]:
def buildMappings(dataset):
    """Extract the id lookups needed for recommending from ``dataset.mapping()``.

    Returns
    -------
    tuple
        ``(users, animes_list, item_map, animes)`` where

        * ``users`` is element 0 of ``dataset.mapping()``
          ({username: LightFM user id}),
        * ``animes_list`` is a numpy array of every LightFM anime id,
        * ``item_map`` is element 2 of ``dataset.mapping()``
          ({MAL anime id: LightFM anime id}),
        * ``animes`` is the inverse of ``item_map``
          ({LightFM anime id: MAL anime id}).
    """
    all_maps = dataset.mapping()
    user_lookup = all_maps[0]
    mal_to_internal = all_maps[2]

    internal_to_mal = {}
    for mal_id, internal_id in mal_to_internal.items():
        internal_to_mal[internal_id] = mal_id

    internal_ids = np.array(list(internal_to_mal))
    return (user_lookup, internal_ids, mal_to_internal, internal_to_mal)

In [None]:
def getTitle(anime_ids):
    """Scrape the title, page URL and image URL for each MyAnimeList id.

    Every page is requested up to 10 times; after a failed attempt the
    function sleeps for 5 seconds before trying again. An id whose 10
    attempts all fail is skipped, so the result can be shorter than
    ``anime_ids``.

    Parameters
    ----------
    anime_ids : iterable
        Anime ids; each one is converted with ``str`` and appended to the
        base URL.

    Returns
    -------
    list of tuple
        ``(title, url, image_url)`` for each id that could be scraped, in
        input order.
    """
    base_url = "https://myanimelist.net/anime/"
    # A page lacking the expected markup fails in several ways:
    # AttributeError when <title> is absent, TypeError when the <img> lookup
    # returns None, KeyError when the tag has no src attribute. All of them,
    # like any requests failure (including a timeout), are worth a retry.
    retryable = (requests.RequestException, AttributeError, TypeError, KeyError)
    titles_list = []

    for anime_id in anime_ids:
        # The URL does not change between attempts, so build it once.
        url = base_url + str(anime_id)
        for _ in range(10):
            try:
                # A timeout keeps one stalled connection from hanging the run.
                page = requests.get(url, timeout=30)
                soup = BeautifulSoup(page.text, "lxml")
                title = soup.title.text.split(" - ")[0].strip('\n')
                image_url = soup.find('img', attrs={"class": 'ac'})['src']
            except retryable:
                #To comply with rate limiting
                time.sleep(5)
                continue

            titles_list.append((title, url, image_url))
            break

    return titles_list

In [None]:
#Alternative to getTitle with a different error handling method:
#each page is requested only once, and a failure is recorded in the result
#instead of being retried, so some entries may be error records.
def getTitleAlt(anime_ids):
    """Scrape title, page URL and image URL for each id, one attempt per id.

    The function sleeps 5 seconds before every request. When a request or
    the parsing of its page fails, an error entry is appended for that id and
    processing moves on to the next one.

    Parameters
    ----------
    anime_ids : iterable
        Anime ids; each one is converted with ``str`` and appended to the
        base URL.

    Returns
    -------
    list of tuple
        One entry per id, in input order: ``(title, url, image_url)`` on
        success, or ``(error_string, exception)`` on failure.
    """
    base_url = "https://myanimelist.net/anime/"
    # A page lacking the expected markup fails in several ways:
    # AttributeError when <title> is absent, TypeError when the <img> lookup
    # returns None, KeyError when the tag has no src attribute. All of them,
    # like any requests failure (including a timeout), become error entries.
    recordable = (requests.RequestException, AttributeError, TypeError, KeyError)
    titles_list = []

    for anime_id in anime_ids:
        #To comply with rate limiting
        time.sleep(5)
        url = base_url + str(anime_id)
        try:
            # A timeout keeps one stalled connection from hanging the run.
            page = requests.get(url, timeout=30)
            soup = BeautifulSoup(page.text, "lxml")
            title = soup.title.text.split(" - ")[0].strip('\n')
            image_url = soup.find('img', attrs={"class": 'ac'})['src']
        except recordable as e:
            error_string = ("There was an error in accessing the page."
                            " You can access the page manually with the url:"
                            " {}".format(url))
            titles_list.append((error_string, e))
            continue
        titles_list.append((title, url, image_url))

    return titles_list

In [None]:
def getTopN(user, n, data, model, mappings):
    """Return title information for the ``n`` best-scoring unwatched anime.

    Parameters
    ----------
    user : hashable
        User name; must be a key of ``mappings[0]``.
    n : int
        Maximum number of recommendations. Fewer are returned when the user
        has fewer than ``n`` unwatched anime.
    data : pandas.DataFrame
        Frame with ``user`` and ``anime_id`` columns. It is not modified.
    model : object
        Fitted model exposing ``predict(user_id, item_ids)``, where a higher
        score means a stronger recommendation.
    mappings : tuple
        ``(users, animes_list, item_map, animes)``: {username: internal user
        id}, array of all internal anime ids, {MAL anime id: internal anime
        id} and {internal anime id: MAL anime id}.

    Returns
    -------
    list
        The result of ``getTitle`` for the selected MAL anime ids, best
        first; an empty list when there is nothing to recommend.

    Raises
    ------
    KeyError
        If ``user`` is not in ``mappings[0]`` or one of the user's anime ids
        is not in ``mappings[2]``.
    """
    user_id = mappings[0][user]
    all_items = np.asarray(mappings[1])

    # Translate into a new array: writing the internal ids back into the
    # column's ``.values`` could modify ``data`` itself or fail on a
    # read-only array.
    watched_mal_ids = data.loc[data['user'] == user, 'anime_id']
    watched_items = np.array([mappings[2][anime_id] for anime_id in watched_mal_ids],
                             dtype=all_items.dtype)

    # Candidates are the known items minus the ones the user already has.
    unwatched_anime = np.setdiff1d(all_items, watched_items)
    if n <= 0 or unwatched_anime.size == 0:
        return []

    scores = np.asarray(model.predict(user_id, unwatched_anime))
    # Negate so the highest scores come first; slicing (rather than indexing
    # n times) tolerates having fewer than n candidates.
    best_first = np.argsort(-scores, kind='stable')[:n]

    topN = [mappings[3][item] for item in unwatched_anime[best_first]]
    return getTitle(topN)

In [None]:
def buildTest():
    """Evaluate the recommender on ``data.csv`` with 5-fold cross-validation.

    Loads and filters the data, fits the dataset, runs ``kFold`` with 5
    folds, then prints the mean and variance of the train and test AUC
    followed by the k-fold runtime and the total runtime in seconds.
    """
    total_start = time.time()
    clean_data = filterData(getData('data.csv'))
    dataset = buildDataset(clean_data)

    kfold_start = time.time()
    train_scores, test_scores = kFold(clean_data, dataset, 5)
    train_scores = np.array(train_scores, dtype=np.float64)
    test_scores = np.array(test_scores, dtype=np.float64)

    # All statistics are computed up front, then printed in a fixed order.
    report = [
        ("Average collaborative filtering train AUC: {}", np.mean(train_scores)),
        ("Average collaborative filtering test AUC: {}", np.mean(test_scores)),
        ("Collaborative filtering train AUC variance: {}", np.var(train_scores)),
        ("Collaborative filtering test AUC variance: {}", np.var(test_scores)),
    ]
    for template, value in report:
        print(template.format(value))

    finished = time.time()

    print("Runtime of kFold is: {} seconds.".format(finished - kfold_start))
    print("Total runtime is: {} seconds.".format(finished - total_start))

# Run the 5-fold cross-validation report on data.csv when this cell executes.
buildTest()

In [None]:
def build(user, n, file):
    """Train on the CSV at ``file`` and print the top ``n`` picks for ``user``.

    The data is loaded, filtered and used to fit a dataset and a model; the
    recommendations from ``getTopN`` are printed, followed by the runtime of
    ``getTopN`` and the total runtime in seconds.

    Parameters
    ----------
    user : str
        User name to recommend for.
    n : int
        Number of recommendations requested.
    file : str
        Path of the CSV file to load.
    """
    total_start = time.time()
    clean_data = filterData(getData(file))
    dataset = buildDataset(clean_data)
    model = buildModel(clean_data, dataset)
    mappings = buildMappings(dataset)

    ranking_start = time.time()
    recommendations = getTopN(user, n, clean_data, model, mappings)
    print(recommendations)
    finished = time.time()

    print("Runtime of getTopN is: {} seconds.".format(finished - ranking_start))
    print("Total runtime is: {} seconds.".format(finished - total_start))
    
# Print 10 recommendations for the user "FinalReality56" using data.csv.
build("FinalReality56", 10, 'data.csv')