# BHT Data Applications project
# Automatic Anime recommendation Algorithm
### This project aims to create an algorithm that can determine what anime to recommend to a user.
##### Authors: Rashmi Di Michino and Antonin Mathubert

The dataset, which covers roughly 320,000 users and 16,000 anime, was taken from https://www.kaggle.com/datasets/hernan4444/anime-recommendation-database-2020 <br>
We are going to use this dataset to build a model that recommends anime based on the titles that a user is watching, has dropped, has put on hold, or has added to their watch list.

### 1. Importing and parsing the data
First, we want to import all of our available data in a suitable form so that it can be processed in the next steps of the project.<br><br>
Here, we propose two methods to do that. The first one is very slow, but it is the "clean" way to read JSON data. Because we don't actually need all the data from the files, we are going to use a second, hand-made method that is significantly faster.

In [1]:
from mlxtend.frequent_patterns import apriori, association_rules
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
#import cupy as cp
import random
import time
import os
import re

In [None]:
# Stream the interaction list from disk in pieces of 10,000 rows and collect
# every piece in a list.
dataset_chunks = pd.read_csv("dataset/anime/animelist.csv", chunksize=10000)
chunks = list(dataset_chunks)

# Keep a random third of the pieces (drawn without replacement) so that the
# working set is smaller than the full file.
sample_size = int(len(chunks) / 3)
chunks = random.sample(chunks, sample_size)

# Stitch the sampled pieces back together with a fresh 0..n-1 index and
# narrow the integer columns to smaller dtypes.
column_dtypes = {'user_id': "int32", 'anime_id': 'int32', "watching_status": "int16"}
dataset = pd.concat(chunks, ignore_index=True).astype(column_dtypes)

# Drop the references to the reader and to the individual pieces; only the
# concatenated frame is needed from here on.
dataset_chunks = None
chunks = None

In [2]:
# Load the complete interaction list, reading the CSV in pieces of 10,000
# rows and concatenating them into a single frame.
# NOTE(review): this absolute path is specific to one machine; point it at
# your local copy of animelist.csv before running the cell.
dataset_chunks = pd.read_csv("C:/Users/rashm/OneDrive/Desktop/data_applications_project/julius/anime_dataset/animelist.csv", chunksize=10000)

# Materialise every piece yielded by the reader.
chunks = list(dataset_chunks)

# Build one frame with a fresh 0..n-1 index and narrow the integer columns
# to smaller dtypes.
dataset = pd.concat(chunks, ignore_index=True)
dataset = dataset.astype({'user_id': "int32", 'anime_id': 'int32', "watching_status": "int16"})

# Release the reader and the per-piece frames: once concatenated they only
# duplicate the data already held by `dataset`.
dataset_chunks = None
chunks = None

In [3]:
# Discard the two columns that are not used in the rest of the analysis.
dataset = dataset.drop(columns=['rating', 'watched_episodes'])

# Row filter, expressed as one combined mask:
#   * anime_id below 10000 and user_id below 20000,
#   * user 61960 excluded,
#   * rows with watching_status 4 excluded
#     (presumably the "dropped" status -- confirm against the dataset docs).
keep_rows = (
    (dataset['anime_id'] < 10000)
    & (dataset['user_id'] < 20000)
    & (dataset['user_id'] != 61960)
    & (dataset['watching_status'] != 4)
)
dataset = dataset[keep_rows]

In [None]:
# Number of rows whose anime_id is below 10000 and whose user_id is below
# 20000: count the True entries of the mask.
in_range = (dataset['anime_id'] < 10000) & (dataset['user_id'] < 20000)
int(in_range.sum())

2509211

In [None]:
# Show the first 100 rows, then report the total number of rows.
preview = dataset.head(100)
display(preview)
dataset.shape[0]

In [4]:
# The watching status is no longer needed; remove the column.
dataset = dataset.drop(columns=["watching_status"])

In [5]:
# Reshape the long (user_id, anime_id) table into a wide one: one row per
# user, one column per anime. A cell holds the anime_id when that pair is
# present in the data and NaN otherwise.
dataset = pd.pivot(
    dataset,
    index='user_id',
    columns='anime_id',
    values='anime_id',
)

In [None]:
# Replace the DataFrame by a NumPy copy of its underlying values.
raw_values = dataset.to_numpy()
dataset = np.array(raw_values)

In [None]:
# Replace every NaN by zero; the result is a new array (copy=True is the
# default, spelled out here).
dataset = np.nan_to_num(dataset, copy=True, nan=0.0)

In [None]:
# Binarise the matrix: entries equal to zero are kept as they are, every
# other entry becomes 1.
dataset = np.where(dataset == 0, dataset, 1)

In [6]:
# Turn the table into a membership matrix: True where a value is present,
# False where it is missing. `notna()` builds a genuine boolean frame in one
# step instead of writing True into the existing columns through a mask,
# which would leave a mix of True and NaN whose dtype depends on the pandas
# version.
dataset = dataset.notna()

In [7]:
# Missing entries become False. The explicit cast guarantees a boolean dtype
# for every column regardless of how pandas types the result of `fillna`;
# mlxtend's apriori is meant to receive a boolean DataFrame.
dataset = dataset.fillna(False).astype(bool)

In [8]:
# Mine the itemsets whose support is at least 0.2. `use_colnames=True`
# reports the itemsets with the frame's column labels rather than with
# column positions.
frequent_itemsets = apriori(
    dataset,
    min_support=0.2,
    use_colnames=True,
)

# Display the resulting table.
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.441340,(1)
1,0.266044,(6)
2,0.285095,(19)
3,0.507933,(20)
4,0.352182,(21)
...,...,...
7561,0.202361,"(4224, 9253, 6547, 4181, 2167, 5114)"
7562,0.207082,"(4224, 9253, 9989, 6547, 4181, 2167)"
7563,0.202361,"(4224, 9253, 9989, 6547, 2167, 5081)"
7564,0.201976,"(4224, 9253, 9989, 6547, 2167, 5114)"


In [9]:
# Derive association rules from the frequent itemsets. The metric and
# threshold written out here are the library defaults: keep the rules whose
# confidence is at least 0.8.
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.8,
)

# Display the resulting table.
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(6),(1),0.266044,0.441340,0.216195,0.812629,1.841278,0.098779,2.981573,0.622516
1,(47),(1),0.253692,0.441340,0.205051,0.808267,1.831394,0.093086,2.913736,0.608285
2,(1),(1535),0.441340,0.715784,0.372989,0.845130,1.180706,0.057086,1.835193,0.273957
3,(6),(1535),0.266044,0.715784,0.228109,0.857408,1.197859,0.037678,1.993216,0.225051
4,(6),(1575),0.266044,0.588800,0.214494,0.806232,1.369279,0.057846,2.122123,0.367445
...,...,...,...,...,...,...,...,...,...,...
13622,"(4224, 6547, 2904, 5114, 1535)","(9253, 1575)",0.223772,0.432720,0.201921,0.902355,2.085308,0.105091,5.809628,0.670492
13623,"(9253, 6547, 2904, 5114, 1535)","(4224, 1575)",0.241779,0.414329,0.201921,0.835150,2.015669,0.101746,3.552749,0.664564
13624,"(4224, 2904, 6547, 9253)","(5114, 1535, 1575)",0.249849,0.407521,0.201921,0.808174,1.983146,0.100103,3.088626,0.660868
13625,"(4224, 5114, 2904, 9253)","(1535, 6547, 1575)",0.248531,0.368158,0.201921,0.812459,2.206820,0.110423,3.369079,0.727721


In [17]:
# Show every rule whose antecedent is exactly this set of anime ids.
wanted_antecedents = frozenset({4224, 9989, 9253, 6547, 5081})
has_wanted_antecedents = rules["antecedents"] == wanted_antecedents
rules[has_wanted_antecedents]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
13346,"(4224, 9989, 9253, 6547, 5081)",(1535),0.236234,0.715784,0.206808,0.875436,1.223045,0.037715,2.281684,0.238776
13544,"(4224, 9989, 9253, 6547, 5081)",(1575),0.236234,0.5888,0.201098,0.851267,1.445764,0.062003,2.764675,0.403689
13590,"(4224, 9989, 9253, 6547, 5081)",(2167),0.236234,0.499259,0.202361,0.856612,1.715767,0.084419,3.492203,0.546201
