# BHT Data Applications project
# Automatic Anime recommendation Algorithm
### This project aims to create an algorithm that can determine what anime to recommend to a user.
##### Authors: Rashmi Di Michino and Antonin Mathubert

The dataset, which covers 320,000 users and 16,000 anime, was taken from https://www.kaggle.com/datasets/hernan4444/anime-recommendation-database-2020 <br>
We are going to use this dataset to build a model that can recommend an anime based on the anime that the user is watching, has dropped, has put on hold, or has added to their watch list.

### 1. Importing and parsing the data
First, we want to import all of our available data in a form that is suitable for the next steps of the project.<br><br>
Here, we are going to propose two methods to do that. The first one is very slow, but it is the "clean" way to read JSON data. Because we don't actually need all the data from the files, we are going to use a second, hand-made method that is significantly faster.

In [1]:
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import random
import time
import os
import re

In [None]:
# Load a small sample of the interaction table: the first 10 chunks of
# 10,000 rows each.
#
# The reader returned by `read_csv(..., chunksize=...)` is lazy, so stopping
# after 10 chunks with `islice` avoids parsing the rest of the file and holding
# it in memory when only those 10 chunks are concatenated. The `with` block
# closes the underlying file handle, because the reader is no longer consumed
# to the end.
from itertools import islice

with pd.read_csv("dataset/anime/animelist.csv", chunksize=10000) as dataset_chunks:
    # NOTE: `chunks` now holds only the chunks that are actually used below.
    chunks = list(islice(dataset_chunks, 10))

dataset = pd.concat(chunks, ignore_index=True)
    

In [35]:
# Stream the interaction table in 10,000-row pieces and keep every piece.
# NOTE(review): the path below is an absolute, machine-specific location.
dataset_chunks = pd.read_csv(
    "C:/Users/rashm/OneDrive/Desktop/data_applications_project/julius/anime_dataset/animelist.csv",
    chunksize=10000,
)

chunks = list(dataset_chunks)

# Only the first 5000 pieces are stitched together into the working frame.
dataset = pd.concat(chunks[:5000], ignore_index=True)

In [36]:
# Keep only the user/anime pairs: the status, rating and episode-count
# columns are discarded.
unused_columns = ['watching_status', 'rating', 'watched_episodes']
dataset = dataset.drop(columns=unused_columns)

# Restrict the table to users whose id is below 61960. reset_index() keeps the
# previous row labels in a new 'index' column.
kept_users = dataset['user_id'] < 61960
dataset = dataset.loc[kept_users].reset_index()

In [37]:
# Preview the first 100 rows of the filtered table.
dataset.head(n=100)

Unnamed: 0,index,user_id,anime_id
0,0,0,67
1,1,0,6702
2,2,0,242
3,3,0,4898
4,4,0,21
...,...,...,...
95,95,1,3972
96,96,1,481
97,97,1,22199
98,98,1,6547


In [38]:
# Reshape to one row per user and one column per anime. A cell holds the anime
# id when that user/anime pair is present in the table and is missing otherwise.
dataset = dataset.pivot(columns='anime_id', index='user_id', values='anime_id')

In [39]:
# Turn the table into a boolean matrix: True where a value is present, False
# where it is missing.
#
# Building the mask as a new frame avoids writing the Python value True into
# numeric columns in place, which pandas reports as a deprecated
# incompatible-dtype assignment. The missing cells come out as False directly,
# so a later fillna(False) simply has nothing left to fill.
dataset = dataset.notna()

  dataset[dataset.notnull()] = True


In [40]:
# Replace every remaining missing cell with False and pin the result to a
# boolean dtype explicitly, instead of relying on pandas silently downcasting
# object columns after fillna (that downcasting is deprecated).
dataset = dataset.fillna(False).astype(bool)

  dataset = dataset.fillna(False)


In [41]:
from mlxtend.frequent_patterns import apriori

# Mine frequent itemsets at the library's default support threshold (0.5),
# written out here so the cut-off is visible. Itemsets are labelled with the
# column names (anime ids) rather than column positions.
frequent_itemsets = apriori(dataset, min_support=0.5, use_colnames=True)

frequent_itemsets

Unnamed: 0,support,itemsets
0,0.554824,(20)
1,0.526738,(226)
2,0.73544,(1535)
3,0.596375,(1575)
4,0.513185,(2167)
5,0.601471,(4224)
6,0.506391,(5081)
7,0.622168,(5114)
8,0.596148,(6547)
9,0.592033,(9253)


In [42]:
# Repeat the mining with a lower support threshold (0.3) so that itemsets shared
# by fewer users are kept as well.
frequent_itemsets = apriori(dataset, use_colnames=True, min_support=0.3)

frequent_itemsets

Unnamed: 0,support,itemsets
0,0.451305,(1)
1,0.554824,(20)
2,0.409245,(21)
3,0.458116,(30)
4,0.446437,(121)
...,...,...
4114,0.302924,"(30276, 19815, 11757, 16498, 20507)"
4115,0.313378,"(30276, 19815, 11757, 22319, 16498)"
4116,0.303204,"(30276, 19815, 11757, 16498, 31964)"
4117,0.303169,"(30276, 11757, 22319, 16498, 20507)"


In [47]:
from mlxtend.frequent_patterns import association_rules

# Derive association rules from the frequent itemsets, keeping the rules whose
# confidence reaches 0.8 (the library defaults, spelled out explicitly).
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(1),(1535),0.451305,0.735440,0.394397,0.873904,1.188273,0.062489,2.098080,0.288762
1,(1),(5114),0.451305,0.622168,0.363562,0.805579,1.294794,0.082774,1.943374,0.414942
2,(20),(1535),0.554824,0.735440,0.479601,0.864420,1.175377,0.071561,1.951312,0.335169
3,(1735),(20),0.456540,0.554824,0.430415,0.942776,1.699234,0.177116,7.779540,0.757185
4,(21),(1535),0.409245,0.735440,0.355052,0.867577,1.179670,0.054076,1.997831,0.257814
...,...,...,...,...,...,...,...,...,...,...
8829,"(30276, 11757, 22319)","(16498, 31964)",0.373034,0.432376,0.302136,0.809942,1.873234,0.140845,2.986579,0.743524
8830,"(31964, 30276, 11757)","(16498, 22319)",0.364455,0.483418,0.302136,0.829009,1.714891,0.125952,3.021112,0.655929
8831,"(31964, 30276, 22319)","(16498, 11757)",0.345596,0.553388,0.302136,0.874246,1.579807,0.110887,3.551482,0.560833
8832,"(31964, 11757, 22319)","(16498, 30276)",0.339853,0.483873,0.302136,0.889021,1.837301,0.137691,4.650652,0.690336


In [52]:
# Show every rule whose antecedent is exactly this set of three anime ids.
wanted_antecedent = frozenset({30276, 11757, 22319})
is_match = rules["antecedents"] == wanted_antecedent
rules.loc[is_match]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
5173,"(30276, 11757, 22319)",(1535),0.373034,0.73544,0.335983,0.900676,1.224676,0.061639,2.6636,0.292611
6314,"(30276, 11757, 22319)",(5114),0.373034,0.622168,0.316214,0.847681,1.362464,0.084124,2.480536,0.424323
6637,"(30276, 11757, 22319)",(6547),0.373034,0.596148,0.305866,0.81994,1.375397,0.083482,2.242875,0.435331
6863,"(30276, 11757, 22319)",(9253),0.373034,0.592033,0.317107,0.850075,1.435858,0.096259,2.721143,0.484161
7086,"(30276, 11757, 22319)",(10620),0.373034,0.549046,0.310856,0.833318,1.517757,0.106043,2.705472,0.544101
7325,"(30276, 11757, 22319)",(16498),0.373034,0.666398,0.356558,0.95583,1.434322,0.107968,7.55266,0.482972
7436,"(30276, 11757, 22319)",(19815),0.373034,0.536701,0.325985,0.873873,1.628231,0.125777,3.673286,0.615403
7458,"(30276, 11757, 22319)",(20507),0.373034,0.502749,0.314131,0.842095,1.674981,0.126588,3.149059,0.642744
7466,"(30276, 11757, 22319)",(31964),0.373034,0.478007,0.312747,0.838387,1.753921,0.134434,3.229896,0.685602
8364,"(30276, 11757, 22319)","(16498, 1535)",0.373034,0.563614,0.324427,0.869696,1.54307,0.114179,3.348979,0.561341


In [45]:
# Number of elements currently held in the `chunks` list.
len(chunks)

10923