# Anime Recommendations Using a Topic Model
* We use a topic model to recommend anime.
  * We need two topic models: one that calculates anime from users, and one that calculates users from anime.

# Core Concepts
* find some similar users to userA
  * calculate the feature vectors of userA from the anime-rating of userA
  * find similar users from the feature vectors of userA
* find animes associated with these similar users

# Pre-Processing Data
* [create tfidf of anime side](https://www.kaggle.com/wordroid/create-tfidf-of-anime-side)
* [create tfidf of user side](https://www.kaggle.com/wordroid/create-tfidf-of-user-side-another-way)

# Related Notebooks
* [Love Live!, Bungou Stray Dogs, Evangelion, Saenai](https://www.kaggle.com/wordroid/love-live-bungou-stray-dogs-evangelion-saenai)
* [Load and Confirm tfidf of anime side](https://www.kaggle.com/wordroid/load-and-confirm-tfidf-of-anime-side)
  * [similar anime to Mobile Suit Gundam](https://www.kaggle.com/wordroid/load-and-confirm-tfidf-of-anime-side#Mobile-Suit-Gundam)
  * [similar anime to 'Kimi no Na wa.'](https://www.kaggle.com/wordroid/load-and-confirm-tfidf-of-anime-side#Kimi-no-Na-wa.)
* [Load and Confirm tfidf of user side](https://www.kaggle.com/wordroid/load-and-confirm-tfidf-of-user-side)

In [None]:
!pip install git+https://github.com/darecophoenixx/wordroid.sblo.jp

In [None]:
%matplotlib inline
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

import os.path
import sys
import re
import itertools
import csv
import datetime
import pickle
import random
from collections import defaultdict, Counter
import gc

import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import seaborn as sns
import pandas as pd
import numpy as np
import scipy
import gensim
from sklearn.metrics import f1_score, classification_report, confusion_matrix, log_loss
from sklearn.model_selection import train_test_split
import gensim
from keras.preprocessing.sequence import skipgrams
import tensorflow as tf

In [None]:
def hexbin(x, y, color, **kwargs):
    """Draw a hexagonal-bin plot of *x* against *y* shaded in *color*.

    A light colormap is derived from ``color`` with ``sns.light_palette`` and
    passed to ``plt.hexbin`` as ``cmap``; any extra keyword arguments are
    forwarded to ``plt.hexbin`` unchanged.
    """
    plt.hexbin(x, y, cmap=sns.light_palette(color, as_cmap=True), **kwargs)
def scatter(x, y, color, **kwargs):
    """Draw a scatter plot of *x* against *y*.

    Args:
        x: Values for the horizontal axis.
        y: Values for the vertical axis.
        color: Colour handed to ``plt.scatter`` as ``color``.
        **kwargs: Extra keyword arguments forwarded to ``plt.scatter``.
            ``marker`` defaults to ``'.'`` when the caller does not set it.
    """
    # Forward the caller's options instead of dropping them; only fill in the
    # point marker when the caller has not chosen one.
    kwargs.setdefault('marker', '.')
    plt.scatter(x, y, color=color, **kwargs)

# Prepare Data

In [None]:
ls -la ../input

In [None]:
dir_data_src = '../input/anime-recommendations-database'
os.listdir(dir_data_src)

## load anime.csv

In [None]:
# Load the anime table from anime.csv under dir_data_src.
anime_csv = pd.read_csv(os.path.join(dir_data_src, 'anime.csv'))
print(anime_csv.shape)
# Index the rows by anime_id so that rows can be selected with .loc[<anime_id>].
anime_csv.index = anime_csv.anime_id.values
anime_csv.head()

In [None]:
def Anime_title2id(titlename, csv=anime_csv):
    """Return the ``anime_id`` of the first row of *csv* named *titlename*.

    Args:
        titlename: Exact title to match against the ``name`` column.
        csv: DataFrame with ``name`` and ``anime_id`` columns. Defaults to
            ``anime_csv`` as bound when this function was defined.

    Returns:
        The ``anime_id`` of the first matching row, or the sentinel
        ``9999999`` when no row has that title.
    """
    matches = csv.query('name==@titlename').anime_id.values
    if len(matches) == 0:
        # Unknown title: return the sentinel rather than raising. Any other
        # failure (e.g. a missing column) is a real error and propagates.
        return 9999999
    return matches[0]

Anime_title2id('Kimi no Na wa.'), Anime_title2id('Fullmetal Alchemist: Brotherhood'), Anime_title2id('Gintama°'), Anime_title2id('PL_ANIME')

In [None]:
def Anime_id2title(anime_id, csv=anime_csv):
    """Return the ``name`` of the first row of *csv* whose id is *anime_id*.

    Args:
        anime_id: Id to match against the ``anime_id`` column. A decimal
            string such as ``'5114'`` is also tried as the integer ``5114``,
            because a string never equals a numeric id.
        csv: DataFrame with ``anime_id`` and ``name`` columns. Defaults to
            ``anime_csv`` as bound when this function was defined.

    Returns:
        The title of the first matching row, or ``'*****'`` when no row
        matches.
    """
    candidates = []
    if isinstance(anime_id, str):
        try:
            candidates.append(int(anime_id))
        except ValueError:
            # Not a decimal string; only the raw value is looked up below.
            pass
    candidates.append(anime_id)

    # A plain loop (not a comprehension) so that ``@key`` resolves to this
    # function's local variable inside ``query``.
    for key in candidates:
        names = csv.query('anime_id==@key')['name'].values
        if len(names) > 0:
            return names[0]
    return '*****'

Anime_id2title(32281), Anime_id2title(5114), Anime_id2title(28977), Anime_id2title(9999999)

## load rating.csv

In [None]:
'''
load rating.csv
'''
rating_csv = pd.read_csv(os.path.join(dir_data_src, 'rating.csv'))
print(rating_csv.shape)
rating_csv.head()

In [None]:
'''
`Yuri!!! on Ice` doesn't exist in rating_csv
'''
print(Anime_title2id('Yuri!!! on Ice'))
rating_csv.query('anime_id==32995')

## create rating_csv2
Remove the rows where `rating == -1`.

In [None]:
# Keep only the rows whose rating is not -1.
rating_csv2 = rating_csv.loc[rating_csv.rating.values != -1]
# Only the last expression of a cell is displayed, so the shape must be
# printed explicitly (as is done for anime_csv and rating_csv).
print(rating_csv2.shape)
rating_csv2.head(10)

## anime no-rated

In [None]:
'''
show anime no rated
'''
# Ids listed in anime_csv that never occur in rating_csv2.
anime_id_norated = anime_csv.anime_id.values[
    ~np.isin(anime_csv.anime_id.values, rating_csv2.anime_id.unique())
]
# anime_csv is indexed by anime_id, so .loc selects those rows directly.
print(anime_csv.loc[anime_id_norated].shape)

# Load Pre-Processing Data

In [None]:
src_tfidf_anime = '../input/create-tfidf-of-anime-side'
os.listdir(src_tfidf_anime)

In [None]:
# Load the artifacts saved under src_tfidf_anime: a SciPy sparse matrix,
# a gensim TfidfModel, and the two gensim dictionaries (dic_user, dic_anime).
corpus_csr_anime = scipy.sparse.load_npz(os.path.join(src_tfidf_anime, 'corpus_csr.npz'))
tfidf_anime = gensim.models.TfidfModel.load(os.path.join(src_tfidf_anime, 'tfidf'))
dic_user = gensim.corpora.Dictionary.load(os.path.join(src_tfidf_anime, 'dic_user'))
dic_anime = gensim.corpora.Dictionary.load(os.path.join(src_tfidf_anime, 'dic_anime'))

corpus_csr_anime, tfidf_anime, dic_user, dic_anime

In [None]:
src_tfidf_user = '../input/create-tfidf-of-user-side-another-way'
os.listdir(src_tfidf_user)

In [None]:
# Load the sparse matrix and TfidfModel saved under src_tfidf_user.
corpus_csr_user = scipy.sparse.load_npz(os.path.join(src_tfidf_user, 'corpus_csr.npz'))
tfidf_user = gensim.models.TfidfModel.load(os.path.join(src_tfidf_user, 'tfidf'))
# The dictionaries are not reloaded from this directory; dic_user and
# dic_anime loaded from src_tfidf_anime stay in use.
#dic_user = gensim.corpora.Dictionary.load(os.path.join(src_tfidf_user, 'dic_user'))
#dic_anime = gensim.corpora.Dictionary.load(os.path.join(src_tfidf_user, 'dic_anime'))

corpus_csr_user, tfidf_user

In [None]:
def Anime2id(title):
    """Return the ``dic_anime`` token id for the anime named *title*.

    The title is resolved to an anime id with ``Anime_title2id``, prefixed
    with ``'anime_id-'`` and looked up in ``dic_anime.token2id``. The lookup
    uses plain indexing, so a token missing from the dictionary is not
    handled here.
    """
    anime_id = Anime_title2id(title)
    token = 'anime_id-' + str(anime_id)
    return dic_anime.token2id[token]

Anime2id('Kimi no Na wa.')

# Create MatrixSimilarity

In [None]:
from feature_eng import neg_smpl

In [None]:
# Build the similarity index over the anime-side matrix; num_features is one
# more than the largest id in dic_user.
# NOTE(review): presumably each row is an anime and each column a user --
# confirm against neg_smpl.MySparseMatrixSimilarity.
sim_anime = neg_smpl.MySparseMatrixSimilarity(corpus_csr_anime, num_features=max(dic_user.keys())+1, tfidf=tfidf_anime)
sim_anime

In [None]:
# Build the similarity index over the user-side matrix; num_features is one
# more than the largest id in dic_anime.
# NOTE(review): presumably each row is a user and each column an anime --
# confirm against neg_smpl.MySparseMatrixSimilarity.
sim_user = neg_smpl.MySparseMatrixSimilarity(corpus_csr_user, num_features=max(dic_anime.keys())+1, tfidf=tfidf_user)
sim_user

# Create query of 'user_id-1'

In [None]:
dic_user.doc2bow(['user_id-1'])

example : user_id-1 and user_id-2

In [None]:
dic_user.doc2bow(['user_id-1', 'user_id-2'])

In [None]:
'''
'user_id-2' is encoded to 10469
'''
dic_user.token2id['user_id-1'], dic_user.token2id['user_id-2']

To increase the weight of "user_id-2", repeat it in the token list so that its frequency goes up.

In [None]:
dic_user.doc2bow(['user_id-1', 'user_id-2', 'user_id-2', 'user_id-2'])

# Calculate the feature vectors of 'user_id-1'
The feature vector here is a list of (anime title, score) pairs. We assume that the combination of which anime is weighted by how much represents the user.

In [None]:
'''
show animes rated by 'user_id-1'
'''
pd.merge(rating_csv2.query('user_id==1'), anime_csv, on='anime_id')

In [None]:
'''
calc the feature vectors of 'user_id-1'
'''
# Presumably limits each query result to the 10 best-scoring entries
# (gensim-style num_best) -- confirm against MySparseMatrixSimilarity.
sim_anime.num_best = 10

# Query the anime-side index with a bag-of-words holding only 'user_id-1'.
# The result is iterated later as (index, score) pairs.
anime_rating = sim_anime[dic_user.doc2bow(['user_id-1'])]
anime_rating

Since the anime title is encoded, it is converted to the actual title below. (Sort by score)

In [None]:
for idx, rating in anime_rating:
    # dic_anime tokens look like 'anime_id-<id>'. Strip the prefix and convert
    # to int: a string id never matches the numeric anime_id column, so the
    # title lookup would fall back to '*****' for every row.
    print(rating, Anime_id2title(int(re.sub('anime_id-', '', dic_anime[idx]))))

# Find similar users to 'user_id-1'

In [None]:
%%time
'''
get similar 30 user to `user_id-1`
'''
sim_user.num_best = 30
# Use the (anime index, score) list computed for 'user_id-1' as the query
# against the user-side index.
user_rating = sim_user[anime_rating]
user_rating

'user_id-1' itself appears fifth in the list.

In [None]:
for idx, wgt in user_rating:
    print(wgt, dic_user[idx])

# Is the animation list of "user_id-46667" close to "user_id-1"?
Is the animation rating of "user_id-46667" close to "user_id-1"?

In [None]:
pd.merge(rating_csv2.query('user_id==46667'), anime_csv, on='anime_id')

In [None]:
sim_anime.num_best = 30
# Query the anime-side index with a bag-of-words holding only 'user_id-46667'.
anime_rating = sim_anime[dic_user.doc2bow(['user_id-46667'])]

for idx, rating in anime_rating:
    # dic_anime tokens look like 'anime_id-<id>'. Strip the prefix and convert
    # to int: a string id never matches the numeric anime_id column, so the
    # title lookup would fall back to '*****' for every row.
    print(rating, Anime_id2title(int(re.sub('anime_id-', '', dic_anime[idx]))))

# Find animes associated with these similar users

In [None]:
'''
get anime list
'''
sim_anime.num_best = 50
# Query the anime-side index with the similar-user list (user_rating); this
# rebinds anime_rating to the result.
anime_rating = sim_anime[user_rating]
anime_rating

In [None]:
'''
We can recommend anime other than the user-1 has already watched.
'''
for idx, wgt in anime_rating:
    # dic_anime[idx] is a token of the form 'anime_id-<id>'; strip the prefix
    # and convert to int before looking up the title.
    print(wgt, Anime_id2title(int(re.sub('anime_id-', '', dic_anime[idx]))))

In [None]:
'''
anime list user_id-1 already watched
'''
pd.merge(rating_csv.query('user_id==1'), anime_csv, on='anime_id')