In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
import re
import string
import random
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from matplotlib import pyplot
from gensim.models import KeyedVectors


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ckkok\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
import pickle
import os

In [3]:
# Reading the data
code_df = pd.read_csv("../../data/anime_codes.csv")

In [4]:
code_df.head()

Unnamed: 0,code,name,rating
0,5114,fullmetal_alchemist__brotherhood,9.22
1,11061,hunter_x_hunter_2011,9.12
2,28977,gintama°,9.11
3,9969,gintama,9.09
4,38524,shingeki_no_kyojin_season_3_part_2,9.07


In [5]:
# Read the review data
reviews_dir = "../../data/reviews"
all_reviews = list()

for index, review_doc in enumerate(os.listdir(reviews_dir)):
    current_dict = dict()
        
    # Get the current anime's code
    current_dict['code'] = review_doc.split('.')[0]
    
    # Get the current anime's reviews
    f = open(os.path.join(reviews_dir, review_doc), 'r', encoding="utf-8")
    current_dict['review'] = f.read()
    f.close()
    
    all_reviews.append(current_dict)

# Create a dataframe of the anime codes and their respective reviews
review_df = pd.DataFrame(all_reviews)
review_df.shape

(877, 2)

In [6]:
# Match the name and rating of animes in code_df to the anime reviews dataframe
review_df['code']=review_df['code'].astype(int)
df = pd.merge(review_df, code_df, on='code')
print(df.shape)
df.head()

(877, 4)


Unnamed: 0,code,review,name,rating
0,1,People who know me know that I'm not a fan of ...,cowboy_bebop,8.79
1,1000,There is a reason this is considered to be one...,uchuu_kaizoku_captain_herlock,7.72
2,1002,Many people will know the name of Hideaki Anno...,top_wo_nerae_2_diebuster,7.68
3,10033,"""Cooking is as masculine as judo, kickboxing o...",toriko,7.59
4,10049,After finishing the first series of Nurarihyon...,nurarihyon_no_mago__sennen_makyou,8.02


## Cleaning The Data

In [7]:
#Utitlity functions for removing ASCII characters, converting lower case, removing stop words, html and punctuation from description

def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

def make_lower_case(text):
    return text.lower()

def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text


In [8]:
df['cleaned'] = df['review'].apply(_removeNonAscii)
df['cleaned'] = df.cleaned.apply(func = make_lower_case)
df['cleaned'] = df.cleaned.apply(func = remove_stop_words)
df['cleaned'] = df.cleaned.apply(func=remove_punctuation)
df['cleaned'] = df.cleaned.apply(func=remove_html)
df.head()

Unnamed: 0,code,review,name,rating,cleaned
0,1,People who know me know that I'm not a fan of ...,cowboy_bebop,8.79,people know know i m fan episodic anime series...
1,1000,There is a reason this is considered to be one...,uchuu_kaizoku_captain_herlock,7.72,reason considered one greatest anime series ev...
2,1002,Many people will know the name of Hideaki Anno...,top_wo_nerae_2_diebuster,7.68,many people know name hideaki anno particular ...
3,10033,"""Cooking is as masculine as judo, kickboxing o...",toriko,7.59,cooking masculine judo kickboxing tae kwan do ...
4,10049,After finishing the first series of Nurarihyon...,nurarihyon_no_mago__sennen_makyou,8.02,finishing first series nurarihyon mago bit rel...


## Applying Average Word2Vec

In [9]:
#splitting the description into words

corpus = []
for words in df['cleaned']:
    corpus.append(words.split())

In [10]:
# Using the Google pretrained Word2Vec Model (from: https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz)

EMBEDDING_FILE = '../../data/GoogleNews-vectors-negative300.bin.gz'
google_word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

# Training our corpus with Google Pretrained Model

google_model = Word2Vec(size = 300, window=5, min_count = 2, workers = -1)
google_model.build_vocab(corpus)

#model.intersect_word2vec_format('./word2vec/GoogleNews-vectors-negative300.bin', lockf=1.0, binary=True)

google_model.intersect_word2vec_format(EMBEDDING_FILE, lockf=1.0, binary=True)

google_model.train(corpus, total_examples=google_model.corpus_count, epochs = 5)

(0, 0)

In [11]:
google_model.most_similar('attack')

  """Entry point for launching an IPython kernel.


[('attacks', 0.7638030052185059),
 ('assault', 0.5970098376274109),
 ('bombing', 0.5777696371078491),
 ('assaults', 0.5646902322769165),
 ('attacking', 0.5615357160568237),
 ('attacked', 0.5553597211837769),
 ('counterattack', 0.5205312371253967),
 ('ambush', 0.5107647180557251),
 ('raid', 0.502953290939331),
 ('blasts', 0.49935752153396606)]

In [12]:
# Generate the average word2vec for the each set of anime reviews

def vectors(x):
    
    # Creating a list for storing the vectors (description into vectors)
    global array_embeddings
    array_embeddings = []

    # Reading the each anime review set
    for line in df['cleaned']:
        avgword2vec = None
        count = 0
        for word in line.split():
            if word in google_model.wv.vocab:
                count += 1
                if avgword2vec is None:
                    avgword2vec = google_model[word]
                else:
                    avgword2vec = avgword2vec + google_model[word]
                
        if avgword2vec is not None:
            avgword2vec = avgword2vec / count
        
            array_embeddings.append(avgword2vec)

In [13]:
# Calling the function vectors

vectors(df)

# finding cosine similarity for the vectors

cosine_similarities = cosine_similarity(array_embeddings, array_embeddings)

# taking the title and rating to store in new data frame called animes
animes = df[['name', 'rating']]



In [14]:
#Reverse mapping of the index
indices = pd.Series(df.index, index = df['name']).drop_duplicates()# Recommending the Top 5 similar animes
# drop all duplicate occurrences of the labels 
indices = indices.groupby(indices.index).first()

In [15]:
def recommendations(title):
         
    idx = indices[title]
    sim_scores = sorted(list(enumerate(cosine_similarities[idx])), key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:6]
    anime_indices = [i[0] for i in sim_scores]
    recommend = animes.iloc[anime_indices]
    
    count = 0
    for index, row in recommend.iterrows():
        print('{}. {}, similarity: {}, rating: {}'.format(count+1, row['name'], sim_scores[count][1], row['rating']))
        count += 1

In [19]:
recommendations('cowboy_bebop')

1. michiko_to_hatchin, similarity: 0.9917200803756714, rating: 7.85
2. cowboy_bebop__tengoku_no_tobira, similarity: 0.9910752773284912, rating: 8.39
3. trigun, similarity: 0.9908796548843384, rating: 8.24
4. baccano, similarity: 0.9905235171318054, rating: 8.42
5. samurai_champloo, similarity: 0.9898200631141663, rating: 8.5


## Applying TF-IDF Word2Vec

In [20]:
#Building TFIDF model and calculate TFIDF score

tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df = 5, stop_words='english')
tfidf.fit(df['cleaned'])

# Getting the words from the TF-IDF model

tfidf_list = dict(zip(tfidf.get_feature_names(), list(tfidf.idf_)))
tfidf_feature = tfidf.get_feature_names() # tfidf words/col-names

In [27]:
# # Building TF-IDF Word2Vec 

# # Storing the TFIDF Word2Vec embeddings
# tfidf_vectors = []; 
# line = 0;

# # for each anime's set of reviews
# for desc in corpus: 
#     print('loading: {}/{}'.format(line, len(corpus)))
#     # Word vectors are of zero length (Used 300 dimensions)
#     sent_vec = np.zeros(300) 
#     # num of words with a valid vector in the anime reviews
#     weight_sum =0; 
#     # for each word in the anime reviews
#     for word in desc: 
#         if word in google_model.wv.vocab and word in tfidf_feature:
#             vec = google_model.wv[word]
#             tf_idf = tfidf_list[word] * (desc.count(word) / len(desc))
#             sent_vec += (vec * tf_idf)
#             weight_sum += tf_idf
#     if weight_sum != 0:
#         sent_vec /= weight_sum
#     tfidf_vectors.append(sent_vec)
#     line += 1

# # Save file cause it's a really long process to build all the embeddings
# with open('tfidf_vectors.data', 'wb') as filehandle:
#     # store the data as binary data stream
#     pickle.dump(tfidf_vectors, filehandle)

loading: 0/877
loading: 1/877
loading: 2/877
loading: 3/877
loading: 4/877
loading: 5/877
loading: 6/877
loading: 7/877
loading: 8/877
loading: 9/877
loading: 10/877
loading: 11/877
loading: 12/877
loading: 13/877
loading: 14/877
loading: 15/877
loading: 16/877
loading: 17/877
loading: 18/877
loading: 19/877
loading: 20/877
loading: 21/877
loading: 22/877
loading: 23/877
loading: 24/877
loading: 25/877
loading: 26/877
loading: 27/877
loading: 28/877
loading: 29/877
loading: 30/877
loading: 31/877
loading: 32/877
loading: 33/877
loading: 34/877
loading: 35/877
loading: 36/877
loading: 37/877
loading: 38/877
loading: 39/877
loading: 40/877
loading: 41/877
loading: 42/877
loading: 43/877
loading: 44/877
loading: 45/877
loading: 46/877
loading: 47/877
loading: 48/877
loading: 49/877
loading: 50/877
loading: 51/877
loading: 52/877
loading: 53/877
loading: 54/877
loading: 55/877
loading: 56/877
loading: 57/877
loading: 58/877
loading: 59/877
loading: 60/877
loading: 61/877
loading: 62/877
lo

KeyboardInterrupt: 

In [32]:
with open('tfidf_vectors.data', 'rb') as filehandle:
    # read the data as binary data stream
    tfidf_vectors = pickle.load(filehandle)

In [33]:
#Recommending top 5 similar animes

def recommendations(title):
    
    # finding cosine similarity for the vectors

    cosine_similarities = cosine_similarity(tfidf_vectors,  tfidf_vectors)
    
    animes = df[['name', 'rating']]
    #Reverse mapping of the index
    indices = pd.Series(df.index, index = df['name']).drop_duplicates()
    # drop all duplicate occurrences of the labels 
    indices = indices.groupby(indices.index).first()
         
    idx = indices[title]
    sim_scores = list(enumerate(cosine_similarities[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:6]
    anime_indices = [i[0] for i in sim_scores]
    recommend = animes.iloc[anime_indices]
    
    count = 0
    for index, row in recommend.iterrows():
        print('{}. {}, similarity: {}, rating: {}'.format(count+1, row['name'], sim_scores[count][1], row['rating']))
        count += 1

In [36]:
recommendations('naruto__shippuuden_movie_5_-_blood_prison')

1. naruto__shippuuden_movie_6_-_road_to_ninja, similarity: 0.9334548115853434, rating: 7.69
2. mobile_suit_gundam_iii__encounters_in_space, similarity: 0.8377889660557561, rating: 7.77
3. rakuen_tsuihou, similarity: 0.8292637521261761, rating: 7.41
4. hakuouki_movie_1__kyoto_ranbu, similarity: 0.8042539994639673, rating: 7.69
5. gintama_movie_2__kanketsu-hen_-_yorozuya_yo_eien_nare, similarity: 0.801945827099659, rating: 8.97


In [35]:
animes.head(20)

Unnamed: 0,name,rating
0,cowboy_bebop,8.79
1,uchuu_kaizoku_captain_herlock,7.72
2,top_wo_nerae_2_diebuster,7.68
3,toriko,7.59
4,nurarihyon_no_mago__sennen_makyou,8.02
5,tenchi_muyou_in_love,7.44
6,fate_zero,8.36
7,break_blade_6__doukoku_no_toride,7.72
8,ranma_½__chou_musabetsu_kessen_ranma_team_vs_d...,7.39
9,mahou_shoujo_lyrical_nanoha__the_movie_2nd_as,8.19
