In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer

import re
import pickle
import os
import time

from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ckkok\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def preprocess_data(use_saved_file: bool = False):
    if (use_saved_file): 
        df = pd.read_pickle("./review_df.pkl") 
        return df

    # Reading the data
    code_df = pd.read_csv("../../data/anime_codes.csv")

    # Read the review data
    reviews_dir = "../../data/reviews"
    all_reviews = list()

    for index, review_doc in enumerate(os.listdir(reviews_dir)):
        current_dict = dict()
            
        # Get the current anime's code
        current_dict['code'] = review_doc.split('.')[0]
        
        # Get the current anime's reviews
        f = open(os.path.join(reviews_dir, review_doc), 'r', encoding="utf-8")
        current_dict['review'] = f.read()
        f.close()
        
        all_reviews.append(current_dict)

    # Create a dataframe of the anime codes and their respective reviews
    review_df = pd.DataFrame(all_reviews)

    # Match the name and rating of animes in code_df to the anime reviews dataframe
    review_df['code']=review_df['code'].astype(int)
    df = pd.merge(review_df, code_df, on='code')


    #Utitlity functions for removing ASCII characters, converting lower case, removing stop words, html and punctuation from description
    def _removeNonAscii(s):
        return "".join(i for i in s if  ord(i)<128)

    def make_lower_case(text):
        return text.lower()

    def remove_stop_words(text):
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)
        return text

    def remove_html(text):
        html_pattern = re.compile('<.*?>')
        return html_pattern.sub(r'', text)

    def remove_punctuation(text):
        tokenizer = RegexpTokenizer(r'\w+')
        text = tokenizer.tokenize(text)
        text = " ".join(text)
        return text

    df['cleaned'] = df['review'].apply(_removeNonAscii)
    df['cleaned'] = df.cleaned.apply(func = make_lower_case)
    df['cleaned'] = df.cleaned.apply(func = remove_stop_words)
    df['cleaned'] = df.cleaned.apply(func=remove_punctuation)
    df['cleaned'] = df.cleaned.apply(func=remove_html)
    df.name = df.name.apply(lambda x: re.sub(r"\s\s*", " ", re.sub(r"[\-\_]", " ", x)) )
    with open("./review_df.pkl", "wb") as filehandle:
        pickle.dump(df, filehandle)
        filehandle.close()

    return df

## Exploring Explainability

In [3]:
df = preprocess_data(use_saved_file=True)

In [4]:
#splitting the description into words
corpus = []
for words in df['cleaned']:
    corpus.append(words.split())

# Using the Google pretrained Word2Vec Model 
# If using for the first time, download and store in ../../data/ 
# (link: https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz)

EMBEDDING_FILE = '../../data/GoogleNews-vectors-negative300.bin.gz'
google_word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

# Training our corpus with Google Pretrained Model

google_model = Word2Vec(size = 300, window=5, min_count = 2, workers = -1)
google_model.build_vocab(corpus)

#model.intersect_word2vec_format('./word2vec/GoogleNews-vectors-negative300.bin', lockf=1.0, binary=True)

google_model.intersect_word2vec_format(EMBEDDING_FILE, lockf=1.0, binary=True)

google_model.train(corpus, total_examples=google_model.corpus_count, epochs = 5)



(0, 0)

In [5]:
# Generate the average word2vec for the each set of anime reviews
def vectors(x: pd.DataFrame):

    # Creating a list for storing the vectors (description into vectors)
    global array_embeddings
    array_embeddings = []

    # Reading the each anime review set
    for line in df['cleaned']:
        avgword2vec = None
        count = 0
        for word in line.split():
            if word in google_model.wv.vocab:
                count += 1
                if avgword2vec is None:
                    avgword2vec = google_model[word]
                else:
                    avgword2vec = avgword2vec + google_model[word]

        if avgword2vec is not None:
            avgword2vec = avgword2vec / count

            array_embeddings.append(avgword2vec)

# Calling the function vectors
vectors(df)

# finding cosine similarity for the vectors
cosine_similarities = cosine_similarity(array_embeddings, array_embeddings)

  app.launch_new_instance()


In [19]:
def recommendations(title: str, df: pd.DataFrame, cosine_similarities: bool):
    
    # taking the title and rating to store in new data frame called animes
    animes = df[['name', 'rating']]

    #Reverse mapping of the index
    indices = pd.Series(df.index, index = df['name']).drop_duplicates()# Recommending the Top 5 similar animes
    # drop all duplicate occurrences of the labels 
    indices = indices.groupby(indices.index).first()

    idx = indices[title]
    sim_scores = sorted(list(enumerate(cosine_similarities[idx])), key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:6]
    anime_indices = [i[0] for i in sim_scores]
    recommend = animes.iloc[anime_indices]
    
    count = 0
    for index, row in recommend.iterrows():
        print('{}. {}, similarity: {}, rating: {}'.format(count+1, row['name'], sim_scores[count][1], row['rating']))
        count += 1

In [20]:
google_model.most_similar(positive=['woman'], topn=10)

  """Entry point for launching an IPython kernel.


[('man', 0.7664012312889099),
 ('girl', 0.7494640946388245),
 ('teenager', 0.631708562374115),
 ('lady', 0.6288785934448242),
 ('mother', 0.607630729675293),
 ('boy', 0.5975908041000366),
 ('she', 0.5641393661499023),
 ('person', 0.5470173358917236),
 ('housewife', 0.5463822484016418),
 ('victim', 0.545007586479187)]

In [44]:
title = 'ranma ½ super'

# taking the title and rating to store in new data frame called animes
animes = df[['name', 'rating']]

#Reverse mapping of the index
indices = pd.Series(df.index, index = df['name']).drop_duplicates()# Recommending the Top 5 similar animes
# drop all duplicate occurrences of the labels 
indices = indices.groupby(indices.index).first()

idx = indices[title]

In [45]:
line = df['cleaned'].iloc[idx]
current_reviews = []
for word in line.split():
    if word in google_model.wv.vocab and word not in current_reviews:
        current_reviews.append(word)
google_model.most_similar(positive=current_reviews, topn=10)

  


[('really', 0.6926707029342651),
 ('just', 0.6821541786193848),
 ('anyway', 0.6122572422027588),
 ('defintiely', 0.5991133451461792),
 ('actually', 0.5837223529815674),
 ('certainly', 0.5828970670700073),
 ('nice', 0.5794240236282349),
 ('probably', 0.5778967142105103),
 ('so', 0.5721061825752258),
 ('maybe', 0.5714657306671143)]

In [39]:
animes.head(20)

Unnamed: 0,name,rating
0,cowboy bebop,8.79
1,uchuu kaizoku captain herlock,7.72
2,dragon ball super saiya jin zetsumetsu keikaku,6.73
3,top wo nerae 2 diebuster,7.69
4,toriko,7.59
5,nurarihyon no mago sennen makyou,8.02
6,star ocean ex,6.62
7,tenchi muyou in love,7.44
8,naruto x ut,7.42
9,kämpfer für die liebe,6.52


Looks like it's majoritily run by words that occur often. That's a good reason to begin using TFIDF (althought it takes a while to train)