In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer

import re
import pickle
import os
import time

import yake
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\olive\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [108]:
def preprocess_data(use_saved_file: bool = False):
    if (use_saved_file): 
        df = pd.read_pickle("./synopsys_df.pkl") 
        return df

    # Reading the data
    df = pd.read_csv("../../data/anime_data.csv")
   
    # Match the name and rating of animes in code_df to the anime reviews dataframe
    df['code'] = df['code'].astype(int)    
    df['synopsis'] = df['synopsis'].astype(str)
    
    df = df[["code", "name", "synopsis", "rating"]] #keep only the relevant columns to save space
    
    #Utitlity functions for removing synopsis that are two short, Non-ASCII characters, converting lower case, removing stop words, html and punctuation from description
    def removeShortSynopsis(text):
        if len(text.split(". ")) < 2:            
            return None
        else:
            return text
    def remove_line_ending(text):
        return re.sub(r" \[Written by MAL Rewrite\]", "", text)
        
    def _removeNonAscii(s):
        return "".join(i for i in s if ord(i)<128)

    def make_lower_case(text):
        return text.lower()
    
    stops = set(stopwords.words("english"))
#     # a list of stops words that I found after some analysis
#     f = open("./custom_stop_words.txt", "r", encoding="utf-8")
#     custom_stops = set(map(lambda x: x[:-2], f.readlines()))
#     stops.union(custom_stops)
    
    def remove_stop_words(text):
        text = text.split()
        text = [w for w in text if not w in stops]
        text = " ".join(text)
        return text

    def remove_html(text):
        html_pattern = re.compile('<.*?>')
        return html_pattern.sub(r'', text)

    def remove_punctuation(text):
        tokenizer = RegexpTokenizer(r'\w+')
        text = tokenizer.tokenize(text)
        text = " ".join(text)
        return text

    df['cleaned'] = df['synopsis'].apply(_removeNonAscii)
    df['cleaned'] = df.cleaned.apply(func = remove_line_ending)
    df['cleaned'] = df.cleaned.apply(func = make_lower_case)
    df['cleaned'] = df.cleaned.apply(func = remove_stop_words)
    df['cleaned'] = df.cleaned.apply(func=remove_punctuation)
    df['cleaned'] = df.cleaned.apply(func=remove_html)
    df.name = df.name.apply(lambda x: re.sub(r"\s\s*", " ", re.sub(r"[\-\_]", " ", x)))    
    with open("./synopsys_df.pkl", "wb") as filehandle:
        pickle.dump(df, filehandle)
        filehandle.close()

    return df

In [115]:
def train_word2vec(df: pd.DataFrame, use_saved_file: bool=False):
    if (use_saved_file): 
        with open('w2v_cosine_sim.data', 'rb') as filehandle:
            # read the data as binary data stream
            cosine_similarities = pickle.load(filehandle)
            return cosine_similarities
    
    #splitting the description into words
    corpus = []
    for words in df['cleaned']:
        corpus.append(words.split())

    # Using the Google pretrained Word2Vec Model 
    # If using for the first time, download and store in ../../data/ 
    # (link: https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz)

    EMBEDDING_FILE = '../../data/GoogleNews-vectors-negative300.bin.gz'
    google_word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

    # Training our corpus with Google Pretrained Model

    google_model = Word2Vec(size = 300, window=5, min_count = 2, workers = -1)
    google_model.build_vocab(corpus)

    #model.intersect_word2vec_format('./word2vec/GoogleNews-vectors-negative300.bin', lockf=1.0, binary=True)

    google_model.intersect_word2vec_format(EMBEDDING_FILE, lockf=1.0, binary=True)

    google_model.train(corpus, total_examples=google_model.corpus_count, epochs = 5)

    # Generate the average word2vec for the each set of anime reviews
    def vectors(x: pd.DataFrame):
        
        # Creating a list for storing the vectors (description into vectors)
        global array_embeddings
        array_embeddings = []
        kw_extractor = yake.KeywordExtractor(lan="en", n=2, dedupLim=0.9, dedupFunc="seqm", windowsSize=1, top=20)
        
        # Reading the each anime review set
        for line in df['cleaned']:
            avgword2vec = None
            count = 0
            #extract keywords from the text
            keywords = kw_extractor.extract_keywords(line)
            words = []
            for word, _ in keywords:
                words.extend(word.split())
            words = list(set(words))
            
            for word in words:
                if word in google_model.wv.vocab:
                    count += 1
                    if avgword2vec is None:
                        avgword2vec = google_model[word]
                    else:
                        avgword2vec = avgword2vec + google_model[word]
                    
            if avgword2vec is not None:
                avgword2vec = avgword2vec / count
            
                array_embeddings.append(avgword2vec)

    # Calling the function vectors
    vectors(df)

    # finding cosine similarity for the vectors
    cosine_similarities = cosine_similarity(array_embeddings, array_embeddings)

    with open('w2v_cosine_sim.data', 'wb') as filehandle:
        # store the data as binary data stream
        pickle.dump(cosine_similarities, filehandle)

    return cosine_similarities

In [116]:
def recommendations(title: str, df: pd.DataFrame, cosine_similarities: bool):
    
    # taking the title and rating to store in new data frame called animes
    animes = df[['name', 'rating']]

    #Reverse mapping of the index
    indices = pd.Series(df.index, index = df['name']).drop_duplicates()# Recommending the Top 5 similar animes
    # drop all duplicate occurrences of the labels 
    indices = indices.groupby(indices.index).first()

    idx = indices[title]
    sim_scores = sorted(list(enumerate(cosine_similarities[idx])), key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:6]
    anime_indices = [i[0] for i in sim_scores]
    recommend = animes.iloc[anime_indices]
    
    count = 0
    for index, row in recommend.iterrows():
        print('{}. {}, similarity: {}, rating: {}'.format(count+1, row['name'], sim_scores[count][1], row['rating']))
        count += 1

In [109]:
df = preprocess_data(use_saved_file=False)

In [117]:
# if you need to retrain or don't have the saved .data file, set use_saved_file to False
cosine_similarities = train_word2vec(df, use_saved_file=False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [132]:
recommendations('tengen toppa gurren lagann', df, cosine_similarities)

1. danball senki wars, similarity: 0.7767904996871948, rating: G - All Ages
2. no game no life, similarity: 0.776049792766571, rating: PG-13 - Teens 13 or older
3. arslan senki tv tsuioku no shou dakkan no yaiba, similarity: 0.7740839719772339, rating: R - 17+ (violence & profanity)
4. mahou no star magical emi, similarity: 0.7735641002655029, rating: G - All Ages
5. kekkai sensen beyond zapp renfro ingaouhouchuu baccardio no shizuku, similarity: 0.7727901935577393, rating: R - 17+ (violence & profanity)


In [121]:
df[df.name.str.contains("ore ga ojousama")]

Unnamed: 0,code,name,synopsis,rating,cleaned
3948,25099,ore ga ojousama gakkou ni shomin sample toshit...,Kimito Kagurazaka is a commoner with a fetish ...,PG-13 - Teens 13 or older,kimito kagurazaka commoner fetish men s muscle...
4458,31797,ore ga ojousama gakkou ni shomin sample toshit...,Kujou-san no Do-S Soudanshitsu Anime-ban short...,PG-13 - Teens 13 or older,kujou san do s soudanshitsu anime ban short an...


In [124]:
kw_extractor = yake.KeywordExtractor(lan="en", n=2, dedupLim=0.9, dedupFunc="seqm", windowsSize=1, top=20)     

In [125]:
keywords = kw_extractor.extract_keywords(df.iloc[3948].synopsis)
words = []
for word, _ in keywords:
    words.extend(word.split())

In [127]:
set(words)

{'academy',
 'all-girls',
 'assumptions',
 'club',
 'commoner',
 'elite',
 'girls',
 'hakua',
 'jinryou',
 'kagurazaka',
 'karen',
 'kimito',
 'life',
 'make',
 'men',
 'muscles',
 'prefers',
 'sample',
 'school',
 'seikain',
 'shiodome'}