In [1]:
import re
import time
import pickle
import numpy as np
import scipy
import pandas as pd
from sentence_transformers import SentenceTransformer

In [2]:
# Instantiate the SentenceTransformer identified by this model name.
MODEL_NAME = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(MODEL_NAME)

In [5]:
# Read the anime table from CSV, decoding the file as UTF-8.
ANIME_CSV_PATH = "../../data/anime_data.csv"
animes_df = pd.read_csv(ANIME_CSV_PATH, encoding="utf-8")

In [6]:
# Remove the " [Written by MAL Rewrite]" credit from every synopsis.
# str(x) makes non-string cells safe to process (e.g. a NaN cell becomes "nan").
animes_df['synopsis'] = animes_df.synopsis.apply(
    lambda x: re.sub(r" \[Written by MAL Rewrite\]", "", str(x))
)

# Drop any anime whose synopsis has fewer than two ". "-separated sentences.
# The threshold must be 2: str.split always returns at least one piece, so a
# ">= 1" test would keep every row and filter nothing.
MIN_SENTENCES = 2
animes_df = animes_df[
    animes_df.synopsis.map(lambda x: len(x.split(". ")) >= MIN_SENTENCES)
]

In [7]:
# Break every synopsis into its ". "-separated sentences, and collect the anime
# codes in the same row order so position i in one list matches position i in
# the other.
synopsis_corpus = [text.split(". ") for text in animes_df["synopsis"].tolist()]
codes_list = animes_df["code"].tolist()

In [8]:
# Build one vector per anime: encode all sentences of a synopsis in a single
# batched call (instead of one model call per sentence), then add the sentence
# vectors together element-wise.
corpus_embeddings = []
for sentences in synopsis_corpus:
    sentence_vectors = model.encode(sentences)
    # axis=0 sums across sentences, leaving one vector of the embedding size.
    corpus_embeddings.append(np.sum(sentence_vectors, axis=0))

In [9]:
# Use the synopsis at position 6 of the corpus as the query: encode each of
# its sentences and add the resulting vectors into a single query vector.
query = synopsis_corpus[6]
embeddings = [model.encode(sentence) for sentence in query]
embedding = sum(embeddings)

In [10]:
# Number of recommendations to print (the query itself is not counted).
closest_n = 5

# Cosine distance between the query vector and every corpus vector.
# cdist returns a 1 x N matrix here; [0] takes its only row.
distances = scipy.spatial.distance.cdist([embedding], corpus_embeddings, "cosine")[0]

# Pair each corpus position with its distance, ordered nearest first.
results = sorted(enumerate(distances), key=lambda pair: pair[1])

print("\n\n======================\n\n")
print("Query:", query)
print("\nTop %d most similar sentences in corpus:" % closest_n)

# results[0] is skipped on the assumption that the query synopsis is itself in
# the corpus and therefore its own nearest neighbour. The slice has to end at
# closest_n + 1 so that exactly closest_n entries are printed.
for idx, distance in results[1:closest_n + 1]:
    # Report similarity as 1 - cosine distance.
    print("https://myanimelist.net/anime/" + str(codes_list[idx]), "(Score: %.4f)" % (1 - distance))





Query: ['After helping revive the legendary vampire Kiss-shot Acerola-orion Heart-under-blade, Koyomi Araragi has become a vampire himself and her servant', 'Kiss-shot is certain she can turn him back into a human, but only once regaining her full power', 'Araragi has hunted down the three vampire hunters that defeated Kiss-shot and retrieved her limbs to return her to full strength', 'However, now that Araragi has almost accomplished what he’s been fighting for this whole time, he has to consider if this is what he really wants', 'Once he revives this powerful immortal vampire, there is no telling what she might do, and there would be no way of stopping her', 'But there is more to the story that Araragi doesn’t understand', 'If a newborn vampire like him could defeat the hunters, how did they overpower Kiss-shot? Can he trust her to turn him back to a human? And how is that even possible in the first place? Araragi is at his limit but he must come to a decision, and it may not be 

In [11]:
# Pass the path rather than an open() handle so that NumPy opens and closes
# the file itself and no file descriptor is left dangling.
np.save("./synopsis_embeddings.npy", corpus_embeddings)

In [12]:
# Persist the anime codes; the with-block guarantees the file is flushed and
# closed even if pickling raises.
with open("./anime_codes.pkl", "wb") as codes_file:
    pickle.dump(codes_list, codes_file)