# Install and import modules

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re

from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
from tqdm.notebook import tqdm
import ast
from matplotlib import pyplot as plt

In [None]:
%%capture
# %pip (instead of !pip) installs into the environment of the running kernel,
# so the packages are importable from the cells below.
%pip install sentence-transformers
%pip install keybert

In [None]:
import spacy
# Load the spaCy pipeline used to parse synopses. The "en_core_web_sm" model
# must already be available: no cell in this notebook installs it.
nlp = spacy.load("en_core_web_sm")

In [None]:
from keybert import KeyBERT
# Keyword extractor built with no arguments, i.e. with the library's defaults.
kw_model = KeyBERT()

# Load and preprocess data

In [None]:
# Read the anime table, keep only rows that have both a synopsis and a title,
# and keep the first row for every repeated uid.
df = (
    pd.read_csv('../input/myanimelist-dataset-animes-profiles-reviews/animes.csv')
    .dropna(subset=['synopsis', 'title'])
    .loc[lambda frame: ~frame.uid.duplicated(keep='first')]
)
df.head()

In [None]:
# Title of the anime analysed below; it is matched exactly against df['title'].
TITLE = 'Shingeki no Kyojin'
def get_synopsis(title):
    """Return the cleaned synopsis of one anime, parsed by ``nlp``.

    Takes the first row of the module-level ``df`` whose ``title`` equals
    ``title``, strips every single-line fragment enclosed in parentheses or
    square brackets (e.g. the trailing "written by" caption), collapses each
    run of whitespace (including line breaks) into one space, and passes the
    result to the module-level ``nlp`` pipeline.

    Args:
        title: Exact value to look up in ``df['title']``.

    Returns:
        Whatever ``nlp`` returns for the cleaned synopsis text.

    Raises:
        IndexError: If no row of ``df`` has the requested title.
    """
    matches = df.loc[df['title'] == title, 'synopsis']
    if matches.empty:
        # Same exception type as the former `.values[0]`, but with a usable message.
        raise IndexError(f"no row in df has title {title!r}")

    text = matches.iloc[0]
    # Raw string: '\(' and '\[' are invalid escapes in a normal string literal.
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)  # Remove the "written by" caption
    # Replace line breaks with a space instead of deleting them, so the last
    # word of one line is not fused with the first word of the next.
    text = re.sub(r"\s+", " ", text).strip()
    return nlp(text)
# Parse the synopsis of the selected title and show the cleaned text.
doc = get_synopsis(TITLE)
print(doc)

# Extract keyword candidates

In [None]:
# Print every noun chunk that is longer than one token. The loop variable is
# not called `np`: that would overwrite the numpy alias for the whole session.
for chunk in doc.noun_chunks:
    if len(chunk) > 1:
        print('> ', chunk)

In [None]:
def get_keyword_candidates(doc, max_tokens=6):
    """Build keyword candidate phrases around every noun of ``doc``.

    For each token tagged ``NOUN`` a span is taken from the token's
    ``left_edge`` to its ``right_edge`` (the ends of its syntactic subtree in
    spaCy). If an edge token belongs to a named entity (non-empty
    ``ent_type_``), the span is cut at the noun itself on that side instead.
    Spans longer than ``max_tokens`` tokens are dropped.

    Args:
        doc: Parsed document; iterating it yields tokens exposing ``pos_``,
            ``left_edge`` and ``right_edge``, and slicing it yields a span
            with a ``text`` attribute.
        max_tokens: Maximum number of tokens a candidate may span.

    Returns:
        List of candidate phrases (``str``) in order of first appearance,
        without duplicates.
    """
    nouns = [(idx, token) for idx, token in enumerate(doc) if token.pos_ == 'NOUN']
    print('Nouns found: ', len(nouns))

    candidates = []
    for idx, token in nouns:
        left, right = token.left_edge, token.right_edge
        # Do not extend the phrase into a named entity on either side.
        start = idx if left.ent_type_ else left.i
        finish = idx + 1 if right.ent_type_ else right.i + 1

        if 0 < finish - start <= max_tokens:
            candidates.append(doc[start:finish].text)

    # The same phrase can occur several times in a text; keep one copy of each
    # (order preserved) so downstream consumers do not see duplicate candidates.
    return list(dict.fromkeys(candidates))

# Keep the result: the KeyBERT cell below reads `candidates`.
candidates = get_keyword_candidates(doc)
candidates

In [None]:
# Ask KeyBERT for keywords of the synopsis text, passing the noun-phrase
# candidates; MMR selection is enabled with diversity 0.7.
keywords = kw_model.extract_keywords(
    doc.text,
    candidates=candidates,
    use_mmr=True,
    diversity=0.7,
)

keywords