In [35]:
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import itertools


In [91]:
def mmr(doc_embedding, word_embeddings, words, top_n, diversity):
    """Select keywords with Maximal Marginal Relevance (MMR).

    The candidate most similar to the document is picked first. Each further
    pick maximises
    ``(1 - diversity) * sim(candidate, doc) - diversity * max sim(candidate, selected)``,
    trading relevance against redundancy with what was already chosen.

    Parameters
    ----------
    doc_embedding : array-like
        Embedding of the document. The flat ``argmax`` indexing below only maps
        back to a candidate correctly when this is a single row, as in
        ``model.encode([doc])``.
    word_embeddings : array-like
        One embedding row per entry of ``words``, in the same order.
    words : sequence of str
        Candidate keywords / keyphrases.
    top_n : int
        Number of keywords wanted. Values larger than ``len(words)`` are
        clamped; values ``<= 0`` give an empty list.
    diversity : float
        Weight of the redundancy penalty. ``0`` ranks purely by similarity to
        the document. The value is not validated here; the formula is a convex
        mix only for values in ``[0, 1]``.

    Returns
    -------
    list
        The selected entries of ``words``, in the order they were picked.
    """
    n_words = len(words)
    # Nothing to pick from, or nothing requested: avoid argmax on an empty array
    # and avoid returning one keyword when none was asked for.
    if n_words == 0 or top_n <= 0:
        return []

    # Similarity of every candidate to the document, and of candidates to each other
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Seed the selection with the candidate closest to the document
    keywords_idx = [int(np.argmax(word_doc_similarity))]
    candidates_idx = [i for i in range(n_words) if i != keywords_idx[0]]

    # Never iterate past the number of available candidates; otherwise the
    # reductions below would run on empty arrays and raise.
    for _ in range(min(top_n, n_words) - 1):
        # Relevance of the remaining candidates, and their highest similarity
        # to anything already selected
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # MMR score (named so it does not shadow this function)
        mmr_scores = (
            (1 - diversity) * candidate_similarities
            - diversity * target_similarities.reshape(-1, 1)
        )
        mmr_idx = candidates_idx[int(np.argmax(mmr_scores))]

        # Move the winner from the candidate pool to the selection
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

In [92]:
def max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n, nr_candidates):
    """Pick the ``top_n`` mutually least-similar keywords among the best candidates.

    The ``nr_candidates`` candidates most similar to the document form a pool.
    Every ``top_n``-sized combination from that pool is scored by the sum of
    its pairwise cosine similarities, and the lowest-scoring one is returned.

    Parameters
    ----------
    doc_embedding : array-like
        Embedding of the document. Only the first row of the similarity matrix
        is used, so a single-row input such as ``model.encode([doc])`` is expected.
    candidate_embeddings : array-like
        One embedding row per entry of ``candidates``, in the same order.
    candidates : sequence of str
        Candidate keywords / keyphrases.
    top_n : int
        Number of keywords wanted. Clamped to the number of candidates;
        values ``<= 0`` give an empty list.
    nr_candidates : int
        Size of the pool. It is widened to ``top_n`` when smaller, because
        ``top_n`` items cannot be chosen from fewer, and capped at
        ``len(candidates)``. The search is exhaustive, so cost grows
        combinatorially with this value.

    Returns
    -------
    list
        The selected entries of ``candidates``, ordered by increasing
        similarity to the document.
    """
    n_available = len(candidates)
    if top_n <= 0 or n_available == 0:
        return []

    # The original crashed with a TypeError when no combination of size top_n
    # existed in the pool; size the pool and the selection so one always does.
    pool_size = min(max(nr_candidates, top_n), n_available)
    n_select = min(top_n, pool_size)

    # Cosine similarities (not distances): document vs candidates, and candidates pairwise
    doc_similarity = cosine_similarity(doc_embedding, candidate_embeddings)
    pairwise_similarity = cosine_similarity(candidate_embeddings,
                                            candidate_embeddings)

    # argsort is ascending, so the tail holds the candidates closest to the document
    words_idx = list(doc_similarity.argsort()[0][-pool_size:])
    words_vals = [candidates[index] for index in words_idx]
    pool_similarity = pairwise_similarity[np.ix_(words_idx, words_idx)]

    # A single keyword has no pairs to compare: every combination would score 0
    # and the first one (the least relevant pool member) would win. Return the
    # most relevant candidate instead.
    if n_select == 1:
        return [words_vals[-1]]

    # Exhaustive search for the combination with the smallest summed pairwise
    # similarity; ties keep the first combination encountered.
    min_sim = np.inf
    best_combination = None
    for combination in itertools.combinations(range(pool_size), n_select):
        sim = sum(pool_similarity[i][j] for i in combination for j in combination if i != j)
        if best_combination is None or sim < min_sim:
            best_combination = combination
            min_sim = sim

    return [words_vals[idx] for idx in best_combination]

In [119]:
# Sample input text for the keyword-extraction cells (reads like a paper abstract).
doc ='''In this paper we propose a novel self-supervised
approach of keywords and keyphrases retrieval and extraction
by an end-to-end deep learning approach, which is trained by
contextually self-labelled corpus. Our proposed approach is
novel to use contextual and semantic features to extract the
keywords and has outperformed the state of the art. Through
the experiment the proposed approach has been proved to be
better in both semantic meaning and quality than the existing
popular algorithms of keyword extraction. In addition, we
propose to use contextual features from bidirectional
transformers to automatically label short-sentence corpus with
keywords and keyphrases to build the ground truth. This
process avoids the human time to label the keywords and do not
need any prior knowledge. To the best of our knowledge, our
published dataset in this paper is a fine domain-independent
corpus of short sentences with labelled keywords and
keyphrases in the NLP community.


'''

In [120]:
# Candidate phrases are every 1- to 3-word n-gram of the document,
# with English stop words removed by the vectorizer.
n_gram_range = (1, 3)
stop_words = "english"

# Extract candidate words/phrases
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
# The result is converted to a plain list so `candidates` keeps its original type.
if hasattr(count, "get_feature_names_out"):
    candidates = count.get_feature_names_out().tolist()
else:
    candidates = count.get_feature_names()

In [121]:
# Load the sentence-embedding model; the same model embeds both the document
# and the candidate phrases so their vectors are comparable.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# The document is wrapped in a one-element list so it is encoded as a batch of one
# (presumably giving a single-row 2-D array, which cosine_similarity needs — confirm).
doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)

In [122]:
# Baseline: keep the top_n candidates ranked purely by cosine similarity to the document.
top_n = 5
distances = cosine_similarity(doc_embedding, candidate_embeddings)
# Ascending sort of the single similarity row; the tail holds the closest candidates,
# so `keywords` ends up ordered from least to most similar.
top_indices = np.argsort(distances[0])[-top_n:]
keywords = [candidates[i] for i in top_indices]

In [123]:
# Show the similarity-only baseline keywords (ascending similarity, best match last).
print(keywords)

['propose novel self', 'we propose novel', 'keyword extraction', 'algorithms of keyword', 'of keyword extraction']


In [124]:
# Diversified selection: the 5 mutually least-similar phrases
# among the 13 candidates closest to the document.
max_sum_sim(
    doc_embedding,
    candidate_embeddings,
    candidates,
    top_n=5,
    nr_candidates=13,
)

['popular algorithms of',
 'keyphrases retrieval',
 'propose novel',
 'existing popular algorithms',
 'we propose novel']

In [125]:
# Diversified selection with MMR: 5 keywords, redundancy penalty weighted at 0.4.
mmr(
    doc_embedding,
    candidate_embeddings,
    candidates,
    top_n=5,
    diversity=0.4,
)

['of keyword extraction',
 'existing popular algorithms',
 'we propose novel',
 'outperformed the state',
 'deep learning approach']

In [126]:
notes = '''
- L'algo récupère les mots-clés avec plus ou moins de succès en fonction du document, du nombre de keywords qu'on cherche (1-3), du paramètre indiquant la diversité dans les algos mmr et max_sum_sim, et selon que les stop-words sont gardés ou non.
- Les paramètres nr_candidates = 13 et diversity = 0.4, respectivement pour max_sum_sim et mmr, sont un bon compromis pour équilibrer la variété des mots-clés et leur pertinence.
- En général, les mots-clés que BERT extrait ne correspondent pas toujours aux mots les plus pertinents décrivant le mieux le document, mais sont souvent aussi des mots-clés qui pourraient demander une définition ou une explication si le lecteur n'est pas familiarisé avec le domaine dont traite le document.

'''