In [1]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence_transformers)
  Downloading huggingface_hub-0.14.1-py3-

In [2]:
import itertools
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [3]:
# Sample document for keyword extraction: a short passage about supervised
# learning. The citation markers ([1], [2]), the line breaks and the leading
# indentation are all part of the string that gets vectorized and embedded.
doc = """
         Supervised learning is the machine learning task of 
         learning a function that maps an input to an output based 
         on example input-output pairs.[1] It infers a function 
         from labeled training data consisting of a set of 
         training examples.[2] In supervised learning, each 
         example is a pair consisting of an input object 
         (typically a vector) and a desired output value (also 
         called the supervisory signal). A supervised learning 
         algorithm analyzes the training data and produces an 
         inferred function, which can be used for mapping new 
         examples. An optimal scenario will allow for the algorithm 
         to correctly determine the class labels for unseen 
         instances. This requires the learning algorithm to  
         generalize from the training data to unseen situations 
         in a 'reasonable' way (see inductive bias).
      """

In [4]:
# (min_n, max_n) sizes of the candidate n-grams; (3, 3) means trigrams only.
n_gram_range = (3, 3)

In [16]:
class DoKeyBERT:
    """Extract keyphrases from a single document, KeyBERT-style.

    Candidate n-grams are pulled from the document with a ``CountVectorizer``,
    the document and every candidate are embedded with a
    ``SentenceTransformer`` model, and keyphrases are then selected with
    either Max Sum Similarity (``max_sum_sim``) or Maximal Marginal Relevance
    (``mmr``).

    Attributes:
        doc: The raw document text.
        n_gram_range: ``(min_n, max_n)`` tuple passed to ``CountVectorizer``.
        stop_words: Stop-word setting passed to ``CountVectorizer``.
        count: The fitted ``CountVectorizer`` (``None`` until ``vectorize``).
        candidates: Candidate phrases (empty until ``vectorize``).
        model: The loaded ``SentenceTransformer`` (``None`` until ``embed``).
        doc_embedding: Embedding of ``doc`` (``None`` until ``embed``).
        candidate_embeddings: Embeddings of ``candidates`` (``None`` until
            ``embed``).
    """

    def __init__(self, doc, n_gram_range):
        """Store the document and n-gram range; nothing is computed yet."""
        self.doc = doc
        self.n_gram_range = n_gram_range
        self.stop_words = 'english'

        self.count = None
        self.candidates = []

        self.model = None
        self.doc_embedding = None
        self.candidate_embeddings = None

        # Parameter values noted by the original author (presumably the
        # settings to experiment with):
        #   top_n = [5, 10]
        #   nr_candidates = [10, 20]
        #   diversity = [0.2, 0.5, 0.7]

    def vectorize(self):
        """Fit a ``CountVectorizer`` on the document and collect its n-grams
        as the candidate phrases."""
        self.count = CountVectorizer(ngram_range=self.n_gram_range, stop_words=self.stop_words).fit([self.doc])
        self.candidates = self.count.get_feature_names_out()

    def embed(self):
        """Embed the document and every candidate phrase.

        The model is loaded only once per instance; later calls reuse it so
        that repeated ``run`` calls do not reload the weights each time.
        """
        if self.model is None:
            self.model = SentenceTransformer('distilbert-base-nli-mean-tokens')
        self.doc_embedding = self.model.encode([self.doc])
        self.candidate_embeddings = self.model.encode(self.candidates)

    def max_sum_sim(self, top_n, nr_candidates):
        """Select keyphrases with Max Sum Similarity.

        The ``nr_candidates`` phrases most similar to the document are
        shortlisted, then every ``top_n``-sized combination of the shortlist
        is scored by the sum of its pairwise similarities and the combination
        with the smallest sum (the most mutually diverse one) is returned.

        Note that the number of combinations examined grows combinatorially
        with ``nr_candidates`` and ``top_n``.

        Args:
            top_n: Number of keyphrases to return. Values larger than the
                shortlist are clamped to the shortlist size.
            nr_candidates: Size of the shortlist. Values larger than the
                number of candidates are clamped.

        Returns:
            A list of at most ``top_n`` candidate phrases; an empty list when
            ``top_n`` or ``nr_candidates`` is not positive or there are no
            candidates.
        """
        n_candidates = len(self.candidates)
        shortlist_size = min(nr_candidates, n_candidates)
        if top_n <= 0 or shortlist_size <= 0:
            # Also guards the ``[-0:]`` slicing pitfall, which would have
            # selected *every* candidate for nr_candidates == 0.
            return []
        # Asking for more phrases than were shortlisted would leave no
        # combination to pick from, so cap the request.
        top_n = min(top_n, shortlist_size)

        # Similarity between the document and each candidate.
        distances = cosine_similarity(self.doc_embedding, self.candidate_embeddings)

        # Pairwise similarity between candidates.
        distances_candidates = cosine_similarity(self.candidate_embeddings,
                                                 self.candidate_embeddings)

        # Shortlist the nr_candidates phrases most similar to the document
        # (argsort is ascending, so they are at the tail).
        ranked = distances.argsort()[0]
        words_idx = list(ranked[n_candidates - shortlist_size:])
        words_vals = [self.candidates[index] for index in words_idx]
        distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

        # Find the combination of shortlisted phrases that are least similar
        # to one another.
        min_sim = np.inf
        candidate = None
        for combination in itertools.combinations(range(len(words_idx)), top_n):
            sim = sum(distances_candidates[i][j] for i in combination for j in combination if i != j)
            if sim < min_sim:
                candidate = combination
                min_sim = sim

        if candidate is None:
            # Only reachable if every score compared false against inf
            # (e.g. NaN similarities); return nothing rather than crash.
            return []
        return [words_vals[idx] for idx in candidate]

    def mmr(self, top_n, diversity):
        """Select keyphrases with Maximal Marginal Relevance.

        The phrase most similar to the document is chosen first. Each further
        phrase maximizes
        ``(1 - diversity) * sim(phrase, doc) - diversity * max sim(phrase, chosen)``.

        Args:
            top_n: Number of keyphrases to return. Values larger than the
                number of candidates are clamped.
            diversity: Weight of the redundancy penalty relative to
                document relevance.

        Returns:
            A list of at most ``top_n`` candidate phrases in selection order;
            an empty list when ``top_n`` is not positive or there are no
            candidates.
        """
        words = self.candidates
        if top_n <= 0 or len(words) == 0:
            return []
        # Cannot pick more phrases than exist; without the cap the loop below
        # would run argmax over an empty pool.
        top_n = min(top_n, len(words))

        # Similarity between each candidate and the document.
        word_doc_similarity = cosine_similarity(self.candidate_embeddings, self.doc_embedding)

        # Pairwise similarity between candidates.
        word_similarity = cosine_similarity(self.candidate_embeddings)

        # Index of the candidate most similar to the document,
        # e.g. keywords_idx = [2] if candidate 2 scores highest.
        keywords_idx = [np.argmax(word_doc_similarity)]

        # Indices of all remaining candidates,
        # e.g. candidates_idx = [0, 1, 3, 4, ...] if candidate 2 was chosen.
        candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

        # The best keyword is already chosen, so iterate top_n - 1 more times
        # (e.g. 4 iterations for top_n = 5).
        for _ in range(top_n - 1):
            candidate_similarities = word_doc_similarity[candidates_idx, :]
            target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

            # MMR score of every remaining candidate.
            mmr_scores = (1-diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
            mmr_idx = candidates_idx[np.argmax(mmr_scores)]

            # Move the winner from the pool to the chosen keywords.
            keywords_idx.append(mmr_idx)
            candidates_idx.remove(mmr_idx)

        return [words[idx] for idx in keywords_idx]

    def run(self, top_n, nr_candidates, diversity):
        """Vectorize, embed, then print the keyphrases chosen by both
        ``max_sum_sim`` and ``mmr`` for the given settings."""

        self.vectorize()
        self.embed()

        print("Max Sum Similarity Keywords for top_n={}, nr_candidates={}".format(top_n, nr_candidates))
        print(self.max_sum_sim(top_n, nr_candidates))
        print("MMR Keywords for top_n={}, diversity={}".format(top_n, diversity))
        print(self.mmr(top_n, diversity))



In [17]:
# Selection settings for this run: how many keyphrases to return, how many
# top candidates Max Sum Similarity shortlists, and the MMR diversity weight.
top_n, nr_candidates, diversity = 10, 20, 0.5

keyBERT = DoKeyBERT(doc, n_gram_range)
keyBERT.run(top_n=top_n, nr_candidates=nr_candidates, diversity=diversity)

Max Sum Similarity Keywords for top_n=10, nr_candidates=20
['set training examples', 'training data produces', 'generalize training data', 'supervised learning example', 'analyzes training data', 'machine learning task', 'requires learning algorithm', 'learning function maps', 'supervised learning algorithm', 'learning machine learning']
MMR Keywords for top_n=10, diversity=0.5
['algorithm generalize training', 'supervised learning algorithm', 'learning machine learning', 'learning algorithm analyzes', 'mapping new examples', 'algorithm correctly determine', 'learning algorithm generalize', 'learning function maps', 'algorithm analyzes training', 'requires learning algorithm']
