In [None]:
pip install keybert



In [None]:
from keybert import KeyBERT

In [None]:
!pip install sentence_transformers



In [None]:
import numpy as np
import itertools

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [None]:
# Sample document for keyword extraction: three passages joined by literal '</br>' markers
# (the text refers to itself as "this Declaration").
doc = "All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood.</br>Everyone is entitled to all the rights and freedoms set forth in this Declaration, without distinction of any kind, such as race, colour, sex, language, religion, political or other opinion, national or social origin, property, birth or other status. Furthermore, no distinction shall be made on the basis of the political, jurisdictional or international status of the country or territory to which a person belongs, whether it be independent, trust, non-self-governing or under any other limitation of sovereignty.</br>All are equal before the law and are entitled without any discrimination to equal protection of the law. All are entitled to equal protection against any discrimination in violation of this Declaration and against any incitement to such discrimination."

In [None]:
# Cut the sample document into its individual passages at every literal '</br>' marker.
doclist = doc.split(sep='</br>')

In [None]:
# Display the passages to confirm the text was divided at each '</br>' marker.
doclist

['All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood.',
 'Everyone is entitled to all the rights and freedoms set forth in this Declaration, without distinction of any kind, such as race, colour, sex, language, religion, political or other opinion, national or social origin, property, birth or other status. Furthermore, no distinction shall be made on the basis of the political, jurisdictional or international status of the country or territory to which a person belongs, whether it be independent, trust, non-self-governing or under any other limitation of sovereignty.',
 'All are equal before the law and are entitled without any discrimination to equal protection of the law. All are entitled to equal protection against any discrimination in violation of this Declaration and against any incitement to such discrimination.']

In [None]:
# Second sample document: a short passage describing supervised learning.
# The bracketed markers such as [1] and the line indentation are part of the string.
doc_wiki = """
         Supervised learning is the machine learning task of 
         learning a function that maps an input to an output based 
         on example input-output pairs.[1] It infers a function 
         from labeled training data consisting of a set of 
         training examples.[2] In supervised learning, each 
         example is a pair consisting of an input object 
         (typically a vector) and a desired output value (also 
         called the supervisory signal). A supervised learning 
         algorithm analyzes the training data and produces an 
         inferred function, which can be used for mapping new 
         examples. An optimal scenario will allow for the algorithm 
         to correctly determine the class labels for unseen 
         instances. This requires the learning algorithm to  
         generalize from the training data to unseen situations 
         in a 'reasonable' way (see inductive bias).
      """

In [None]:
# Candidate keywords for the wiki passage: its vocabulary with English stop words removed.
count_wiki = CountVectorizer(stop_words="english")
count_wiki.fit([doc_wiki])
candidates_wiki = count_wiki.get_feature_names_out()

# Report how many candidates were found and preview the first five.
print('키워드 개수 :', len(candidates_wiki))
print('키워드 다섯개만 출력 :', candidates_wiki[:5])

키워드 개수 : 50
키워드 다섯개만 출력 : ['algorithm' 'allow' 'analyzes' 'based' 'bias']


In [None]:
# Load the sentence encoder, then embed the whole passage (as a one-element batch)
# and every candidate keyword with the same model so the vectors are comparable.
model_wiki = SentenceTransformer('distilbert-base-nli-mean-tokens')
doc_embedding_wiki = model_wiki.encode([doc_wiki])
candidate_embeddings_wiki = model_wiki.encode(candidates_wiki)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
# Rank the candidates by cosine similarity to the passage and keep the best `top_n`.
# Note: despite its name, `distances_wiki` holds similarities (higher = closer).
top_n = 5
distances_wiki = cosine_similarity(doc_embedding_wiki, candidate_embeddings_wiki)
# argsort is ascending, so the tail of the ordering holds the most similar candidates (best last).
keywords_wiki = [
    candidates_wiki[position]
    for position in np.argsort(distances_wiki[0])[-top_n:]
]
print(keywords_wiki)

['mapping', 'class', 'training', 'algorithm', 'learning']


In [None]:
#불용어제외키워드후보추출
count = CountVectorizer(stop_words = "english").fit([doc])
candidates = count.get_feature_names_out()
#문서,키워드후보 임베딩벡터 추출
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)
#상위5개 
top = 5
cossim = cosine_similarity(doc_embedding, candidate_embeddings)
keywords = [candidates[index] for index in cossim.argsort()[0][-top:]]

In [None]:
# Show the top keywords, ordered from least to most similar to the document.
keywords

['brotherhood', 'independent', 'sovereignty', 'free', 'freedoms']

In [None]:
def max_sum_sim(doc_embedding, candidate_embeddings, words, top_n, nr_candidates):
    """Select ``top_n`` mutually dissimilar keywords from the best-matching candidates.

    The ``nr_candidates`` candidates most similar to the document are shortlisted,
    then every ``top_n``-sized combination of the shortlist is scored by the sum of
    its pairwise cosine similarities; the combination with the smallest sum wins.

    Args:
        doc_embedding: Document embedding, passed as the first argument of
            ``cosine_similarity`` (one row for the document).
        candidate_embeddings: One embedding row per entry of ``words``.
        words: Candidate keywords, index-aligned with ``candidate_embeddings``.
        top_n: Number of keywords to return.
        nr_candidates: Size of the shortlist taken from the most document-similar
            candidates. The search is exhaustive over combinations, so large
            values get expensive quickly.

    Returns:
        A list with ``top_n`` entries of ``words``.

    Raises:
        ValueError: If ``top_n`` exceeds the number of shortlisted candidates.
    """
    # Similarity between the document and each candidate keyword.
    cossim = cosine_similarity(doc_embedding, candidate_embeddings)

    # Pairwise similarity between the candidate keywords themselves.
    cossim_candidates = cosine_similarity(candidate_embeddings, candidate_embeddings)

    # Shortlist the nr_candidates candidates closest to the document (argsort is ascending).
    words_idx = list(cossim.argsort()[0][-nr_candidates:])
    if top_n > len(words_idx):
        raise ValueError(
            f"top_n ({top_n}) cannot exceed the number of shortlisted candidates ({len(words_idx)})"
        )

    # Look the shortlist up in the `words` argument. The previous version read a
    # notebook-level `candidates` variable here, which returned words from an
    # unrelated vocabulary whenever the function was called with other data.
    words_vals = [words[index] for index in words_idx]
    cossim_candidates = cossim_candidates[np.ix_(words_idx, words_idx)]

    # Exhaustively search for the combination whose members are least similar to each other.
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum(cossim_candidates[i][j] for i in combination for j in combination if i != j)
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [None]:
# Five keywords chosen from the 10 candidates closest to the document.
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=10)

['self', 'belongs', 'brotherhood', 'free', 'freedoms']

In [None]:
# Widen the shortlist to 20 candidates.
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=20)

['national', 'kind', 'law', 'religion', 'free']

In [None]:
# Widen the shortlist further to 30 candidates.
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=30)

['birth', 'sex', 'law', 'religion', 'free']

In [None]:
# Narrow shortlist: only 7 candidates to choose the five keywords from.
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=7)

['equal', 'brotherhood', 'sovereignty', 'free', 'freedoms']

In [None]:
# Shortlist size equals top_n, so there is exactly one combination: the plain top five.
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=5)

['brotherhood', 'independent', 'sovereignty', 'free', 'freedoms']

In [None]:
def mmr(doc_embedding, candidate_embeddings, words, top_n, diversity):
    """Pick keywords with Maximal Marginal Relevance (MMR).

    Starts from the candidate most similar to the document, then repeatedly adds
    the candidate maximising
    ``(1 - diversity) * sim(candidate, doc) - diversity * max sim(candidate, selected)``.

    Args:
        doc_embedding: Document embedding, passed as the second argument of
            ``cosine_similarity`` (one row for the document).
        candidate_embeddings: One embedding row per entry of ``words``.
        words: Candidate keywords, index-aligned with ``candidate_embeddings``.
        top_n: Number of keywords requested. If it exceeds ``len(words)``, every
            word is returned; if it is not positive, an empty list is returned.
        diversity: Weight of the redundancy penalty; 0 ranks purely by similarity
            to the document, larger values favour keywords unlike those already chosen.

    Returns:
        A list of at most ``top_n`` entries of ``words`` in selection order.
    """
    # Nothing to select: avoid returning a keyword nobody asked for, and avoid
    # calling argmax on an empty similarity matrix.
    if top_n <= 0 or len(words) == 0:
        return []

    # Similarity of each candidate keyword to the document, shape (n_candidates, 1).
    word_doc_similarity = cosine_similarity(candidate_embeddings, doc_embedding)

    # Pairwise similarity between the candidate keywords.
    word_similarity = cosine_similarity(candidate_embeddings)

    # Seed the selection with the keyword most similar to the document.
    keywords_idx = [np.argmax(word_doc_similarity)]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    # Never iterate more times than there are candidates left; previously
    # top_n > len(words) crashed in argmax on an empty array.
    for _ in range(min(top_n - 1, len(words) - 1)):
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        # For each remaining candidate, its highest similarity to any selected keyword.
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # MMR score: relevance to the document minus redundancy with the selection.
        # (Named mmr_scores so the local does not shadow this function's own name.)
        mmr_scores = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr_scores)]

        # Move the winner from the remaining candidates to the selected keywords.
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

In [None]:
# High diversity: strongly penalise keywords similar to ones already selected.
mmr(doc_embedding, candidate_embeddings, candidates, top_n=5, diversity=0.7)

['freedoms', 'religion', 'race', 'sex', 'birth']

In [None]:
# Low diversity: mostly rank by similarity to the document.
mmr(doc_embedding, candidate_embeddings, candidates, top_n=5, diversity=0.3)

['freedoms', 'brotherhood', 'free', 'sovereignty', 'equal']

In [None]:
def keyext(doc, n_gram_range, top, func):
    """Extract keywords from ``doc`` using sentence-embedding similarity.

    Candidate phrases are the n-grams of ``doc`` (English stop words removed);
    the document and the candidates are embedded with the
    'distilbert-base-nli-mean-tokens' sentence encoder, which is loaded on every call.

    Args:
        doc: Text to extract keywords from.
        n_gram_range: ``(min_n, max_n)`` tuple forwarded to ``CountVectorizer``.
        top: Number of keywords for the plain similarity ranking. Only used when
            ``func`` is neither 'mss' nor 'mmr'; those modes prompt for their own count.
        func: 'mss' to diversify with ``max_sum_sim``, 'mmr' to diversify with
            ``mmr``; any other value returns the plain top-``top`` ranking.

    Returns:
        A list of keywords. In 'mss' and 'mmr' mode the parameters are read
        interactively via ``input()``.
    """
    # Candidate keywords: n-grams of the document with English stop words removed.
    count = CountVectorizer(ngram_range=n_gram_range, stop_words="english").fit([doc])
    candidates = count.get_feature_names_out()

    # Embed the document and every candidate with the same sentence encoder.
    model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    doc_embedding = model.encode([doc])
    candidate_embeddings = model.encode(candidates)

    if func == 'mss':
        a = int(input("top n 입력:"))
        b = int(input("nr_candidate 입력:"))
        return max_sum_sim(doc_embedding, candidate_embeddings, candidates, a, b)

    if func == 'mmr':
        a = int(input("top n 입력:"))
        # Diversity is a fractional weight (the notebook calls mmr with 0.7 and 0.3),
        # so parse it as float; int() rejected inputs such as "0.7".
        b = float(input("diversity 입력:"))
        return mmr(doc_embedding, candidate_embeddings, candidates, a, b)

    # Default: the `top` candidates most similar to the document (ascending, best last).
    cossim = cosine_similarity(doc_embedding, candidate_embeddings)
    return [candidates[index] for index in cossim.argsort()[0][-top:]]

In [None]:
# keyext takes (doc, n_gram_range, top, func); the old two-argument call raises TypeError.
# Unigrams with no diversification give the plain top-5 ranking.
keyext(doc, (1, 1), 5, None)

['brotherhood', 'independent', 'sovereignty', 'free', 'freedoms']

In [None]:
# keyext takes (doc, n_gram_range, top, func); the old two-argument call raises TypeError.
# Unigrams with no diversification give the plain top-10 ranking.
keyext(doc, (1, 1), 10, None)

['self',
 'belongs',
 'dignity',
 'trust',
 'equal',
 'brotherhood',
 'independent',
 'sovereignty',
 'free',
 'freedoms']

In [None]:
# Pass the n-gram range explicitly; without it 5 lands in n_gram_range and func is missing.
keyext(doc, (1, 1), 5, 'mss')

top n 입력:5
nr_candidate 입력:10


['self', 'belongs', 'brotherhood', 'free', 'freedoms']

In [None]:
# Pass the n-gram range explicitly; without it 5 lands in n_gram_range and func is missing.
keyext(doclist[0], (1, 1), 5, 'mss')

top n 입력:5
nr_candidate 입력:10


['conscience', 'beings', 'birth', 'colour', 'belongs']

In [None]:
# Pass the n-gram range explicitly; without it 5 lands in n_gram_range and func is missing.
keyext(doclist[1], (1, 1), 5, 'mss')

top n 입력:5
nr_candidate 입력:10


['incitement', 'forth', 'international', 'person', 'conscience']

In [None]:
# Pass the n-gram range explicitly; without it 5 lands in n_gram_range and func is missing.
keyext(doclist[0], (1, 1), 5, 'mmr')

top n 입력:5
diversity 입력:10


['brotherhood', 'human', 'act', 'born', 'reason']

In [None]:
# Same passage, but with three-word phrases as the keyword candidates.
keyext(doclist[0], n_gram_range=(3, 3), top=5, func='mmr')

top n 입력:5
diversity 입력:10


['born free equal',
 'human beings born',
 'reason conscience act',
 'dignity rights endowed',
 'act spirit brotherhood']