# **Label every cluster with KeyBERT**

### **1. Import**

In [None]:
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
from collections import defaultdict
import sys
import os
import json
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.auto import tqdm
# Go up one directory level so that the `app` package becomes importable
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from app.utils.base_dir import BASE_DIR

  from .autonotebook import tqdm as notebook_tqdm


### **2. Load the model**

In [3]:
# Alternative embedding model (not used here):
#embed_model = SentenceTransformer("allenai/scibert_scivocab_uncased")
# Sentence-embedding model handed to KeyBERT below.
embed_model = SentenceTransformer("all-MiniLM-L6-v2") # using a mini model for performance
# Keyword extractor shared by the cells that follow.
kw_model = KeyBERT(model=embed_model)

### **3. Extract the keywords for each cluster**

**3.1 Function to extract the keywords per cluster**

In [7]:
import gc
def extract_keywords_keybert_clusterwise(
        texts,  # corpus of one cluster: a single string or an iterable of strings
        top_n=10,  # number of keywords to return
        keyphrase_ngram_range=(1, 2),  # consider unigrams and bigrams
        use_mmr=True,  # avoid redundant keywords
        diversity=0.6,  # trade-off between similarity and variety
        nr_candidates=20,
        stop_words='english'
    ):
    """Return the keywords KeyBERT extracts for one cluster's corpus.

    Args:
        texts: The text of the cluster. Either one string, or an iterable of
            strings, which is joined with single spaces into one document so
            that exactly one keyword list is produced for the cluster.
        top_n: Number of keywords requested from KeyBERT.
        keyphrase_ngram_range: ``(min_n, max_n)`` n-gram sizes for candidates.
        use_mmr: Forwarded to KeyBERT as ``use_mmr``.
        diversity: Forwarded to KeyBERT as ``diversity``.
        nr_candidates: Forwarded to KeyBERT as ``nr_candidates``.
            NOTE(review): KeyBERT appears to use this only together with
            ``use_maxsum``, which is not set here -- confirm whether it has
            any effect.
        stop_words: Forwarded to KeyBERT as ``stop_words``.

    Returns:
        A list of keyword strings; the scores returned by KeyBERT are dropped.
    """
    # KeyBERT returns one list per document when it receives several
    # documents, which would break the (keyword, score) unpacking below.
    # Collapsing the input to a single document keeps the result flat.
    if not isinstance(texts, str):
        texts = " ".join(texts)

    scored_keywords = kw_model.extract_keywords(
        texts,
        keyphrase_ngram_range=keyphrase_ngram_range,
        stop_words=stop_words,
        top_n=top_n,
        use_mmr=use_mmr,
        diversity=diversity,
        nr_candidates=nr_candidates,
    )
    # scored_keywords -> list of (keyword, score)
    keywords = [keyword for keyword, _score in scored_keywords]
    # Explicit collection after every extraction; presumably to keep memory
    # usage down between clusters -- TODO confirm it is still needed.
    gc.collect()
    return keywords
    

**3.2 Load the data**

In [5]:
# Input files produced by earlier processing steps.
no_clean_data_path = BASE_DIR/'data'/'processed'/'noCleanProcessedData.json'
clusters_path = BASE_DIR/'data'/'processed'/'clusters.json'

no_clean_data_array = []  # raw content of noCleanProcessedData.json
no_clean_data = {}        # filled by parse_to_dict() from no_clean_data_array
clusters = {}             # raw content of clusters.json

# Read explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale, which can mis-decode non-ASCII text on some systems.
with open(no_clean_data_path, 'r', encoding='utf-8') as f:
    no_clean_data_array = json.load(f)

with open(clusters_path, 'r', encoding='utf-8') as f:
    clusters = json.load(f)

def parse_to_dict(data:list):
    """Merge every mapping found in *data* into the global ``no_clean_data``.

    When the same key occurs in several entries, the last one wins.
    """
    for entry in data:
        no_clean_data.update(entry.items())


# Flatten the loaded list of mappings into a single lookup dictionary.
parse_to_dict(no_clean_data_array)



**3.3 Get cluster corpus**

In [None]:
# Maps each cluster number to ONE string holding the concatenated text of the
# cluster's first documents, e.g. {"11": "text 01 text 02"}.
clusterNumber_corpusText = defaultdict(str)
# Maximum number of documents taken from each cluster.
# NOTE(review): an earlier comment said "total 5", but the loop this replaces
# effectively used 4 documents; the value 4 is kept -- confirm the intent.
MAX_DOCS = 4

for cluster_number, titles_list in clusters.items():
    selected_titles = titles_list[:MAX_DOCS]
    if not selected_titles:
        # Clusters without titles get no entry at all.
        continue
    # Join with a space so the last word of one document is not fused with
    # the first word of the next one. A title missing from no_clean_data
    # raises KeyError.
    clusterNumber_corpusText[cluster_number] = " ".join(
        no_clean_data[title] for title in selected_titles
    )

**3.4 Extract keywords for all clusters**

In [None]:
def extract_keywords():
    """Extract keywords for every entry of ``clusterNumber_corpusText``.

    Returns:
        A dict of the form {cluster_number: [keyword01, keyword02, ...]}.
    """
    return {
        cluster_id: extract_keywords_keybert_clusterwise(corpus)
        for cluster_id, corpus in clusterNumber_corpusText.items()
    }


res = extract_keywords()

**3.5 Export to JSON**

In [None]:
import json

# Show the extracted keywords before persisting them.
print(res)

# Destination file for the per-cluster keywords.
kwr_path = BASE_DIR / 'data' / 'processed' / 'keywords.json'
with open(kwr_path, mode="w", encoding="utf-8") as out_file:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    json.dump(res, out_file, indent=2, ensure_ascii=False)