# Preliminaries

In [1]:
import json
from keybert import KeyBERT
import random
from sklearn.feature_extraction.text import CountVectorizer

# Reading and Vectorizing GENIA collection

In [2]:
genia_path = '../0-data-preprocessed/GENIAcorpus3.02-preprocessed.json'

# Decode explicitly as UTF-8 (the standard JSON encoding) so the result does not
# depend on the platform's default locale encoding.
# NOTE(review): assumes the preprocessing step wrote UTF-8/ASCII JSON - confirm.
with open(genia_path, 'r', encoding='utf-8') as corpus_file:
  genia = json.load(corpus_file)

# Single space-joined string holding the whole collection.
# Presumably `genia` is a list of strings (one per document) - str.join requires it.
genia_str = ' '.join(genia)

In [3]:
# Making sure we use every word in the genia collection for vocab: each entry
# of `genia` is split on whitespace and all resulting tokens are collected.
pre_vocab = [doc.split() for doc in genia]

# Deduplicate and sort. Iterating a plain set of strings yields an order that
# can change between interpreter sessions (string hash randomization), which
# would make the vocabulary order differ from run to run; sorting pins it.
vocab = sorted({word for tokens in pre_vocab for word in tokens})

In [4]:
# Custom analyzer handed to the vectorizer: splitting on whitespace only lets
# every unique word of the genia file be used as a vocabulary entry.

def analyzer_custom(doc):
  """Split ``doc`` on runs of whitespace and return the list of tokens."""
  tokens = doc.split()
  return tokens

In [5]:
# Vectorizer restricted to the fixed corpus vocabulary; lowercasing is turned
# off and tokenization is delegated to the whitespace analyzer.
counter = CountVectorizer(
  analyzer=analyzer_custom,
  vocabulary=vocab,
  lowercase=False,
)

# Retrieving the KeyBERT rankings

In [6]:
# Seed Python's built-in random module with a fixed value for reproducibility.
random.seed(a=20230807)

In [7]:
# Ask KeyBERT for the top 5000 keywords of the joined collection, using the
# custom vectorizer defined earlier (this step takes a while).
# NOTE(review): KeyBERT's documentation describes keyphrase_ngram_range and
# stop_words as unused when a vectorizer is supplied - confirm; both are kept
# here exactly as before.
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(
  genia_str,
  vectorizer=counter,
  top_n=5000,
  keyphrase_ngram_range=(1, 1),
  stop_words=None,
)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [9]:
# Write the KeyBERT rankings to a JSON file. NOTE: this cell is live - running
# it overwrites any existing 'keybert-scores.json' in the working directory.
keybert_scores_name = 'keybert-scores.json'
with open(keybert_scores_name, mode='w') as scores_file:
  json.dump(keywords, scores_file)