In [4]:
from keybert import KeyBERT
from transformers import AutoModelForMaskedLM, AutoTokenizer, Pipeline, pipeline

In [7]:
def load_pipeline(model_name: str = "tarasophia/Bio_ClinicalBERT_medical"):
    """Build a Hugging Face feature-extraction pipeline to use as a KeyBERT backend.

    Args:
        model_name: Hub id or local path of the checkpoint to load. Defaults to
            the Bio_ClinicalBERT checkpoint this notebook was written for.

    Returns:
        A ``feature-extraction`` pipeline wrapping the checkpoint's tokenizer
        and its base (headless) encoder.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)

    # The feature-extraction pipeline returns the model's first output. For a
    # masked-LM model that is the vocabulary logits of the LM head rather than
    # hidden states, so hand the pipeline the headless encoder instead.
    encoder = AutoModelForMaskedLM.from_pretrained(model_name).base_model

    return pipeline(
        "feature-extraction",
        model=encoder,
        tokenizer=tokenizer,
    )



def keyword_extraction(
    text: str,
    pipe: Pipeline,
    nr_candidates: int,
    top_n: int,
    use_mmr: bool = True,
    diversity: float = 0.5,
) -> list[tuple[str, float]]:
    """Extract 1- and 2-gram keyphrases from ``text`` with KeyBERT.

    Args:
        text: Document to extract keyphrases from.
        pipe: Feature-extraction pipeline used by KeyBERT to embed the
            document and the candidate phrases.
        nr_candidates: Size of the candidate pool for Max Sum Distance.
            Only used when ``use_mmr`` is False.
        top_n: Maximum number of keyphrases to return.
        use_mmr: If True (default), diversify results with Maximal Marginal
            Relevance; if False, use Max Sum Distance instead.
        diversity: MMR diversity setting passed to KeyBERT. Only used when
            ``use_mmr`` is True.

    Returns:
        The ``(keyphrase, score)`` pairs produced by
        ``KeyBERT.extract_keywords``.
    """
    kw_model = KeyBERT(model=pipe)
    return kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words="english",
        # KeyBERT applies MMR in preference to Max Sum when both flags are
        # set, which silently ignores nr_candidates. Enable exactly one
        # strategy so the arguments that are passed actually take effect.
        use_mmr=use_mmr,
        diversity=diversity,
        use_maxsum=not use_mmr,
        nr_candidates=nr_candidates,
        top_n=top_n,
    )

# Smoke test: run the extractor on one short symptom description.
test_str = "I have a headache and a sore throat"
pipe = load_pipeline()
keywords_weights = keyword_extraction(
    text=test_str,
    pipe=pipe,
    nr_candidates=20,
    top_n=10,
)
# Per KeyBERT, each item is a (phrase, score) pair; keep only the phrase.
keywords = [phrase for phrase, _score in keywords_weights]

In [8]:
# Display the extracted keyphrases (first element of each KeyBERT result).
keywords

['headache sore', 'sore throat', 'sore', 'headache', 'throat']