In [1]:
import pandas as pd
from keybert import KeyBERT
from transformers import AutoTokenizer, pipeline

from src.nlp.constants import (
    MODEL_MLM_DIR,
    MODEL_TC_DIR,
    MIMIC_FINAL,
    MIMIC_PROCESSED_CLEANED_DIR,
)
import pandas as pd
from keybert import KeyBERT
from transformers import AutoTokenizer, pipeline
from tqdm import tqdm
from tqdm.notebook import tqdm

# Register tqdm with pandas so that DataFrame/Series objects gain `progress_apply`.
tqdm.pandas()
# Don't show warnings: silence every Python warning for the rest of the session.
import warnings

warnings.filterwarnings("ignore")
# from dask.distributed import Client

# client = Client(n_workers=4)
import dask
import dask.dataframe as dd

# run keybert on dask df




2022-11-21 20:47:48.971649: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
#import sentencetransformers
from sentence_transformers import SentenceTransformer

In [5]:
def keyword_extraction(
    x: str, model, nr_candidates: int, top_n: int, device=None
) -> list[tuple]:
    """
    Extract keywords (with their scores) from one input text using KeyBERT.

    The KeyBERT instance is backed by a Hugging Face ``feature-extraction``
    pipeline and is cached per ``(model, device)``, so the transformer weights
    are loaded once instead of once per call.

    Parameters
    ----------
    x : str
        Input text.
    model : str
        Name or path of the Hugging Face model to use for keyword extraction.
    nr_candidates : int
        Number of candidates forwarded to ``KeyBERT.extract_keywords``.
    top_n : int
        Number of keywords forwarded to ``KeyBERT.extract_keywords``.
    device : int, optional
        Device index for the transformers pipeline (``-1`` = CPU, ``0`` = first
        GPU, ...). When omitted, GPU 0 is used if torch reports CUDA as
        available, otherwise the CPU.

    Returns
    -------
    list[tuple]
        ``(keyword, score)`` pairs as returned by KeyBERT; an empty list when
        ``x`` is not a string or contains only whitespace.
    """
    # Nothing to extract from: skip the (expensive) model load altogether.
    if not isinstance(x, str) or not x.strip():
        return []

    kw_model = _get_keybert(model, _resolve_device(device))

    # NOTE(review): both `use_maxsum` and `use_mmr` are requested, exactly as in
    # the original call. KeyBERT presumably applies only one of the two
    # strategies (MMR) — confirm against the installed KeyBERT version.
    keywords = kw_model.extract_keywords(
        x,
        keyphrase_ngram_range=(1, 2),
        stop_words="english",
        use_maxsum=True,
        nr_candidates=int(nr_candidates),
        top_n=int(top_n),
        use_mmr=True,
        diversity=0.5,
    )
    return keywords


# KeyBERT instances keyed by (model, device); filled lazily by _get_keybert.
_KEYBERT_CACHE: dict = {}


def _resolve_device(device):
    """Return the pipeline device index: the given one, else 0 if CUDA is usable, else -1 (CPU)."""
    if device is not None:
        return device
    try:
        import torch
    except ImportError:
        return -1
    return 0 if torch.cuda.is_available() else -1


def _get_keybert(model, device):
    """Build, or fetch from the cache, the KeyBERT instance for ``model`` on ``device``."""
    cache_key = (model, device)
    if cache_key not in _KEYBERT_CACHE:
        # `model_max_length` (the original had the typo `model_max_lenght`)
        # caps tokenised inputs at 512 tokens.
        tokenizer = AutoTokenizer.from_pretrained(model, model_max_length=512)
        # The device is chosen on the pipeline itself: KeyBERT has no `.to()`
        # method, and the previously hard-coded device index 6 only exists on
        # machines with at least seven GPUs.
        hf_model = pipeline(
            "feature-extraction",
            model=model,
            tokenizer=tokenizer,
            device=device,
        )
        _KEYBERT_CACHE[cache_key] = KeyBERT(model=hf_model)
    return _KEYBERT_CACHE[cache_key]

In [6]:



def keywords_from_TC_model(df: pd.DataFrame, model: str) -> pd.DataFrame:
    """
    Extract keywords from the input text using the TC model.

    Adds two columns to ``df`` (the frame is modified in place and returned):
    ``keywords_outcome_weights_TC`` with the raw result of
    ``keyword_extraction`` for each row, and ``transcription_f_TC`` with only
    the first element of each result item.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe with the columns ``TEXT_final_cleaned``, ``nr_candidates``
        and ``top_n``.
    model : str
        Path to the model to use for keyword extraction.

    Returns
    -------
    pd.DataFrame
        The same dataframe object, with the two TC columns added.
    """
    return _add_keyword_columns(
        df, model, "keywords_outcome_weights_TC", "transcription_f_TC"
    )


def keywords_from_MLM_model(df: pd.DataFrame, model: str) -> pd.DataFrame:
    """
    Extract keywords from the input text using the MLM model.

    Adds two columns to ``df`` (the frame is modified in place and returned):
    ``keywords_outcome_weights_MLM`` with the raw result of
    ``keyword_extraction`` for each row, and ``transcription_f_MLM`` with only
    the first element of each result item.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe with the columns ``TEXT_final_cleaned``, ``nr_candidates``
        and ``top_n``.
    model : str
        Path to the model to use for keyword extraction.

    Returns
    -------
    pd.DataFrame
        The same dataframe object, with the two MLM columns added.
    """
    return _add_keyword_columns(
        df, model, "keywords_outcome_weights_MLM", "transcription_f_MLM"
    )


def _add_keyword_columns(
    df: pd.DataFrame, model: str, weights_col: str, keywords_col: str
) -> pd.DataFrame:
    """Run keyword_extraction on every row and store the results in the two given columns."""
    if df.empty:
        # A row-wise apply on an empty frame does not yield a Series, so the
        # (empty) result columns are created explicitly.
        df[weights_col] = pd.Series([], index=df.index, dtype=object)
        df[keywords_col] = pd.Series([], index=df.index, dtype=object)
        return df

    # `progress_apply` only exists after tqdm.pandas() has been called; fall
    # back to a plain apply so the function also works without a progress bar.
    row_apply = getattr(df, "progress_apply", df.apply)
    df[weights_col] = row_apply(
        lambda row: keyword_extraction(
            row["TEXT_final_cleaned"], model, row["nr_candidates"], row["top_n"]
        ),
        axis=1,
    )
    # Keep only the first element (the keyword) of every result item.
    df[keywords_col] = df[weights_col].apply(
        lambda pairs: [pair[0] for pair in pairs]
    )
    return df


In [7]:


def save_dataframe(df: pd.DataFrame) -> None:
    """
    Write the final dataframe to the CSV file at ``MIMIC_FINAL``.

    Parameters
    ----------
    df : pd.DataFrame
        The final dataframe with the keywords and weights to save. The row
        index is not written to the file.
    """
    # MIMIC_FINAL is the output location imported from src.nlp.constants.
    df.to_csv(path_or_buf=MIMIC_FINAL, index=False)


# cut the text column down to at most 512 characters
def small_column_df(df: pd.DataFrame, max_chars: int = 512) -> pd.DataFrame:
    """
    Truncate the ``TEXT_final_cleaned`` column to at most ``max_chars`` characters.

    The cut is made on characters, not on model tokens. The dataframe is
    modified in place and returned.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe with the medical transcription text to extract keywords from
    max_chars : int, default 512
        Maximum number of characters to keep per text.

    Returns
    -------
    pd.DataFrame
        The same dataframe, with every text cut to ``max_chars`` characters

    Raises
    ------
    ValueError
        If ``max_chars`` is negative (a negative stop would silently drop
        characters from the end instead of capping the length).
    """
    if max_chars < 0:
        raise ValueError(f"max_chars must be non-negative, got {max_chars}")
    df["TEXT_final_cleaned"] = df["TEXT_final_cleaned"].str.slice(stop=max_chars)
    return df


# derive the candidate count for a single text from its word count
def calculate_optimal_candidate_nr(text: str) -> int:
    """
    Derive the number of keyword candidates to use for one text.

    The value is 20 % of the whitespace-separated word count, truncated to an
    integer and capped at 35.

    Parameters
    ----------
    text : str
        Text to extract keywords from; it is converted with ``str()`` first

    Returns
    -------
    int
        Number of candidates to use for keyword extraction
    """
    word_count = len(str(text).split())
    twenty_percent = int(word_count * 20 / 100)
    return min(twenty_percent, 35)

def top_n_keywords(num: int) -> int:
    """
    Calculate the number of keywords to extract from each text.

    Fewer than 10 candidates are all kept; from 10 candidates upwards half of
    them are kept, rounded with Python's built-in ``round``.

    Parameters
    ----------
    num : int
        Number of candidates to use for keyword extraction

    Returns
    -------
    int
        Number of keywords to extract from each text
    """
    if num < 10:
        return num
    # Exactly 10 used to match neither branch and was returned unchanged, so
    # 10 candidates produced more keywords than 11-19 candidates did.
    return round(num * 0.5)



In [8]:
# Lazily read the cleaned notes CSV with dask; malformed rows are skipped silently.
ddf = dd.read_csv(
    "../../data/processed/mimic_iii/diagnoses_noteevents_cleaned.csv",
    dtype={"TEXT_final_cleaned": "str"},
    engine="python",
    # Replaces error_bad_lines=False / warn_bad_lines=False, which pandas
    # deprecated in 1.3 and no longer accepts in 2.x.
    on_bad_lines="skip",
    encoding="utf8",
)


In [15]:
# Number of rows in the dask dataframe: shape[0] is lazy, so .compute() evaluates it.
print(ddf.shape[0].compute())
# NOTE(review): the value printed above is the row count, not the number of columns.

62391


In [9]:
#print(ddf.shape[0].compute())
# Keep only the first 100 rows for a quick experiment.
# NOTE(review): dask's head() is expected to return an in-memory pandas DataFrame, so
# `ddf` presumably stops being a dask object here — confirm before using dask-only methods on it.
ddf = ddf.head(100)
# Preview the first rows of the reduced frame.
ddf.head()

Unnamed: 0,TEXT,specialty,TEXT_final,TEXT_final_cleaned
0,Admission Date: [**2141-9-18**] ...,Cardiothoracic & Vascular,":\nhip pain\n\n:\n24yo woman with hx SLE, CKD(...",hip pain woman hx SLE CKD currently HD PD labi...
1,Admission Date: [**2187-9-19**] ...,Emergency Department,:\ns/p Motor cycle crash; left sided rib pain\...,Motor cycle crash left sided rib pain driver h...
2,Admission Date: [**2190-6-5**] Discharg...,Emergency Department,,
3,Admission Date: [**2101-4-30**] ...,Infectious Disease Specialty,:\nOSH transfer for sepsis\n\n:\n75 y/o M with...,sepsis hx type DM ESRD failed renal tx HD mont...
4,Admission Date: [**2146-9-15**] ...,Cardiothoracic & Vascular,":\nChest Pain, Abdominal Pain, Nausea/Vomiting...",Chest Pain Abdominal Pain Nausea Vomiting Righ...


In [10]:
# Per-row candidate count, derived from the word count of the text.
# NOTE(review): this is computed before the text is truncated further down, so the
# counts refer to the full text rather than to the shortened text — confirm this is intended.
ddf["nr_candidates"] = ddf["TEXT_final_cleaned"].apply(
        calculate_optimal_candidate_nr
    ) 
# Per-row number of keywords to keep, derived from the candidate count.
ddf["top_n"] = ddf["nr_candidates"].apply(top_n_keywords)
# Cast every value of the text column to str.
ddf["TEXT_final_cleaned"] = ddf["TEXT_final_cleaned"].astype(str)
# Cut every text to its first 512 characters.
ddf = small_column_df(ddf)

In [11]:
import os
# Use the GPU: expose only CUDA device 0 to this process.
# NOTE(review): this is set after the deep-learning libraries were imported in earlier
# cells; presumably it still applies as long as CUDA has not been initialised yet — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Model identifier passed to the keyword extraction below.
model_base = "emilyalsentzer/Bio_ClinicalBERT"
# Relative path to the masked-language-model directory; not used further in this cell.
MODEL_TMLM_DIR = os.path.join("..", "..","models", "nlp", "maskedlanguagemodel", "model")
# Run the keyword extraction with the base model (adds the *_TC columns to ddf).
ddf_tc = keywords_from_TC_model(ddf, model_base)

  0%|          | 0/100 [00:00<?, ?it/s]

No sentence-transformers model found with name /Users/tara-sophiatumbraegel/.cache/torch/sentence_transformers/emilyalsentzer_Bio_ClinicalBERT. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /Users/tara-sophiatumbraegel/.cache/torch/sentence_transformers/emilyalsentzer_Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly id

AttributeError: 'SentenceTransformer' object has no attribute 'config'

In [32]:
# Run the keyword extraction with the fine-tuned TC model.
ddf_tc = keywords_from_TC_model(ddf, MODEL_TC_DIR)
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# Scratch check: load a sentiment model and its tokenizer strictly from the
# local cache (local_files_only=True).
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english", local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english", local_files_only=True)

# The cell used to end in a dangling "(" — an unfinished call that made the whole
# cell a SyntaxError. The pipeline is now only constructed; call it with a text to classify.
sentiment_pipeline = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

in first function


  0%|          | 0/100 [00:00<?, ?it/s]

in second function


ValueError: Connection error, and we cannot find the requested files in the cached path. Please try again or make sure your Internet connection is on.

In [None]:
# make the dataset smaller by randomly choosing 60% of the rows
                

In [None]:
def main() -> None:
    """
    Main function to run the script.

    Reads the cleaned notes CSV, keeps the first 100 rows, derives the
    per-row ``nr_candidates`` and ``top_n`` values, truncates the text, runs
    the keyword extraction with the TC and the MLM model and saves the result.
    """

    notes_lazy = dd.read_csv(  # type: ignore
        MIMIC_PROCESSED_CLEANED_DIR,
        dtype={"TEXT_final_cleaned": "str"},
        engine="python",
        # Replaces error_bad_lines=False / warn_bad_lines=False, which pandas
        # deprecated in 1.3 and no longer accepts in 2.x.
        on_bad_lines="skip",
        encoding="utf8",
    )
    # use only small set
    notes = notes_lazy.head(100)
    # dask's head() is expected to return a pandas DataFrame already; only
    # call .compute() if a lazy object came back (the unconditional
    # .compute() used before fails on a pandas frame).
    if hasattr(notes, "compute"):
        notes = notes.compute()

    # Missing texts would otherwise stay NaN through truncation and reach the
    # keyword extraction as a float.
    notes["TEXT_final_cleaned"] = notes["TEXT_final_cleaned"].fillna("").astype(str)

    notes["nr_candidates"] = notes["TEXT_final_cleaned"].apply(
        calculate_optimal_candidate_nr
    )
    # Same rule as the interactive cells of this notebook (was int(x * 0.5)).
    notes["top_n"] = notes["nr_candidates"].apply(top_n_keywords)
    notes = small_column_df(notes)

    # Each call adds its own pair of columns, so chaining the calls gives the
    # combined frame. The previous pd.concat(..., axis=1) joined two references
    # to the same mutated frame and duplicated every column.
    notes = keywords_from_TC_model(notes, MODEL_TC_DIR)
    notes = keywords_from_MLM_model(notes, MODEL_MLM_DIR)
    # save
    save_dataframe(notes)


# Path: src/Keyword_Bert_Training.py
# Run the pipeline when this code is executed as a script.
# NOTE(review): inside a notebook kernel __name__ is presumably "__main__" as well, so
# executing this cell would start main() — confirm that is intended.
if __name__ == "__main__":
    main()

In [None]:
# reduce size of dataset randomly to 60%
                