# unsupervised

In [26]:
# imports
import multiprocessing
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import transformers
from sklearn.model_selection import train_test_split
from datasets import load_dataset, load_metric, Dataset
from transformers import BertForMaskedLM, DistilBertForMaskedLM
from transformers import BertTokenizer, DistilBertTokenizer
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import DataCollatorForLanguageModeling
from tokenizers import BertWordPieceTokenizer
import os

#from constants import *
from transformers import (
    Trainer,
    TrainingArguments,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoConfig,
    AutoModelForMaskedLM,
)

In [27]:
# constants
# RAW DATA NLP: directories for the unprocessed corpora
NLP_RAW_PATH_DIR = os.path.join("data", "raw", "nlp")
MTSAMPLES_RAW_PATH_DIR = os.path.join(NLP_RAW_PATH_DIR, "mtsamples")
PATIENT_NOTES_RAW_PATH_DIR = os.path.join(NLP_RAW_PATH_DIR, "patient_notes")

# PROCESSED DATA NLP: directories for the cleaned corpora
NLP_PROCESSED_PATH_DIR = os.path.join("data", "processed", "nlp")
MTSAMPLES_PROCESSED_PATH_DIR = os.path.join(NLP_PROCESSED_PATH_DIR, "mtsamples")
PATIENT_NOTES_PROCESSED_PATH_DIR = os.path.join(NLP_PROCESSED_PATH_DIR, "patient_notes")

# MODEL_UNSUPERVISED: checkpoint name and output directories of the masked-LM run
# NOTE(review): MODEL_UNSUPERVISED_NAME is not referenced by the visible code,
# which loads "bert-base-cased" instead - confirm which checkpoint is intended.
MODEL_UNSUPERVISED_NAME = "emilyalsentzer/Bio_ClinicalBERT"
MODEL_UNSUPERVISED_CHECKPOINTS_DIR = os.path.join(
    "models", "nlp", "unsupervised", "checkpoints"
)
MODEL_UNSUPERVISED_MODEL_DIR = os.path.join("models", "nlp", "unsupervised", "model")

# MODEL_SEMI_SUPERVISED
# NOTE(review): no MODEL_SEMI_SUPERVISED_* constants are defined in this cell,
# although later cells reference them - confirm where they should come from.


# configs
# HYPERPARAMS
SEED_SPLIT = 0  # random_state of the train/validation split
SEED_TRAIN = 0  # seed handed to TrainingArguments

# NOTE(review): MAX_SEQ_LEN is not used by the visible code; tokenization is
# hard-coded to 512 tokens.
MAX_SEQ_LEN = 128
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
LEARNING_RATE = 2e-5
LR_WARMUP_STEPS = 100
WEIGHT_DECAY = 0.01

In [30]:
def load_datasets(data_path, max_rows=100, test_size=0.15):
    """Read the transcription CSV and split it into train/validation datasets.

    Args:
        data_path: Path to a CSV file containing a ``transcription`` column.
        max_rows: Only the first ``max_rows`` rows are kept before splitting.
            The default of 100 reproduces the previous hard-coded subset;
            pass ``None`` to use every row of the file.
        test_size: Fraction of the rows assigned to the validation split.

    Returns:
        ``(dataset_train, dataset_val)``: two ``Dataset`` objects that hold
        only the ``transcription`` column, with missing values dropped.
    """
    dtf_mlm = pd.read_csv(data_path)
    if max_rows is not None:
        dtf_mlm = dtf_mlm.head(max_rows)

    # Seeded split so repeated runs see the same validation rows.
    df_train, df_valid = train_test_split(
        dtf_mlm, test_size=test_size, random_state=SEED_SPLIT
    )

    def to_dataset(frame):
        # Rows without a transcription cannot be tokenized, so drop them.
        return Dataset.from_pandas(frame[["transcription"]].dropna())

    return to_dataset(df_train), to_dataset(df_valid)

In [28]:
def load_tokenizer():
    """Load the pretrained ``bert-base-cased`` tokenizer.

    Only ``model_max_length`` is passed to the constructor. The former
    ``truncate``/``max_length``/``padding`` keyword arguments are not
    constructor options; truncation and padding are requested at call time
    (see ``tokenize_function``).

    Returns:
        A ``BertTokenizer`` whose ``model_max_length`` is 512.
    """
    bert_type = "bert-base-cased"
    return BertTokenizer.from_pretrained(bert_type, model_max_length=512)

def tokenize_function(batch, tokenizer):
    """Tokenize the ``transcription`` entries of ``batch``.

    Args:
        batch: Mapping with a ``"transcription"`` entry holding the texts.
        tokenizer: Callable tokenizer that accepts the texts plus the keyword
            options below.

    Returns:
        Whatever ``tokenizer`` returns for the texts, padded/truncated to 512
        tokens and including the special-tokens mask.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
        "return_special_tokens_mask": True,
    }
    texts = batch["transcription"]
    return tokenizer(texts, **encode_options)

def tokenize_dataset(dataset, tokenizer):
    """Apply ``tokenize_function`` to every batch of ``dataset``.

    The mapping runs in batched mode with one worker per CPU reported by
    ``multiprocessing.cpu_count()`` and removes all original columns from the
    result.

    Args:
        dataset: Object exposing ``column_names`` and ``map``.
        tokenizer: Tokenizer forwarded to ``tokenize_function``.

    Returns:
        The value returned by ``dataset.map``.
    """
    map_options = dict(
        batched=True,
        num_proc=multiprocessing.cpu_count(),
        remove_columns=dataset.column_names,
        fn_kwargs={"tokenizer": tokenizer},
    )
    return dataset.map(tokenize_function, **map_options)

In [31]:
# Load the processed MTSamples CSV and split it into train/validation datasets.
train_ds, val_ds = load_datasets(
        os.path.join(MTSAMPLES_PROCESSED_PATH_DIR, "mtsamples_cleaned.csv")
    )

# Tokenize both splits with the same tokenizer instance.
tokenizer = load_tokenizer()
tokenized_train_ds = tokenize_dataset(train_ds, tokenizer)
tokenized_val_ds = tokenize_dataset(val_ds, tokenizer)

def get_device() -> torch.device:
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")

device = get_device()

     

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

     

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:

#import metrics
import wandb
# Initialize the Weights & Biases run that the trainer reports to (report_to="wandb").
wandb.init(project="nlp", entity="michael", name="test", reinit=True)

#from transformers import metric
from datasets import metric, load_metric

def compute_metrics(eval_pred):
    """Compute masked-token accuracy for masked-language-model evaluation.

    The accuracy is computed directly with numpy, so the function does not
    depend on a separately loaded metric object. Positions whose label is
    ``-100`` are excluded: that is the value the language-modeling collator
    assigns to tokens that were not masked.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            vocabulary logits with the vocabulary on the last axis (or a tuple
            whose first element does); ``labels`` holds the target token ids.

    Returns:
        ``{"accuracy": float}``; ``0.0`` when no position is scored.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return extra outputs; the logits come first.
        predictions = predictions[0]

    # The vocabulary is the last axis, so that is where the arg-max is taken.
    predicted_ids = np.argmax(predictions, axis=-1)
    labels = np.asarray(labels)

    scored = labels != -100
    n_scored = int(scored.sum())
    if n_scored == 0:
        return {"accuracy": 0.0}

    n_correct = int((predicted_ids[scored] == labels[scored]).sum())
    return {"accuracy": n_correct / n_scored}

def load_training_args(output_dir):
    """Build the ``TrainingArguments`` for the masked-LM run.

    Evaluation and checkpointing both happen once per epoch, at most three
    checkpoints are kept, and the best model (lowest ``loss``) is reloaded at
    the end. Metrics are reported to wandb.

    Args:
        output_dir: Directory the trainer writes its checkpoints to.

    Returns:
        The configured ``TrainingArguments`` instance.
    """
    settings = {
        "output_dir": output_dir,
        "num_train_epochs": 30,
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": TRAIN_BATCH_SIZE,
        "per_device_eval_batch_size": EVAL_BATCH_SIZE,
        "warmup_steps": LR_WARMUP_STEPS,
        "save_total_limit": 3,
        "weight_decay": WEIGHT_DECAY,
        "learning_rate": LEARNING_RATE,
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "load_best_model_at_end": True,
        "metric_for_best_model": "loss",
        "greater_is_better": False,
        "seed": SEED_TRAIN,
        "report_to": "wandb",
    }
    return TrainingArguments(**settings)


def load_trainer(
    model, training_args, train_ds, val_ds, tokenizer, compute_metrics_fn=None
):
    """Create the ``Trainer`` for masked-language-model fine-tuning.

    Args:
        model: Model to train.
        training_args: ``TrainingArguments`` for the run.
        train_ds: Tokenized training dataset.
        val_ds: Tokenized validation dataset.
        tokenizer: Tokenizer used by the data collator and the trainer.
        compute_metrics_fn: Optional metrics callback. When omitted, the
            module-level ``compute_metrics`` is used, so existing five-argument
            calls keep working while six-argument calls are accepted too.

    Returns:
        The configured ``Trainer``.
    """
    # The collator masks 15% of the tokens of every batch.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
    metrics_fn = compute_metrics if compute_metrics_fn is None else compute_metrics_fn

    return Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        compute_metrics=metrics_fn,
    )


def load_model(device):
    """Load the pretrained ``bert-base-cased`` masked-LM model onto ``device``.

    Args:
        device: Target passed to the model's ``.to``.

    Returns:
        The result of moving the loaded ``BertForMaskedLM`` to ``device``.
    """
    pretrained = BertForMaskedLM.from_pretrained("bert-base-cased")
    return pretrained.to(device)


# Build the model, the training arguments and the trainer for the masked-LM run.
model = load_model(device)
training_args = load_training_args(MODEL_UNSUPERVISED_CHECKPOINTS_DIR)
# NOTE(review): six positional arguments are passed here - confirm that
# load_trainer's signature accepts the trailing compute_metrics.
trainer = load_trainer(
        model,
        training_args,
        tokenized_train_ds,
        tokenized_val_ds,
        tokenizer, compute_metrics
)

# Resuming from the last checkpoint is currently disabled:
# last_checkpoint = get_last_checkpoint(training_args.output_dir)
# if last_checkpoint is None:
#     resume_from_checkpoint = None
# else:
#     resume_from_checkpoint = True

trainer.train()#resume_from_checkpoint=resume_from_checkpoint)

    # evaluate
import math

# Perplexity is reported as exp(eval_loss).
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

    #trainer.save_model(MODEL_UNSUPERVISED_MODEL_DIR)
    # torch.cuda.empty_cache()
    #trainer.save_state()



# Keybert

In [1]:
# Sample sentence passed to the keyword-extraction calls below.
text = 'this text is a test, i am sick my dog is sick'

In [2]:
from keybert import KeyBERT
# Build KeyBERT straight from the saved model directory and extract keywords from `text`.
keybert = KeyBERT('models/nlp/semi_supervised/model')
keywords = keybert.extract_keywords(
    text,
    keyphrase_ngram_range=(1, 2),  # candidate phrases of one or two words
    stop_words="english",
    use_maxsum=True,
    nr_candidates=10,
    top_n=5,
    # NOTE(review): use_maxsum and use_mmr are both enabled - confirm which
    # selection strategy KeyBERT applies in that case.
    use_mmr=True,
)


No sentence-transformers model found with name models/nlp/semi_supervised/model. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at models/nlp/semi_supervised/model were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# import transformers
import transformers
from transformers import pipeline
# Same extraction, but with an explicit feature-extraction pipeline as the KeyBERT backend.
hf_model = pipeline('feature-extraction', model='models/nlp/semi_supervised/model')
kw_model = KeyBERT(model=hf_model)
keywords = kw_model.extract_keywords(
    text,
    keyphrase_ngram_range=(1, 2),  # candidate phrases of one or two words
    stop_words="english",
    use_maxsum=True,
    nr_candidates=10,
    top_n=5,
    # NOTE(review): use_maxsum and use_mmr are both enabled - confirm which
    # selection strategy KeyBERT applies in that case.
    use_mmr=True,
)
# Display the extracted keywords as the cell output.
keywords

Some weights of the model checkpoint at models/nlp/semi_supervised/model were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[('test sick', 0.6507),
 ('sick dog', 0.6467),
 ('sick', 0.5757),
 ('text test', 0.5491),
 ('text', 0.3682)]

In [3]:
##imports
import os
import numpy as np
import torch
import pandas as pd
from datasets import load_dataset, load_metric, Dataset
# from src/nlp/constants import (
#     MTSAMPLES_PROCESSED_PATH_DIR,
#     MODEL_SEMI_SUPERVISED_NAME,
#     MODEL_SEMI_SUPERVISED_CHECKPOINTS_DIR,
#     MODEL_SEMI_SUPERVISED_MODEL_DIR,
# )
from transformers import (
    Trainer,
    TrainingArguments,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

from transformers.trainer_utils import get_last_checkpoint
import wandb

#wandb.init(project="nlp", entity="nlp_masterthesis")



## sentence classification model

In [None]:
# map medical specialty to labels
def map_medical_specialty_to_labels(path):
    """Read a CSV and add an integer ``labels`` column for ``medical_specialty``.

    Each distinct specialty gets the index of its first appearance in the
    file (0, 1, 2, ...).

    Args:
        path: Path of the CSV file; it must contain ``medical_specialty``.

    Returns:
        The loaded DataFrame with the extra ``labels`` column.
    """
    df = pd.read_csv(path)
    specialties = df["medical_specialty"].unique()
    label_of = dict(zip(specialties, range(len(specialties))))
    df["labels"] = df["medical_specialty"].map(label_of)
    return df


def load_datasets(data_path):
    """Load the labelled CSV and return train/validation datasets.

    Ten percent of the data is split off first and not returned; the remainder
    is split again 90/10 into train and validation. No seed is passed to
    either split.

    Args:
        data_path: Path of the CSV handed to ``map_medical_specialty_to_labels``.

    Returns:
        ``(dataset_train, dataset_val)``.
    """
    labelled = map_medical_specialty_to_labels(data_path)
    full = Dataset.from_pandas(labelled)
    # Drop the held-out 10% and keep only the remaining "train" part.
    remainder = full.train_test_split(test_size=0.1)["train"]
    train_val = remainder.train_test_split(test_size=0.1)
    return train_val["train"], train_val["test"]


def tokenize_function(batch, tokenizer):
    """Tokenize the ``transcription`` entries of ``batch``.

    Args:
        batch: Mapping with a ``"transcription"`` entry holding the texts.
        tokenizer: Callable tokenizer that accepts the texts plus the keyword
            options below.

    Returns:
        Whatever ``tokenizer`` returns for the texts, padded/truncated to 512
        tokens.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    texts = batch["transcription"]
    return tokenizer(texts, **encode_options)


def tokenize_dataset(dataset, tokenizer):
    """Apply ``tokenize_function`` to ``dataset`` in batched mode.

    Args:
        dataset: Object exposing ``map``.
        tokenizer: Tokenizer forwarded to ``tokenize_function``.

    Returns:
        The value returned by ``dataset.map``.
    """
    map_options = {"fn_kwargs": {"tokenizer": tokenizer}, "batched": True}
    return dataset.map(tokenize_function, **map_options)


def clean_remove_column(tokenized_dataset):
    """Drop the raw CSV columns and switch the dataset to the ``torch`` format.

    Args:
        tokenized_dataset: Object exposing ``remove_columns`` and ``set_format``.

    Returns:
        The object returned by ``remove_columns``, after ``set_format("torch")``
        has been called on it.
    """
    unused_columns = [
        "Unnamed: 0",
        "description",
        "medical_specialty",
        "sample_name",
        "transcription",
        "keywords",
        "keywords_list",
        "location",
    ]
    cleaned = tokenized_dataset.remove_columns(unused_columns)
    cleaned.set_format("torch")
    return cleaned


def compute_metrics(eval_pred):
    """Compute classification accuracy from logits and reference labels.

    The accuracy is computed directly with numpy, so no metric has to be
    loaded on every evaluation.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            class logits with the classes on axis 1 (or a tuple whose first
            element does); ``labels`` holds the reference class ids.

    Returns:
        ``{"accuracy": float}``; ``0.0`` when there are no labels.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return extra outputs; the logits come first.
        predictions = predictions[0]

    predicted_labels = np.argmax(predictions, axis=1)
    labels = np.asarray(labels)
    if labels.size == 0:
        return {"accuracy": 0.0}
    return {"accuracy": float((predicted_labels == labels).mean())}


def get_device() -> torch.device:
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")


def load_model(device, num_labels=39):
    """Load the sequence-classification model and move it to ``device``.

    Args:
        device: Target passed to the model's ``.to``.
        num_labels: Size of the classification head. Defaults to 39, the value
            that was previously hard-coded; pass the number of distinct
            ``labels`` values when the label mapping is built from other data.

    Returns:
        The result of moving the loaded model to ``device``.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_SEMI_SUPERVISED_NAME, num_labels=num_labels
    )
    return model.to(device)


def load_tokenizer():
    """Load the tokenizer that belongs to ``MODEL_SEMI_SUPERVISED_NAME``.

    Only ``model_max_length`` is passed to the constructor. The former
    ``truncate``/``max_length``/``padding`` keyword arguments are not
    constructor options; truncation and padding are requested at call time
    (see ``tokenize_function``).

    Returns:
        The tokenizer, with ``model_max_length`` set to 512.
    """
    return AutoTokenizer.from_pretrained(
        MODEL_SEMI_SUPERVISED_NAME, model_max_length=512
    )


def load_training_args(output_dir):
    """Build the ``TrainingArguments`` for the classification run.

    Evaluation and checkpointing both happen once per epoch, at most two
    checkpoints are kept, and the best model is reloaded at the end. Metrics
    are reported to wandb.

    Args:
        output_dir: Directory the trainer writes its checkpoints to.

    Returns:
        The configured ``TrainingArguments`` instance.
    """
    settings = {
        "output_dir": output_dir,
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "learning_rate": 2e-5,
        "num_train_epochs": 30,
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 16,
        "warmup_steps": 500,
        "weight_decay": 0.01,
        "save_total_limit": 2,
        "load_best_model_at_end": True,
        "report_to": "wandb",
    }
    return TrainingArguments(**settings)


def load_trainer(model, training_args, train_ds, val_ds, tokenizer):
    """Create the ``Trainer`` for the classification run.

    Args:
        model: Model to train.
        training_args: ``TrainingArguments`` for the run.
        train_ds: Training dataset.
        val_ds: Validation dataset.
        tokenizer: Tokenizer handed to the trainer.

    Returns:
        A ``Trainer`` that evaluates with the module-level ``compute_metrics``.
    """
    trainer_options = dict(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )
    return Trainer(**trainer_options)


def main():
    """Run the full classification fine-tuning pipeline.

    Loads and tokenizes the processed MTSamples data, builds model and
    trainer, resumes from the newest checkpoint in the output directory when
    one exists, and finally saves the model and the trainer state.
    """
    train_ds, val_ds = load_datasets(
        os.path.join(MTSAMPLES_PROCESSED_PATH_DIR, "mtsamples_cleaned.csv")
    )

    tokenizer = load_tokenizer()
    tokenized_train_ds = clean_remove_column(tokenize_dataset(train_ds, tokenizer))
    tokenized_val_ds = clean_remove_column(tokenize_dataset(val_ds, tokenizer))

    device = get_device()
    model = load_model(device)
    training_args = load_training_args(MODEL_SEMI_SUPERVISED_CHECKPOINTS_DIR)
    trainer = load_trainer(
        model,
        training_args,
        tokenized_train_ds,
        tokenized_val_ds,
        tokenizer,
    )

    # Only look for checkpoints when the directory exists: a first run may
    # not have created the output directory yet.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)

    # None starts from scratch; a path resumes from exactly that checkpoint.
    trainer.train(resume_from_checkpoint=last_checkpoint)

    trainer.save_model(MODEL_SEMI_SUPERVISED_MODEL_DIR)
    trainer.save_state()


# if __name__ == "__main__":
#     main()


In [6]:
# load the trained model and tokenizer and test it on a sample text
def load_model_and_tokenizer(model_dir="models/nlp/semi_supervised/model"):
    """Load the fine-tuned classifier and its tokenizer from one directory.

    Args:
        model_dir: Directory (or model name) both objects are loaded from.
            Defaults to the path that was previously hard-coded twice.

    Returns:
        ``(model, tokenizer)``.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    return model, tokenizer

#test the model on a sample text
def test_model(model, tokenizer):
    text = "I have a headache"
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_idx = logits.argmax(-1)
    print(predicted_class_idx)
    
# Load the saved classifier (this rebinds the global `model` and `tokenizer`) and try it once.
model, tokenizer = load_model_and_tokenizer()
test_model(model, tokenizer)

tensor([22])


## test without keybert

In [None]:
# Encode one sample sentence and run it through a saved token-level model.
sentence = "HuggingFace is a company based in New York,  he is sick , headach but is also has employees working in Paris"
#tokenizer = config.TOKENIZER,config.MAX_LEN
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# Pad to 512 tokens and keep the character offsets of every token.
tokenized = tokenizer.encode_plus(sentence, max_length=512, pad_to_max_length=True,return_offsets_mapping=True, return_token_type_ids=True) #return_tensors="pt"
# NOTE(review): DEVICE is not defined in the visible notebook (an earlier cell
# defines lowercase `device`) - confirm where it comes from.
input_ids = torch.tensor([tokenized["input_ids"]]).to(DEVICE)
attention_mask = torch.tensor([tokenized["attention_mask"]]).to(DEVICE)
token_type_ids = torch.tensor([tokenized["token_type_ids"]]).to(DEVICE)
offsets = torch.tensor([tokenized["offset_mapping"]]).to(DEVICE)
print(offsets)

# NOTE(review): `config` and `NBMEModel` are not defined or imported in the
# visible notebook - confirm their origin before running this cell.
model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
model_config.output_hidden_states = True

# #similarly this can be done for all 5 models
model = NBMEModel(conf=model_config)
# The weights are mapped to the CPU on load, then the model is moved to DEVICE.
model.load_state_dict(torch.load("../models/model_fold1.bin",  map_location=torch.device('cpu')))

model.to(DEVICE)

# Forward pass without gradient tracking.
with torch.no_grad():
    logits = model(ids=input_ids, mask=attention_mask, token_type_ids=token_type_ids) #last_hidden_state

def get_predictions(logits):
    """Turn logits into 0/1 predictions with a 0.5 sigmoid threshold.

    Args:
        logits: Torch tensor of raw scores.

    Returns:
        Numpy array of the same shape holding 1 where ``sigmoid(logit) > 0.5``
        and 0 elsewhere.
    """
    probabilities = torch.sigmoid(logits.detach()).cpu().numpy()
    return np.where(probabilities > 0.5, 1, 0)

def get_prediction_locations(preds, offsets):
    """Collect the offsets of every position predicted as 1.

    Args:
        preds: Sequence of per-example prediction sequences.
        offsets: Sequence of per-example offset sequences, index-aligned with
            ``preds``.

    Returns:
        Flat list with ``offsets[k][i]`` for every ``preds[k][i] == 1``, in
        example and position order.
    """
    return [
        token_offsets[idx]
        for flags, token_offsets in zip(preds, offsets)
        for idx in range(len(flags))
        if flags[idx] == 1
    ]

def get_prediction_keywords(preds, offsets, sentence):
    """Slice ``sentence`` at the offsets of every position predicted as 1.

    Args:
        preds: Sequence of per-example prediction sequences.
        offsets: Sequence of per-example ``(start, end)`` offset sequences,
            index-aligned with ``preds``.
        sentence: Text the offsets refer to.

    Returns:
        List of ``sentence[start:end]`` substrings, one per predicted
        position, in example and position order.
    """
    found = []
    for flags, spans in zip(preds, offsets):
        hits = (idx for idx in range(len(flags)) if flags[idx] == 1)
        for idx in hits:
            start, end = spans[idx][0], spans[idx][1]
            found.append(sentence[start:end])
    return found


# Threshold the logits and slice the predicted spans out of the sentence.
preds = get_predictions(logits)
#offsets = tokenized["offset_mapping"]
keyword = get_prediction_keywords(preds, offsets, sentence)

def get_labels(preds):
    """Return, for every prediction row, the indices whose value equals 1.

    Args:
        preds: Iterable of numpy prediction arrays.

    Returns:
        List with one index array (first ``np.where`` component) per row.
    """
    return [np.where(row == 1)[0] for row in preds]


# Show the input sentence, the extracted substrings and the predicted indices.
labels = get_labels(preds)
print(sentence)
print(keyword)
print(labels)

## Keyword Extraction

In [None]:
#imports
from keybert import KeyBERT
import nltk
from nltk.corpus import stopwords

# nltk.download("stopwords")
# nltk.download("punkt")

# nltk.download("wordnet")
# nltk.download("omw-1.4")
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import transformers
from transformers import pipeline


_KEYBERT_MODEL_DIR = "models/nlp/semi_supervised/model"
_KEYBERT_CACHE = {}


def _get_keybert_model(model_dir=_KEYBERT_MODEL_DIR):
    """Build the KeyBERT extractor for ``model_dir`` once and reuse it afterwards."""
    if model_dir not in _KEYBERT_CACHE:
        # ``model_max_length`` (previously misspelled) caps inputs at 512 tokens.
        tokenizer = AutoTokenizer.from_pretrained(model_dir, model_max_length=512)
        hf_model = pipeline(
            "feature-extraction",
            model=model_dir,
            tokenizer=tokenizer,
        )
        _KEYBERT_CACHE[model_dir] = KeyBERT(model=hf_model)
    return _KEYBERT_CACHE[model_dir]


def KeywordExtraction(text):
    """Extract up to 15 one- or two-word keyphrases from ``text`` with KeyBERT.

    The tokenizer, pipeline and KeyBERT wrapper are created on the first call
    and cached, so applying this function to many rows does not reload the
    model every time.

    Args:
        text: Document to extract keyphrases from.

    Returns:
        The value returned by ``KeyBERT.extract_keywords``.
    """
    kw_model = _get_keybert_model()
    return kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words="english",
        use_maxsum=True,
        nr_candidates=20,
        top_n=15,
        use_mmr=True,
        diversity=0.5,
    )


def apply_keyword_on_Dataframe(df):
    """Add a ``keywords_outcome_weights`` column computed from ``transcription``.

    Args:
        df: DataFrame with a ``transcription`` column. It is modified in place.

    Returns:
        The same DataFrame, so the result can be assigned or chained.
    """
    df["keywords_outcome_weights"] = df["transcription"].apply(KeywordExtraction)
    return df

In [17]:
from keybert import KeyBERT
import nltk
from nltk.corpus import stopwords

# nltk.download("stopwords")
# nltk.download("punkt")

# nltk.download("wordnet")
# nltk.download("omw-1.4")
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import transformers
from transformers import pipeline


from keyphrase_vectorizers import KeyphraseCountVectorizer

#kw_model.extract_keywords(docs=docs, vectorizer=KeyphraseCountVectorizer())


In [None]:
#fine tune sentence transformer model
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import torch
import pandas as pd

# Load the saved model directory as a SentenceTransformer (this rebinds the global `model`).
model = SentenceTransformer('models/nlp/semi_supervised/model')
                

In [23]:
_KEYBERT_MODEL_DIR = "models/nlp/semi_supervised/model"
_KEYBERT_CACHE = {}


def _get_keybert_model(model_dir=_KEYBERT_MODEL_DIR):
    """Build the KeyBERT extractor for ``model_dir`` once and reuse it afterwards."""
    if model_dir not in _KEYBERT_CACHE:
        # ``model_max_length`` (previously misspelled) caps inputs at 512 tokens.
        tokenizer = AutoTokenizer.from_pretrained(model_dir, model_max_length=512)
        hf_model = pipeline(
            "feature-extraction",
            model=model_dir,
            tokenizer=tokenizer,
        )
        _KEYBERT_CACHE[model_dir] = KeyBERT(model=hf_model)
    return _KEYBERT_CACHE[model_dir]


def KeywordExtraction(text):
    """Extract up to 15 one- or two-word keyphrases from ``text`` with KeyBERT.

    The tokenizer, pipeline and KeyBERT wrapper are created on the first call
    and cached, so applying this function to many rows does not reload the
    model every time.

    Args:
        text: Document to extract keyphrases from.

    Returns:
        The value returned by ``KeyBERT.extract_keywords``.
    """
    kw_model = _get_keybert_model()
    return kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words="english",
        use_maxsum=True,
        nr_candidates=20,
        top_n=15,
        use_mmr=True,
        diversity=0.5,
    )




# keep only rows whose transcription is shorter than 512 characters (first 20 of them)
def small_column_df(df, max_chars=512, n_rows=20):
    """Return the first rows whose transcription is shorter than ``max_chars``.

    The length is measured in characters (``str.len``), not in tokens.

    Args:
        df: DataFrame with a string ``transcription`` column.
        max_chars: Exclusive upper bound on the transcription length.
            Defaults to the previously hard-coded 512.
        n_rows: Maximum number of matching rows to keep. Defaults to the
            previously hard-coded 20.

    Returns:
        An independent copy of the selected rows, so adding columns to it
        neither touches ``df`` nor triggers chained-assignment warnings.
    """
    short_enough = df["transcription"].str.len() < max_chars
    return df.loc[short_enough].head(n_rows).copy()




def apply_keyword_on_Dataframe(df):
    """Add a ``keywords_outcome_weights`` column computed from ``transcription``.

    Args:
        df: DataFrame with a ``transcription`` column. It is modified in place.

    Returns:
        The same DataFrame, so the result can be assigned or chained.
    """
    df["keywords_outcome_weights"] = df["transcription"].apply(KeywordExtraction)
    return df
    


In [24]:
# Read the cleaned MTSamples CSV and keep a small subset of short transcriptions.
df_1 = pd.read_csv("data/processed/nlp/mtsamples/mtsamples_cleaned.csv")
df = small_column_df(df_1)


In [21]:
# Preview the first two rows of the subset.
df.head(2)

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords,keywords_list,location
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,mmode leave atrial enlargement leave atrial di...,"cardiovascular / pulmonary, 2-d m-mode, dopple...","['cardiovascular / pulmonary', ' 2-d m-mode', ...","dict_values([[221, 233], [11, 29], [163, 181],..."
9,9,Echocardiogram and Doppler,Cardiovascular / Pulmonary,2-D Echocardiogram - 4,description normal cardiac chamber size normal...,"cardiovascular / pulmonary, ejection fraction,...","['cardiovascular / pulmonary', ' ejection frac...","dict_values([[97, 114], [76, 96], [282, 295], ..."


In [25]:
# Run keyword extraction on every transcription of the subset.
# NOTE(review): df_f.head(2) only works if apply_keyword_on_Dataframe returns
# the DataFrame - confirm its return value.
df_f = apply_keyword_on_Dataframe(df)
df_f.head(2)

Some weights of the model checkpoint at models/nlp/semi_supervised/model were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


KeyboardInterrupt: 

In [None]:
#https://maartengr.github.io/KeyBERT/guides/countvectorizer.html#custom-backend