In [1]:
import warnings
warnings.filterwarnings('ignore')

import transformers
from transformers import pipeline
import pandas as pd
import numpy as np
import torch
from transformers import BertForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt

from datasets import Dataset, load_dataset
from datasets import load_metric
import pandas as pd
import csv

2024-04-26 15:18:11.874063: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
log_names = [
    "drug_disease_procedure_temperature_"
]
temperatures = [0]

In [3]:
list_of_training_data = []
for name in log_names:
    for temperature in temperatures:
        list_of_training_data.append(name+str(temperature)+".tsv")

In [4]:
inputdata_folder_path = ""
test_file_path = ""

In [5]:
# provided by supervisor and modified by student
output = []
output_test = []

for train_file in list_of_training_data:
    with open(inputdata_folder_path+train_file, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter="\t")
        output += list(reader)
        
with open(test_file_path, 'r') as csvfile:
    reader2 = csv.reader(csvfile, delimiter="\t")
    output_test = list(reader2)
     

output = [row for row in output]
output_test = [row for row in output_test]

print("len output:", len(output))
print("len output:test:", len(output_test))

len output: 43020
len output:test: 41201


In [6]:
# provided by supervisor and modified by student
total = []
temporary = []
      
for entry in output:
    if len(entry) == 2:
        text = entry[0]
        label = entry[1]
        if label == '0':
            label = 'O'
        if label in ['SMOKING', 'DISEASE', 'Smoking', 'Disease']:
            label = 'O'
        if text not in ['SEP', 'CLS']:
            temporary.append([text, label])
    if len(entry) < 2:
        total.append(temporary)
        temporary = []
        
total_test = []
temporary_test = []

for entry in output_test:
    if len(entry) == 2:
        text = entry[0]
        label = entry[1]
        if label in ['0', 'BREAST']:
            label = 'O'
        if label in ['FAMILY']:
            label = 'O'
        if text not in ['SEP', 'CLS']:
            temporary_test.append([text, label])
    if len(entry) < 2:
        total_test.append(temporary_test)
        temporary_test = []

In [7]:
# provided by supervisor
labels_ = []
labels_test_ = []

for x in total:
    for y in x:
        label_ = y[1]
        labels_.append(label_)
        
for x in total_test:
    for y in x:
        label_test = y[1]
        labels_test_.append(label_test)

In [8]:
# provided by supervisor
nplabels = np.array(labels_)
nplabels_test = np.array(labels_test_)

unique_values = np.unique(nplabels, return_counts=True)
unique_values_test = np.unique(nplabels_test, return_counts=True)

In [9]:
# provided by supervisor
counts_ = unique_values[1]
names_ = unique_values[0]

counts_test_ = unique_values_test[1]
names_test_ = unique_values_test[0]

print("train:")
for k in range(len(counts_)):
    print(names_[k], "-", counts_[k])

print()
print("test:")
for k in range(len(counts_test_)):
    print(names_test_[k], "-", counts_test_[k])

train:
DRUG - 398
O - 40819
PROCEDURE - 1305

test:
DRUG - 656
O - 39061
PROCEDURE - 1184


In [10]:
# provided by supervisor
label_list = list(names_)
label_list_test = list(names_test_)

label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

label2id_test = {label: i for i, label in enumerate(label_list_test)}
id2label_test = {i: label for i, label in enumerate(label_list_test)}

print(label_list)
print(label_list_test)

['DRUG', 'O', 'PROCEDURE']
['DRUG', 'O', 'PROCEDURE']


In [11]:
# provided by supervisor and modified by student
split_85 = int(len(total) * 0.85)
validation_total = total[split_85:]
training_total = total[:split_85]
test_total = total_test

In [12]:
# provided by supervisor
from datasets import Dataset
import pandas as pd

def get_all_tokens_and_ner_tags(texts):
    return pd.concat([get_tokens_and_ner_tags(texts)]).reset_index().drop('index', axis=1)
    
def get_tokens_and_ner_tags(texts):
    all_tokens = []
    all_entities = []
    for row in texts:
        tokens = [x[0] for x in row]
        entities = [x[1] for x in row]
        
        all_tokens.append(tokens)
        all_entities.append(entities)
    return pd.DataFrame({'tokens': all_tokens, 'ner_tags': all_entities})

def create_sliding_windows(train_dataset, max_length=512, stride=512):
    windowed_dataset = []
    for data in train_dataset:
        tokens = data["tokens"]
        ner_tags = data["ner_tags"]
        for i in range(0, len(tokens), stride):
            window_tokens = tokens[i:i+max_length]
            window_ner_tags = ner_tags[i:i+max_length]
            windowed_data = {"tokens": window_tokens, "ner_tags": window_ner_tags}
            windowed_dataset.append(windowed_data)
    windowss=pd.DataFrame(windowed_dataset)
    return Dataset.from_pandas(windowss)

def get_un_token_dataset(train_directory, test_directory, validation_directory):
    train_df = get_all_tokens_and_ner_tags(train_directory)
    test_df = get_all_tokens_and_ner_tags(test_directory)
    validation_df = get_all_tokens_and_ner_tags(validation_directory)
    
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)
    validation_dataset = Dataset.from_pandas(validation_df)
    
    # Apply sliding window on train_dataset
    train_dataset = create_sliding_windows(train_dataset)
    test_dataset = create_sliding_windows(test_dataset)
    validation_dataset = create_sliding_windows(validation_dataset)
    
    return (train_dataset, test_dataset, validation_dataset)

train_dataset, test_dataset, validation_dataset = get_un_token_dataset(training_total, test_total, validation_total)


In [13]:
# provided by supervisor
print(len(train_dataset))
print(len(test_dataset))
print(len(validation_dataset))

423
300
75


## BERT model with GPT annotations

In [14]:
# provided by supervisor and modified by student
model = BertForTokenClassification.from_pretrained('estmedBERT', num_labels=len(label_list), id2label=id2label, label2id=label2id)
tokenizer = AutoTokenizer.from_pretrained('estmedBERT', model_max_length=512,truncation=True)

Some weights of the model checkpoint at estmedBERT were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model 

In [15]:
# provided by supervisor
task = "ner"

def tokenize_and_align_labels(examples):
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label[word_idx] == '0':
                label_ids.append(0)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(label2id[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
        
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

print(label2id)

train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)
validation_tokenized_datasets = validation_dataset.map(tokenize_and_align_labels, batched=True)


{'DRUG': 0, 'O': 1, 'PROCEDURE': 2}


Map:   0%|          | 0/423 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

In [16]:
# provided by supervisor
batch_size = 16

args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=8,
    weight_decay=0.01
)

data_collator = DataCollatorForTokenClassification(tokenizer)


metric = load_metric("seqeval")
results_collection = []


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    

    results = metric.compute(predictions=true_predictions, references=true_labels)
    results_collection.append(results)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}
    


trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=validation_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.evaluate()
trainer.save_model('estmedbert_finetune_gpt_annot.model')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.325749,0.0,0.0,0.0,0.90828
2,No log,0.253057,0.111111,0.028169,0.044944,0.913806
3,No log,0.216144,0.128205,0.070423,0.090909,0.921854
4,No log,0.213673,0.157407,0.079812,0.105919,0.922823
5,No log,0.2036,0.169697,0.131455,0.148148,0.925829
6,No log,0.200819,0.162562,0.15493,0.158654,0.925732
7,No log,0.206159,0.152284,0.140845,0.146341,0.925247
8,No log,0.209861,0.158163,0.14554,0.151589,0.924569


In [17]:
# provided by supervisor
predictions, labels, _ = trainer.predict(test_tokenized_datasets)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list_test[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'ROCEDURE': {'precision': 0.11688311688311688,
  'recall': 0.04770318021201413,
  'f1': 0.06775407779171895,
  'number': 566},
 'RUG': {'precision': 0.1837837837837838,
  'recall': 0.19767441860465115,
  'f1': 0.19047619047619047,
  'number': 344},
 'overall_precision': 0.15806988352745424,
 'overall_recall': 0.1043956043956044,
 'overall_f1': 0.1257445400397088,
 'overall_accuracy': 0.9562650585433897}