# imports

In [1]:
import sys
import os

os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import numpy as np


# Getting the environment where this notebook is running
if 'KAGGLE_URL_BASE' in os.environ:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    HUGGINGFACE_API_KEY = user_secrets.get_secret("HUGGINGFACE_API_KEY")
elif 'google.colab' in sys.modules:
    !pip -q install python-dotenv
    from dotenv import load_dotenv
    load_dotenv()
    HUGGINGFACE_API_KEY = os.getenv('HUGGINGFACE_API_KEY')

else:
    from dotenv import load_dotenv
    load_dotenv()
    HUGGINGFACE_API_KEY = os.getenv('HUGGINGFACE_API_KEY')

In [2]:
!pip uninstall datasets --yes
!pip install datasets==2.16.1

Found existing installation: datasets 2.1.0
Uninstalling datasets-2.1.0:
  Successfully uninstalled datasets-2.1.0
Collecting datasets==2.16.1
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow-hotfix (from datasets==2.16.1)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets==2.16.1)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2023.10.0-py3-none-any.whl (166 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.4/166.4 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Installing collected packages: pyarrow-hotfix, fsspec, datasets
  Attempti

In [3]:
# !pip install -q --upgrade datasets

!pip install -q transformers #evaluate seqeval

In [4]:
from datasets import load_dataset

# HuggingFace Login

In [5]:
from huggingface_hub import notebook_login
from huggingface_hub import login

login(token=HUGGINGFACE_API_KEY, write_permission=True)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful



# Loading dataset

In [None]:
ner_data = load_dataset("SKT27182/NER_processed_data")

In [7]:
ner_data

DatasetDict({
    train: Dataset({
        features: ['id', 'tags', 'text', 'dataset_num', 'tokens', 'ner_tags'],
        num_rows: 15766
    })
    test: Dataset({
        features: ['id', 'tags', 'text', 'dataset_num', 'tokens', 'ner_tags'],
        num_rows: 3943
    })
})

# Tokenizer

- Fine-tuning distilbert-base-uncased for NER

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

- After tokenizing there comes few extra tokens, plus few words get splitted to one or more sub-words. So tokenizing them as -100, So it will be ignored in loss.

In [9]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(
            batch_index=i
        )  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif (
                word_idx != previous_word_idx
            ):  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
tokenized_ner = ner_data.map(tokenize_and_align_labels, batched=True)

In [11]:
tokenized_ner

DatasetDict({
    train: Dataset({
        features: ['id', 'tags', 'text', 'dataset_num', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 15766
    })
    test: Dataset({
        features: ['id', 'tags', 'text', 'dataset_num', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3943
    })
})

## Model

In [12]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

2024-02-18 10:14:32.831650: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-18 10:14:32.831752: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-18 10:14:32.952721: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [13]:
# import evaluate

# seqeval = evaluate.load("seqeval")

In [14]:
import numpy as np


def f1_score_numpy(true_labels, predicted_labels):
    # Flatten the arrays to handle multi-dimensional inputs
    true_labels = np.array(true_labels)
    predicted_labels = np.array(predicted_labels)

    true_labels_flat = true_labels.flatten()
    predicted_labels_flat = predicted_labels.flatten()

    # ignoring -100 which is for sub-words and 0 which labels the all other categories
    ignore_mask = (
        (true_labels_flat != -100)
        & (predicted_labels_flat != -100)
        & (true_labels_flat != 0)
        & (predicted_labels_flat != 0)
    )
    true_labels_flat = true_labels_flat[ignore_mask]
    predicted_labels_flat = predicted_labels_flat[ignore_mask]

    # Calculate true positives, false positives, and false negatives for each class
    classes = np.unique(np.concatenate((true_labels_flat, predicted_labels_flat)))

    true_positives = np.zeros(len(classes))
    false_positives = np.zeros(len(classes))
    false_negatives = np.zeros(len(classes))

    for i, cls in enumerate(classes):
        true_positives[i] = np.sum(
            (true_labels_flat == cls) & (predicted_labels_flat == cls)
        )
        false_positives[i] = np.sum(
            (true_labels_flat != cls) & (predicted_labels_flat == cls)
        )
        false_negatives[i] = np.sum(
            (true_labels_flat == cls) & (predicted_labels_flat != cls)
        )
    # Calculate precision, recall, and F1 score for each class
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    # Avoid division by zero for precision and recall
    precision = np.nan_to_num(precision)
    recall = np.nan_to_num(recall)

    # Calculate F1 score for each class
    f1 = 2 * (precision * recall) / (precision + recall)

    f1 = np.nan_to_num(f1)

    #     Calculate weighted average F1 score
    f1_weighted = np.average(
        f1, weights=np.sum(true_labels_flat.reshape(-1, 1) == classes, axis=0)
    )

    # Calculate weighted average precision
    precision_weighted = np.average(
        precision, weights=np.sum(true_labels_flat.reshape(-1, 1) == classes, axis=0)
    )

    # Calculate weighted average recall
    recall_weighted = np.average(
        recall, weights=np.sum(true_labels_flat.reshape(-1, 1) == classes, axis=0)
    )

    return f1_weighted, precision_weighted, recall_weighted

In [15]:
label_list = ["O", "treatment", "chronic_disease", "cancer", "allergy_name"]

In [16]:
label_list

['O', 'treatment', 'chronic_disease', 'cancer', 'allergy_name']

In [17]:
example = ner_data["train"][0]
labels = [label_list[int(i)] for i in example[f"ner_tags"]]
labels

['O', 'O', 'O', 'cancer', 'cancer']

In [18]:
def compute_metrics(p):
    predictions, labels = p

    predictions = np.argmax(predictions, axis=2)

    #     true_predictions = [
    #         [label_list[int(p)] for (p, l) in zip(prediction, label) if l != -100]
    #         for prediction, label in zip(predictions, labels)
    #     ]

    #     true_labels = [
    #         [label_list[int(l)] for (p, l) in zip(prediction, label) if l != -100]
    #         for prediction, label in zip(predictions, labels)
    #     ]
    #     print(true_predictions)
    # #     print(true_predictions)
    #     results = seqeval.compute(predictions=true_predictions, references=true_labels)

    #     print(results.keys())
    f1, precision, recall = f1_score_numpy(labels, predictions)
    return {
        "weighted: f1 score": f1,
        "weighted: precision score": precision,
        "weighted: recall score": recall,
    }

# Fine-Tuning

In [19]:
id2label = {
    0.0: "O",
    1.0: "treatement",
    2.0: "chronic_disease",
    3.0: "cancer",
    4.0: "allergy_name",
}
label2id = {
    "O": 0.0,
    "treatement": 1.0,
    "chronic_disease": 2.0,
    "cancer": 3.0,
    "allergy_name": 4.0,
}

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=5, id2label=id2label, label2id=label2id
)

## Trainer

In [21]:
def continual_training(model, train_data, test_data, push_to_hub=True):

    training_args = TrainingArguments(
        output_dir=f"Name_Entity_Recognizer_model",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=push_to_hub,
        report_to=None,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    return trainer.model

## Task-1

In [22]:
part_one_data = tokenized_ner.filter(lambda example: example["dataset_num"] == 1)
tuned_model1 = continual_training(
    model, part_one_data["train"], part_one_data["test"], push_to_hub=True
)

Filter:   0%|          | 0/15766 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3943 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Weighted: f1 score,Weighted: precision score,Weighted: recall score
1,No log,0.240756,0.952695,0.955295,0.956541
2,0.299800,0.229832,0.961463,0.962588,0.962971


## Task-2

In [23]:
from datasets import Dataset, concatenate_datasets

part_two_data = tokenized_ner.filter(lambda example: example["dataset_num"] == 2)

# Include only 100 examples from dataset_num=1
part_one_data_subset = part_one_data["train"][:100]

# Convert the dictionary to a dataset
part_one_data_subset = Dataset.from_dict(part_one_data_subset)

# Concatenate the examples from dataset_num=2 and the subset from dataset_num=1
part_two_data_train = concatenate_datasets(
    [part_two_data["train"], part_one_data_subset]
)

# Continue training the model with the combined dataset
tuned_model2 = continual_training(
    tuned_model1,
    part_two_data_train,
    concatenate_datasets([part_two_data["test"], part_one_data["test"]]),
)

Filter:   0%|          | 0/15766 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3943 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Weighted: f1 score,Weighted: precision score,Weighted: recall score
1,No log,0.224129,0.961874,0.962457,0.962503
2,0.241900,0.221282,0.961732,0.962024,0.962143


## Task-3

In [24]:
part_three_data = tokenized_ner.filter(lambda example: example["dataset_num"] == 3)

# Include only 100 examples from dataset_num=1,2
part_one_data_subset = part_one_data["train"][:50]
part_two_data_subset = part_two_data["train"][:50]

# Convert the dictionary to a dataset
part_one_data_subset = Dataset.from_dict(part_one_data_subset)
part_two_data_subset = Dataset.from_dict(part_two_data_subset)

# Concatenate the examples from dataset_num=2 and the subset from dataset_num=1
part_three_data_train = concatenate_datasets(
    [part_three_data["train"], part_one_data_subset, part_two_data_subset]
)

# Continue training the model with the combined dataset
tuned_model3 = continual_training(
    tuned_model2,
    part_three_data_train,
    concatenate_datasets(
        [part_three_data["test"], part_two_data["test"], part_one_data["test"]]
    ),
)

Filter:   0%|          | 0/15766 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3943 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Weighted: f1 score,Weighted: precision score,Weighted: recall score
1,No log,0.209861,0.967848,0.968175,0.967976
2,0.227900,0.209976,0.968554,0.968709,0.968656


# Task-4 (Combined dataset)

In [25]:
combined_training = continual_training(
    model, tokenized_ner["train"], tokenized_ner["test"], push_to_hub=True
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Weighted: f1 score,Weighted: precision score,Weighted: recall score
1,0.1984,0.203975,0.971206,0.971306,0.971314
2,0.1496,0.205265,0.971144,0.971251,0.971214


## Loading Fine_Tuned Model

In [None]:
from transformers import pipeline

classifier = pipeline(
    "ner", model="SKT27182/Name_Entity_Recognizer", aggregation_strategy="simple"
)

In [27]:
classifier(
    "Just have the diagnosis of NB in accordance with the International Criteria, i.e., either histopathology (confirmed by the MSKCC Department of Pathology) or BM involvement plus elevated urinary catecholamines"
)

[{'entity_group': 'chronic_disease',
  'score': 0.7396828,
  'word': 'nb',
  'start': 27,
  'end': 29},
 {'entity_group': 'chronic_disease',
  'score': 0.6675972,
  'word': 'urinary cat',
  'start': 186,
  'end': 197},
 {'entity_group': 'chronic_disease',
  'score': 0.3987303,
  'word': '##ola',
  'start': 200,
  'end': 203}]