# imports

In [1]:
import sys
import os

import pandas as pd
import numpy as np


# Getting the environment where this notebook is running
if 'KAGGLE_URL_BASE' in os.environ:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    HUGGINGFACE_API_KEY = user_secrets.get_secret("HUGGINGFACE_API_KEY")
elif 'google.colab' in sys.modules:
    !pip -q install python-dotenv
    from dotenv import load_dotenv
    load_dotenv()
    HUGGINGFACE_API_KEY = os.getenv('HUGGINGFACE_API_KEY')

else:
    from dotenv import load_dotenv
    load_dotenv()
    HUGGINGFACE_API_KEY = os.getenv('HUGGINGFACE_API_KEY')

In [2]:
!pip install -q --upgrade datasets

!pip install -q transformers evaluate seqeval

from datasets import load_dataset

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 23.8.0 requires pandas<1.6.0dev0,>=1.3, but you have pandas 2.0.3 which is incompatible.
cudf 23.8.0 requires protobuf<5,>=4.21, but you have protobuf 3.20.3 which is incompatible.
cuml 23.8.0 requires dask==2023.7.1, but you have dask 2023.12.1 which is incompatible.
cuml 23.8.0 requires distributed==2023.7.1, but you have distributed 2023.12.1 which is incompatible.
dask-cuda 23.8.0 requires dask==2023.7.1, but you have dask 2023.12.1 which is incompatible.
dask-cuda 23.8.0 requires distributed==2023.7.1, but you have distributed 2023.12.1 which is incompatible.
dask-cuda 23.8.0 requires pandas<1.6.0

# HuggingFace Login

In [3]:
from huggingface_hub import notebook_login
from huggingface_hub import login
login(token=HUGGINGFACE_API_KEY, write_permission=True)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Loading dataset

In [4]:
ner_data = load_dataset("SKT27182/NER_processed_data")

Downloading readme:   0%|          | 0.00/627 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/2.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/569k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15766 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3943 [00:00<?, ? examples/s]

In [5]:
ner_data

DatasetDict({
    train: Dataset({
        features: ['id', 'tags', 'text', 'dataset_num', 'tokens', 'ner_tags'],
        num_rows: 15766
    })
    test: Dataset({
        features: ['id', 'tags', 'text', 'dataset_num', 'tokens', 'ner_tags'],
        num_rows: 3943
    })
})

# Tokenizer

- Using distilber for fine-tuning

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

- After tokenizing there comes few extra tokens, plus few words get splitted to one or more sub-words. So tokenizing them as -100, So it will be ignored in loss.

In [7]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [8]:
tokenized_ner = ner_data.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/15766 [00:00<?, ? examples/s]

Map:   0%|          | 0/3943 [00:00<?, ? examples/s]

In [9]:
tokenized_ner

DatasetDict({
    train: Dataset({
        features: ['id', 'tags', 'text', 'dataset_num', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 15766
    })
    test: Dataset({
        features: ['id', 'tags', 'text', 'dataset_num', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3943
    })
})

## Model

In [10]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [11]:
import evaluate

seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [12]:
label_list = ["O", "treatment", "chronic_disease", "cancer", "allergy_name"]

In [13]:
example = ner_data["train"][0]
labels = [label_list[int(i)] for i in example[f"ner_tags"]]
labels

['O', 'O', 'O', 'cancer', 'cancer']

In [14]:
import numpy as np

# labels = [label_list[int(i)] for i in ner_[f"ner_tags"]]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[int(p)] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [label_list[int(l)] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [15]:
# compute_metrics(())

# Fine-Tuning

In [16]:
id2label = {
    0.0: "O",
    1.0: "treatement",
    2.0: "chronic_disease",
    3.0: "cancer",
    4.0: "allergy_name",
}
label2id = {
    "O": 0.0,
    "treatement": 1.0,
    "chronic_disease": 2.0,
    "cancer": 3.0,
    "allergy_name": 4.0,
}

In [17]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=5, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Trainer

In [18]:
def continual_training(model, train_data, test_data, output_dir, push_to_hub=True):
    
    training_args = TrainingArguments(
    output_dir=f"Name_Entity_Recognizer_model{output_dir}",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=push_to_hub,
)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    

    trainer.train()
    
    return trainer.model
    


## Task-1

In [19]:
part_one_data = tokenized_ner.filter(lambda example: example['dataset_num'] == 1)
tuned_model1 = continual_training(model, part_one_data["train"], part_one_data["test"], output_dir=1)

Filter:   0%|          | 0/15766 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3943 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.249577,0.644429,0.693924,0.668261,0.917317
2,0.301800,0.233202,0.665049,0.7329,0.697328,0.923742


  _warn_prf(average, modifier, msg_start, len(result))


## Task-2

In [22]:
from datasets import Dataset, concatenate_datasets

part_two_data = tokenized_ner.filter(lambda example: example['dataset_num'] == 2)

# Include only 100 examples from dataset_num=1
part_one_data_subset = part_one_data["train"][:100]

# Convert the dictionary to a dataset
part_one_data_subset = Dataset.from_dict(part_one_data_subset)

# Concatenate the examples from dataset_num=2 and the subset from dataset_num=1
part_two_data_train = concatenate_datasets([part_two_data["train"], part_one_data_subset])

# Continue training the model with the combined dataset
tuned_model2 = continual_training(tuned_model1, 
                                  part_two_data_train, 
                                  concatenate_datasets([part_two_data["test"], part_one_data["test"]]), 
                                  output_dir=2)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.222619,0.683535,0.734871,0.708274,0.924117
2,0.243600,0.22095,0.67974,0.741376,0.709221,0.924853




## Task-3

In [23]:
part_three_data = tokenized_ner.filter(lambda example: example['dataset_num'] == 3)

# Include only 100 examples from dataset_num=1,2
part_one_data_subset = part_one_data["train"][:50]
part_two_data_subset = part_two_data["train"][:50]

# Convert the dictionary to a dataset
part_one_data_subset = Dataset.from_dict(part_one_data_subset)
part_two_data_subset = Dataset.from_dict(part_two_data_subset)

# Concatenate the examples from dataset_num=2 and the subset from dataset_num=1
part_three_data_train = concatenate_datasets([part_three_data["train"], part_one_data_subset, part_two_data_subset])

# Continue training the model with the combined dataset
tuned_model3 = continual_training(tuned_model2, 
                                  part_three_data_train, 
                                  concatenate_datasets([part_three_data["test"], part_two_data["test"], part_one_data["test"]]),
                                 output_dir=3)


Filter:   0%|          | 0/15766 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3943 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.208426,0.693394,0.747595,0.719475,0.926649
2,0.228000,0.212323,0.690033,0.757616,0.722247,0.926519




# Task-4 (Combined dataset)

In [24]:
combined_training = continual_training(model, 
                                       tokenized_ner["train"], 
                                       tokenized_ner["test"], 
                                       push_to_hub=True,
                                      output_dir="")

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2087,0.205802,0.708297,0.767638,0.736775,0.930662
2,0.1547,0.205588,0.717994,0.765099,0.740798,0.931151




## Loading Fine_Tuned Model

In [27]:
from transformers import pipeline

classifier = pipeline("ner", model="SKT27182/Name_Entity_Recognizer")

config.json:   0%|          | 0.00/811 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [28]:
classifier("ust have the diagnosis of NB in accordance with the International Criteria, i.e., either histopathology (confirmed by the MSKCC Department of Pathology) or BM involvement plus elevated urinary catecholamines")

[{'entity': 'chronic_disease',
  'score': 0.67050254,
  'index': 7,
  'word': 'n',
  'start': 26,
  'end': 27},
 {'entity': 'chronic_disease',
  'score': 0.7304839,
  'index': 8,
  'word': '##b',
  'start': 27,
  'end': 28},
 {'entity': 'chronic_disease',
  'score': 0.7906186,
  'index': 43,
  'word': 'ur',
  'start': 185,
  'end': 187},
 {'entity': 'chronic_disease',
  'score': 0.7500598,
  'index': 44,
  'word': '##ina',
  'start': 187,
  'end': 190},
 {'entity': 'chronic_disease',
  'score': 0.8246452,
  'index': 45,
  'word': '##ry',
  'start': 190,
  'end': 192},
 {'entity': 'chronic_disease',
  'score': 0.7856513,
  'index': 46,
  'word': 'cat',
  'start': 193,
  'end': 196},
 {'entity': 'chronic_disease',
  'score': 0.61697036,
  'index': 47,
  'word': '##ech',
  'start': 196,
  'end': 199},
 {'entity': 'chronic_disease',
  'score': 0.5686747,
  'index': 48,
  'word': '##ola',
  'start': 199,
  'end': 202},
 {'entity': 'chronic_disease',
  'score': 0.5305861,
  'index': 49,
  'w

- Above it is assigning entity to each of the tokenized word, which may not be in the exact form as that in the inout text.