In [None]:
!pip install -U accelerate
!pip install -U transformers seqeval[gpu]

In [17]:
!pip install datasets evaluate seqeval[gpu]

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

In [None]:
from pathlib import Path
import re

def read_data(file_path):
    file_path = Path(file_path)

    raw_text = file_path.read_text().strip()
    #print(raw_text[:100])
    raw_docs = re.split(r'\n', raw_text)
    print(raw_docs[:100])
    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            if len(line) < 3:
              continue
            token, tag, sentence= line.split('\t')
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)

    return token_docs, tag_docs

texts, tags = read_data('output.tsv')

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

In [None]:
from sklearn.model_selection import train_test_split

train_text,val_text, train_tags,  val_tags = train_test_split(texts,tags, test_size=0.2, random_state=42)

In [None]:
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [16]:
label_list = list(unique_tags)

In [None]:
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-uncased')

#train_texts = train_data['Token'].tolist()
#val_texts = val_data['Token'].tolist()


train_encodings = tokenizer(train_text, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_text, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

In [None]:
import numpy as np

def encode_tags(tags, encodings):
  print(tags[:10])
  labels = [[tag2id[tag] for tag in doc] for doc in tags]
  encoded_labels = []
  for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
    doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
    arr_offset = np.array(doc_offset)

    # set labels whose first offset position is 0 and the second is not 0
    doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
    encoded_labels.append(doc_enc_labels.tolist())

  return encoded_labels

#train_tags = train_data['Tag'].tolist()
#val_tags = val_data['Tag'].tolist()

train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

In [None]:
import torch

class MEDOCCANDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = MEDOCCANDataset(train_encodings, train_labels)
val_dataset = MEDOCCANDataset(val_encodings, val_labels)

In [None]:
from transformers import BertForTokenClassification
model = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(unique_tags))
model = model.to(device)

In [None]:
from sklearn.metrics import classification_report

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Flatten the predictions and labels for sklearn's classification_report
    labels_flat = [label for sublist in labels for label in sublist]
    preds_flat = [pred for sublist in preds for pred in sublist]

    # Generate classification report
    classification_rep = classification_report(labels_flat, preds_flat, output_dict=True)

    # Calculate overall metrics
    overall_metrics = {
        "accuracy": classification_rep['accuracy'],
        "precision": classification_rep['macro avg']['precision'],
        "recall": classification_rep['macro avg']['recall'],
        "f1_score": classification_rep['macro avg']['f1-score']
    }

    return {
        "overall": overall_metrics
    }


In [18]:
import evaluate

seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from sklearn.model_selection import KFold
from transformers import Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup

# Defining test dataset
test_texts, test_tags = read_data('test.tsv')
test_encodings = tokenizer(test_texts, is_split_into_words=True,
                          return_offsets_mapping=True, padding=True,
                           truncation=True)
test_labels = encode_tags(test_tags, test_encodings)
test_dataset = MEDOCCANDataset(test_encodings, test_labels)  # Replace with your actual test dataset

# Initialize an empty list to store predictions
all_test_predictions = []

k = 3  # Number of folds
kf = KFold(n_splits=k, shuffle=True)

# Initialize your model, tokenizer, datasets, and other components

for fold, (train_index, eval_index) in enumerate(kf.split(train_encodings)):
    print(f"Training fold {fold + 1}/{k}")

    # Split data into train and eval for this fold
    train_inputs_fold = [train_encodings[i] for i in train_index]
    train_labels_fold = [train_labels[i] for i in train_index]
    eval_inputs_fold = [val_encodings[i] for i in eval_index]
    eval_labels_fold = [val_labels[i] for i in eval_index]

    # Instantiate your Trainer and TrainingArguments for this fold
    training_args_fold = TrainingArguments(
        output_dir=f'./results_fold_{fold}',  # Directory for results
        num_train_epochs=3,                  # Total number of training epochs
        per_device_train_batch_size=12,       # Batch size per GPU
        logging_dir=f'./logs_fold_{fold}',    # Directory for storing logs
        save_strategy = "epoch",                       # Save model checkpoint every 500 steps
        evaluation_strategy="epoch",          # Evaluate at the end of each epoch
        logging_steps=100,                    # Log metrics every 100 steps
        learning_rate=3e-5,                   # Learning rate
        gradient_accumulation_steps=1,        # Number of updates steps before backward pass
        weight_decay=0.0,                     # Weight decay (if applicable)
        adam_beta1=0.9,                       # AdamW beta1
        adam_beta2=0.999,                     # AdamW beta2
        adam_epsilon=1e-8,                    # AdamW epsilon
        max_grad_norm=1.0,                    # Gradient clipping threshold
        warmup_steps=500,                     # Number of warmup steps for the scheduler
        load_best_model_at_end=True,          # Load the best model when training ends
        metric_for_best_model='eval_loss',    # Metric to use to determine the best model
        greater_is_better=False               # Indicate if higher metric values are better
    )

    # Instantiate AdamW optimizer and scheduler for this fold
    optimizer_fold = AdamW(model.parameters(), lr=training_args_fold.learning_rate,
                           betas=(training_args_fold.adam_beta1, training_args_fold.adam_beta2),
                           eps=training_args_fold.adam_epsilon)
    num_training_steps_fold = len(train_inputs_fold) // (training_args_fold.per_device_train_batch_size *
                                                        training_args_fold.gradient_accumulation_steps) * training_args_fold.num_train_epochs
    scheduler_fold = get_linear_schedule_with_warmup(optimizer_fold, num_warmup_steps=training_args_fold.warmup_steps,
                                                    num_training_steps=num_training_steps_fold)

    # Initialize Trainer for this fold
    trainer_fold = Trainer(
        model=model,
        args=training_args_fold,
        optimizers=(optimizer_fold,scheduler_fold),
        train_dataset= MEDOCCANDataset(train_encodings, train_labels),
        eval_dataset= MEDOCCANDataset(val_encodings, val_labels),
        compute_metrics=compute_metrics
    )

    # Train the model for this fold
    trainer_fold.train()

    # Evaluate the model for this fold
    evaluation_result = trainer_fold.evaluate()
    print(f"Evaluation result for fold {fold + 1}/{k}:")
    print(evaluation_result)
    test_predictions = trainer_fold.predict(test_dataset)
    all_test_predictions.append(test_predictions)
    model.save_pretrained(f"./model/best_model_fold_{fold + 1}")


['Token\tTag\tSentence #', 'Datos\tO\tSentence 1', 'del\tO\tSentence 2', 'paciente\tO\tSentence 2', '.\tO\tSentence 2', '"', '"\tO\tSentence 2', 'Nombre\tO\tSentence 2', ':\tO\tSentence 3', ' \tO\tSentence 3', 'Ignacio\tB-NOMBRE_SUJETO_ASISTENCIA\tSentence 3', '.\tO\tSentence 3', '"', '"\tO\tSentence 3', 'Apellidos\tO\tSentence 3', ':\tO\tSentence 4', 'Rico\tB-NOMBRE_SUJETO_ASISTENCIA\tSentence 4', 'Pedroza\tI-NOMBRE_SUJETO_ASISTENCIA\tSentence 4', '.\tO\tSentence 4', '"', '"\tO\tSentence 4', 'NHC\tO\tSentence 4', ':\tO\tSentence 5', '5467980\tB-ID_SUJETO_ASISTENCIA\tSentence 5', '.\tO\tSentence 5', '"', '"\tO\tSentence 5', 'Domicilio\tO\tSentence 5', ':\tO\tSentence 6', 'Av.\tB-CALLE\tSentence 6', 'Beniarda\tI-CALLE\tSentence 6', ',\tI-CALLE\tSentence 6', '13\tI-CALLE\tSentence 6', '.\tO\tSentence 6', '"', '"\tO\tSentence 6', 'Localidad/\tO\tSentence 6', 'Provincia\tO\tSentence 7', ':\tO\tSentence 7', 'Valencia\tB-TERRITORIO\tSentence 7', '.\tO\tSentence 7', '"', '"\tO\tSentence 7', '



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.5553,0.665285,0.235167,0.14763,0.18139,0.888296
2,0.5796,0.665285,0.235167,0.14763,0.18139,0.888296
3,0.5763,0.665285,0.235167,0.14763,0.18139,0.888296


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Evaluation result for fold 1/3:
{'eval_loss': 0.6652845144271851, 'eval_precision': 0.23516720604099245, 'eval_recall': 0.14762979683972913, 'eval_f1': 0.1813895437526002, 'eval_accuracy': 0.8882963472676185, 'eval_runtime': 103.9491, 'eval_samples_per_second': 524.026, 'eval_steps_per_second': 65.503, 'epoch': 3.0}




Training fold 2/3




Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.5042,0.609359,0.717272,0.147178,0.24424,0.926391
2,0.5117,0.609359,0.717272,0.147178,0.24424,0.926391
3,0.5405,0.609359,0.717272,0.147178,0.24424,0.926391


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Evaluation result for fold 2/3:
{'eval_loss': 0.6093586683273315, 'eval_precision': 0.7172717271727173, 'eval_recall': 0.1471783295711061, 'eval_f1': 0.24424049447462073, 'eval_accuracy': 0.9263910392415468, 'eval_runtime': 104.1898, 'eval_samples_per_second': 522.815, 'eval_steps_per_second': 65.352, 'epoch': 3.0}




Training fold 3/3




Epoch,Training Loss,Validation Loss


In [None]:
# Calculate aggregated predictions on the test set
final_test_predictions = np.argmax(np.mean([pred.predictions for pred in all_test_predictions], axis=0), axis=-1)

# True labels for the test set
true_labels_test = [label for sublist in test_labels for label in sublist]

# Calculate metrics for the test set
test_classification_report = classification_report(true_labels_test, final_test_predictions)
test_confusion_matrix = confusion_matrix(true_labels_test, final_test_predictions)

# Print or use the classification report and confusion matrix
print("Test Classification Report:")
print(test_classification_report)

print("Test Confusion Matrix:")
print(test_confusion_matrix)

In [None]:
from transformers import Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup, AutoConfig, AutoModelForTokenClassification


# After saving, you can upload your model to the Hugging Face Model Hub using the following command in the terminal
!transformers-cli login  # Log in to your Hugging Face account
!transformers-cli repo create NER-BERT-MEDOCCAN-KFold  # Create a new repository for your model
!transformers-cli push './model/'  # Push your saved model to the Hub


In [None]:
'''
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()
'''