# **GenoVarDis@IberLEF2024: Automatic Genomic Variants and Related Diseases using Named Entity Recognition with Large Language Models**

## Autor: Víctor Manuel Oliveros Villena


In [None]:
# Root of the mounted Google Drive.
# To work from a different location, append the extra directories to this path.
path_drive = "/content/drive/MyDrive"

### **Configuración del entorno**

In [None]:
!pip install transformers datasets evaluate seqeval accelerate

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [None]:
import numpy as np
import math
from google.colab import drive
import pandas as pd
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
import evaluate
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import pipeline
from transformers import BasicTokenizer
from datasets import Dataset, DatasetDict

In [None]:
# Mount Google Drive so that the files under path_drive are reachable from this runtime.
drive.mount('/content/drive')

Mounted at /content/drive


## **Preprocesado**


### **Lectura de los archivos**

In [None]:
# Locations of the training texts and of their annotations (TSV files on Google Drive)
path_train_text = f"{path_drive}/GenoVarDis/Data/train_text.tsv"
path_train_annotation = f"{path_drive}/GenoVarDis/Data/train_annotation.tsv"

# Load both tab-separated files into pandas DataFrames
train_text, train_annotation = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (path_train_text, path_train_annotation)
)

### **Diccionarios**

In [None]:
# One row per entity type, in tag-id order:
# (label as written in the annotation file, entity name used inside the BIO tags).
_ENTITY_TYPES = (
    ("Disease", "disease"),
    ("Gene", "gene"),
    ("DNAMutation", "DNAMutation"),
    ("SNP", "SNP"),
    ("DNAAllele", "DNAAllele"),
    ("NucleotideChange-BaseChange", "NucleotideChange-BaseChange"),
    ("OtherMutation", "OtherMutation"),
    ("Transcript", "Transcript"),
)

# Annotation label -> id of its "B-" tag (odd numbers); the matching "I-" tag is that id + 1
idlabel = {
    annotation_label: 2 * position + 1
    for position, (annotation_label, _tag_name) in enumerate(_ENTITY_TYPES)
}

# Tag id -> BIO tag name; id 0 is reserved for tokens outside any entity
id2label = {
    0: "O",
    **{
        2 * position + offset: f"{prefix}-{tag_name}"
        for position, (_annotation_label, tag_name) in enumerate(_ENTITY_TYPES)
        for offset, prefix in ((1, "B"), (2, "I"))
    },
}

# BIO tag name -> tag id (inverse of id2label)
label2id = {tag: tag_id for tag_id, tag in id2label.items()}

In [None]:
from transformers import BasicTokenizer

# Word-level tokenizer used to split both the clinical texts and the annotated spans.
# NOTE(review): BasicTokenizer is built with its defaults, which (per the transformers docs)
# presumably lowercase the text — confirm this is intended for the model fine-tuned later.
basic_tokenizer = BasicTokenizer()
# Disabled experiment: keep only the 'Disease' annotations.
# train_annotation = train_annotation[train_annotation['label'] == 'Disease']
# Accumulator filled in place, one row per clinical case, by tokenize_and_set_ids.
data_df = pd.DataFrame(columns=['id', 'tokens', 'ner_tags'])

def tokenize_and_set_ids(example):
  """
  Tokenize one clinical case and tag its tokens with BIO label ids.

  The annotations of the case are read from the global ``train_annotation`` (rows whose
  ``pmid`` matches), processed in ascending ``offset1`` order, and located in the text by
  token matching: each annotated span is tokenized and assigned to its first occurrence
  that starts after the previously assigned entity. The first token of an entity gets
  ``idlabel[label]`` ("B-" tag) and the remaining ones ``idlabel[label] + 1`` ("I-" tag).
  Spans that cannot be found keep the default tag 0; spans that tokenize to nothing
  are skipped.

  Side effect: appends one row ``{'id', 'tokens', 'ner_tags'}`` to the global ``data_df``.

  Args:
      example (dict): Mapping (e.g. a DataFrame row) with the keys 'pmid' and 'text'.

  Returns:
      pd.DataFrame: The global ``data_df`` accumulator, including the newly added row.

  Raises:
      KeyError: If a matched annotation has a label that is not a key of ``idlabel``.
  """
  global data_df

  pmid = example['pmid']    # Identifier of the clinical case
  text = example['text']    # Text of the clinical case
  tokens = np.array(basic_tokenizer.tokenize(text))   # Word-level tokens of the text
  tags = np.zeros(len(tokens), dtype=int)             # Every token starts as 0 ("O")

  # Annotations of this case, in order of appearance in the text
  case_annotations = train_annotation[train_annotation['pmid'] == pmid].sort_values(by='offset1')
  labels = case_annotations['label'].values.astype(str)  # Entity types
  spans = case_annotations['span'].values.astype(str)    # Entity texts

  # Position of the last token already given to an entity. Entities are assigned strictly
  # left to right, so this single value replaces the list of used positions.
  last_assigned = -1

  for label, span in zip(labels, spans):
    span_tokens = np.array(basic_tokenizer.tokenize(span))
    width = len(span_tokens)
    # A span that yields no tokens has nothing to match; skip it rather than fail on
    # span_tokens[0].
    if width == 0:
      continue

    # Candidate start positions: every token equal to the first token of the entity.
    # Single-token and multi-token entities go through the same path.
    for start in np.where(tokens == span_tokens[0])[0]:
      # Positions at or before the previous entity are not eligible
      if start <= last_assigned:
        continue
      # The whole entity must match, not just its first token
      if np.array_equal(tokens[start:start + width], span_tokens):
        tags[start] = idlabel[label]                          # "B-" tag
        tags[start + 1:start + width] = idlabel[label] + 1    # "I-" tags
        last_assigned = start + width - 1
        break

  # Final record, in the layout later converted to a Hugging Face Dataset
  result = {'id': str(pmid), 'tokens':tokens, 'ner_tags':tags}

  # Append the record to the shared dataframe
  data_df.loc[len(data_df)] = result

  return data_df


# Run the tagging over every clinical case; each call appends one row to data_df.
train_text.apply(tokenize_and_set_ids, axis=1)

0                 id                                 ...
1                 id                                 ...
2                 id                                 ...
3                 id                                 ...
4                 id                                 ...
                             ...                        
422               id                                 ...
423               id                                 ...
424               id                                 ...
425               id                                 ...
426               id                                 ...
Length: 427, dtype: object

In [None]:
# Total number of examples collected (all of them training examples)
len(data_df)

427

### **Empleo de estructuras de datos más eficientes**

In [None]:
# Hugging Face Dataset built from the tagged examples. preserve_index=False keeps the
# pandas index from being exported as an extra '__index_level_0__' column.
full_dataset = Dataset.from_pandas(data_df, preserve_index=False)
# Hold out 10% of the examples for validation; the fixed seed makes the split
# reproducible across runs.
data = full_dataset.train_test_split(test_size=0.1, seed=42)

data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', '__index_level_0__'],
        num_rows: 384
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', '__index_level_0__'],
        num_rows: 43
    })
})

### **Tokenización para RoBERTa**

In [None]:
# RoBERTa tokenizer of the PlanTL biomedical Spanish checkpoint
tokenizer = AutoTokenizer.from_pretrained("BSC-TeMU/roberta-base-biomedical-es")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/542k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
# Every tag of the BIO scheme: "O" first, then the B-/I- pair of each entity type
label_list = ["O"] + [
    f"{prefix}-{entity}"
    for entity in (
        "disease",
        "gene",
        "DNAMutation",
        "SNP",
        "DNAAllele",
        "NucleotideChange-BaseChange",
        "OtherMutation",
        "Transcript",
    )
    for prefix in ("B", "I")
]

In [None]:
def tokenize_and_align_labels(examples):
    """
    Run the model tokenizer over pre-split words and align the NER tags to its sub-tokens.

    Only the first sub-token of each word carries the word's tag; special tokens and the
    remaining sub-tokens of a word receive -100.

    Args:
        examples (dict): Batch with the keys "tokens" (list of word lists) and
            "ner_tags" (list of tag-id lists, one tag per word).

    Returns:
        The tokenizer output for the batch, with an added "labels" entry holding the
        aligned tag ids.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_ids, word_tags):
        # Walk the sub-tokens of one example and emit one label per sub-token
        aligned = []
        last_word = None
        for current_word in word_ids:
            starts_new_word = current_word is not None and current_word != last_word
            # Tag the first sub-token of a word; mask everything else with -100
            aligned.append(word_tags[current_word] if starts_new_word else -100)
            last_word = current_word
        return aligned

    tokenized_inputs["labels"] = [
        _align(tokenized_inputs.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs

# Apply the tokenization and label alignment to every split of the dataset, in batches
tokenized_data = data.map(tokenize_and_align_labels, batched=True)

# seqeval metric loaded through the evaluate library; used by compute_metrics
seqeval = evaluate.load("seqeval")

# Collator built from the tokenizer and later handed to the Trainer
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

def compute_metrics(p):
    """
    Compute seqeval metrics for a batch of token-classification predictions.

    Positions whose reference label is -100 are dropped from both predictions and
    references before the tag ids are converted to tag names via ``label_list``.

    Args:
        p (tuple): Pair ``(predictions, labels)``; the predicted class of each position is
            taken as the argmax of ``predictions`` over axis 2.

    Returns:
        dict: The overall "precision", "recall", "f1" and "accuracy" reported by seqeval.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    # Entity-level scoring
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        metric: results[f"overall_{metric}"]
        for metric in ("precision", "recall", "f1", "accuracy")
    }

# Load the pretrained RoBERTa checkpoint for token classification, with one output
# class per BIO tag (17 in total)
model = AutoModelForTokenClassification.from_pretrained(
    "BSC-TeMU/roberta-base-biomedical-es",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Map:   0%|          | 0/384 [00:00<?, ? examples/s]

Map:   0%|          | 0/43 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/504M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at BSC-TeMU/roberta-base-biomedical-es and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### **Fine-tuning de RoBERTa**

In [None]:
# Folder on Google Drive that receives the training logs and the final model
logs_dir = f"{path_drive}/GenoVarDis/Logs"

# Training hyperparameters
training_args = TrainingArguments(
    output_dir=logs_dir,
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="no",
    logging_steps=20,
)

# Trainer bundling the model, the data splits and the evaluation function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune, then store the resulting model
trainer.train()
trainer.save_model(f"{logs_dir}/RoBERTa_DevTest")

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.5114,0.665041,0.0,0.0,0.0,0.871055
2,0.6306,0.450983,0.472941,0.258687,0.334443,0.886588
3,0.4378,0.315294,0.587886,0.637066,0.611489,0.922051
4,0.3441,0.244024,0.591966,0.720721,0.650029,0.940421
5,0.2229,0.206363,0.65442,0.733591,0.691748,0.944961
6,0.1976,0.194973,0.661435,0.759331,0.70701,0.948933
7,0.1733,0.184059,0.694282,0.765766,0.728274,0.951628
8,0.1499,0.184401,0.672626,0.774775,0.720096,0.950777
9,0.1406,0.185036,0.698026,0.773488,0.733822,0.949926
10,0.1235,0.191279,0.685714,0.803089,0.739775,0.947727


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=480, training_loss=0.22535902659098309, metrics={'train_runtime': 768.2373, 'train_samples_per_second': 9.997, 'train_steps_per_second': 0.625, 'total_flos': 2006353406152416.0, 'train_loss': 0.22535902659098309, 'epoch': 20.0})