# **GenoVarDis@IberLEF2024: Automatic Genomic Variants and Related Diseases using Named Entity Recognition with Large Language Models**

## Autor: Víctor Manuel Oliveros Villena


In [None]:
# Root path of our Google Drive.
# To use a different location, append the relevant directories to this path.
path_drive = '/content/drive/MyDrive'

### **Configuración del entorno**

In [None]:
!pip install gliner transformers

Collecting gliner
  Downloading gliner-0.2.2-py3-none-any.whl (30 kB)
Collecting flair==0.13.1 (from gliner)
  Downloading flair-0.13.1-py3-none-any.whl (388 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.3/388.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from gliner)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting boto3>=1.20.27 (from flair==0.13.1->gliner)
  Downloading boto3-1.34.115-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bpemb>=0.3.2 (from flair==0.13.1->gliner)
  Downloading bpemb-0.3.5-py3-none-any.whl (19 kB)
Collecting conllu>=4.0 (from flair==0.13.1->gliner)
  Downloading conllu-4.5.3-py2.py3-none-any.whl (16 kB)
Collecting 

In [None]:
import numpy as np
import torch
import os
from google.colab import drive
import pandas as pd
from gliner import GLiNER
from transformers import BasicTokenizer
from tqdm import tqdm
from transformers import get_cosine_schedule_with_warmup

In [None]:
# Mount Google Drive at /content/drive, the prefix that `path_drive` is rooted under.
drive.mount('/content/drive')

Mounted at /content/drive


## **Preprocesado**

### **Lectura de archivos**

In [None]:
# Locations of the TSV files inside Google Drive (texts and annotations for train and dev)
path_train_text, path_train_annotation, path_dev_text, path_dev_annotation = (
    path_drive + relative_path
    for relative_path in (
        '/GenoVarDis/Data/train_text.tsv',
        '/GenoVarDis/Data/train_annotation.tsv',
        '/GenoVarDis/Data/dev_text.tsv',
        '/GenoVarDis/Data/dev_annotation.tsv',
    )
)

# Load each tab-separated file into its own pandas DataFrame, in the same order as above
train_text, train_annotation, dev_text, dev_annotation = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        path_train_text,
        path_train_annotation,
        path_dev_text,
        path_dev_annotation,
    )
)

### **Preprocesado para adaptar el formato de nuestros ejemplos al empleado por modelos GLiNER**

In [None]:
# Tokenizer used to split both the clinical-case text and the annotated entity spans.
basic_tokenizer = BasicTokenizer()
# Module-level accumulator (object array of dicts); tokenize_and_set_ids appends one entry per call.
data = np.array([], dtype=dict)

def tokenize_and_set_ids(example, type_set='train'):
  """
  Tokenize one clinical case and align its annotated entities to token spans.

  The annotations of the case are visited in increasing `offset1` order. Each
  entity span is tokenized with the same tokenizer as the text and matched
  against the text tokens; the first full match that starts after every
  previously assigned token is recorded as `[start_token, end_token, label]`
  (both indices inclusive). Entities for which no such match exists are
  skipped.

  Side effect: the resulting example, a dict with the keys `tokenized_text`
  and `ner`, is appended to the module-level array `data`.

  Args:
      example (dict): Row holding one clinical case; the keys `pmid` and `text` are read.
      type_set (str): 'dev' reads the annotations from `dev_annotation`; any
          other value (default 'train') reads them from `train_annotation`.

  Returns:
      list: All examples accumulated in `data` so far, including this one
      (not only the example built by this call).
  """
  global data

  pmid = example['pmid']    # Pmid of the clinical case
  text = example['text']    # Text of the clinical case
  # dtype=str keeps the array string-typed even when the text yields no tokens
  tokens = np.array(basic_tokenizer.tokenize(text), dtype=str)

  # Pick the annotation table that matches the requested split
  annotations = dev_annotation if type_set == 'dev' else train_annotation
  sort_df = annotations[annotations['pmid'] == pmid].sort_values(by='offset1')
  labels = sort_df['label'].values.astype(str)    # Entity types
  spans = sort_df['span'].values.astype(str)      # Entity surface forms

  tags = []
  # Index of the last token already assigned to an entity. Assignments only move
  # forward, so this single cursor replaces the set of used indices and its max().
  last_assigned = -1

  for label, span in zip(labels, spans):
    span_tokens = np.array(basic_tokenizer.tokenize(span))
    span_len = len(span_tokens)
    # A span that tokenizes to nothing cannot be aligned (and has no first token)
    if span_len == 0:
      continue

    # Candidate starts: every position where the first token of the entity occurs
    for start in np.where(tokens == span_tokens[0])[0]:
      # Only accept positions located after everything assigned so far
      if start <= last_assigned:
        continue
      end = start + span_len - 1
      # The whole entity must match; a window truncated by the end of the text never does
      if np.array_equal(tokens[start:end + 1], span_tokens):
        # Plain Python int/str keep the example free of numpy scalar types
        tags.append([int(start), int(end), str(label)])
        last_assigned = int(end)
        break

  # Build the example in the format expected by GLiNER models
  result = {
    'tokenized_text': tokens.tolist(),
    'ner': tags,
  }

  # Append the example to the global accumulator
  data = np.append(data, result)

  return data.tolist()

In [None]:
# Run the function above over every row of the training set.
# Each call appends one example to the global `data`; the displayed Series holds each call's return value.
train_text.apply(tokenize_and_set_ids, axis=1)

0      [{'tokenized_text': ['12672033', '|', 't', '|'...
1      [{'tokenized_text': ['12672033', '|', 't', '|'...
2      [{'tokenized_text': ['12672033', '|', 't', '|'...
3      [{'tokenized_text': ['12672033', '|', 't', '|'...
4      [{'tokenized_text': ['12672033', '|', 't', '|'...
                             ...                        
422    [{'tokenized_text': ['12672033', '|', 't', '|'...
423    [{'tokenized_text': ['12672033', '|', 't', '|'...
424    [{'tokenized_text': ['12672033', '|', 't', '|'...
425    [{'tokenized_text': ['12672033', '|', 't', '|'...
426    [{'tokenized_text': ['12672033', '|', 't', '|'...
Length: 427, dtype: object

In [None]:
# Run the function above over every row of the validation set (annotations are read from dev_annotation).
# Each call appends one example to the global `data`, after the training examples added previously.
dev_text.apply(lambda row: tokenize_and_set_ids(row, type_set='dev'), axis=1)

0     [{'tokenized_text': ['12672033', '|', 't', '|'...
1     [{'tokenized_text': ['12672033', '|', 't', '|'...
2     [{'tokenized_text': ['12672033', '|', 't', '|'...
3     [{'tokenized_text': ['12672033', '|', 't', '|'...
4     [{'tokenized_text': ['12672033', '|', 't', '|'...
                            ...                        
65    [{'tokenized_text': ['12672033', '|', 't', '|'...
66    [{'tokenized_text': ['12672033', '|', 't', '|'...
67    [{'tokenized_text': ['12672033', '|', 't', '|'...
68    [{'tokenized_text': ['12672033', '|', 't', '|'...
69    [{'tokenized_text': ['12672033', '|', 't', '|'...
Length: 70, dtype: object

In [None]:
# Total number of examples: training examples + validation examples
print(len(data))

497


## **Entrenamiento del fine-tuning de GLiNER Medium**

In [None]:
# Load the specific GLiNER checkpoint we are going to fine-tune: the Medium model
model = GLiNER.from_pretrained("urchade/gliner_medium-v2.1")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


pytorch_model.bin:   0%|          | 0.00/781M [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

In [None]:
from types import SimpleNamespace

# Hyperparameters consumed by train()
config = SimpleNamespace(
    num_steps=20,                # number of training steps (iterations of the training loop)
    train_batch_size=2,          # batch size of the training dataloader
    eval_every=1,                # run the periodic evaluation every this many steps
    save_directory= path_drive + "/GenoVarDis/Logs",  # directory the final model is saved under
    warmup_ratio=0.1,            # < 1: fraction of num_steps used for warmup; otherwise an absolute step count
    # Use the GPU when the runtime has one; otherwise fall back to the CPU
    device='cuda' if torch.cuda.is_available() else 'cpu',
    lr_encoder=1e-5,             # forwarded to model.get_optimizer
    lr_others=5e-5,              # forwarded to model.get_optimizer
    freeze_token_rep=False,      # forwarded to model.get_optimizer

    # Sampling parameters, forwarded as-is to model.set_sampling_params
    max_types=25,
    shuffle_types=True,
    random_drop=True,
    max_neg_type_ratio=1,
    max_len=700
)

In [None]:
def train(model, config, train_data, eval_data=None):
    """
    Fine-tune the model on the training data, evaluating periodically on the evaluation data.

    The loop runs for `config.num_steps` steps, cycling over the training
    dataloader as many times as needed. A step whose loss is NaN skips the
    parameter update but still takes part in the periodic evaluation and in
    the final save. The model is always saved after the last step, under
    `<config.save_directory>/GLiNERMedium_DevVal`.

    Args:
        model: The model to train. It is called on a batch to obtain the loss and must
            provide `to`, `set_sampling_params`, `train`, `eval`, `create_dataloader`,
            `get_optimizer`, `evaluate` and `save_pretrained`.
        config: Namespace with the training hyperparameters: `num_steps`,
            `train_batch_size`, `eval_every`, `save_directory`, `warmup_ratio`, `device`,
            `lr_encoder`, `lr_others`, `freeze_token_rep`, `max_types`, `shuffle_types`,
            `random_drop`, `max_neg_type_ratio` and `max_len`. Two optional attributes
            are honoured when present: `eval_threshold` (default 0.5) and
            `eval_batch_size` (default 12).
        train_data: Training examples.
        eval_data: Optional dict with the keys "samples" and "entity_types". When it is
            None, no evaluation is run.

    Returns:
        None

    Raises:
        ValueError: If the training dataloader contains no batches.
    """
    model = model.to(config.device)

    # Forward the sampling hyperparameters to the model
    model.set_sampling_params(
        max_types=config.max_types,
        shuffle_types=config.shuffle_types,
        random_drop=config.random_drop,
        max_neg_type_ratio=config.max_neg_type_ratio,
        max_len=config.max_len
    )

    # Training mode
    model.train()

    # Build the training dataloader
    train_loader = model.create_dataloader(train_data, batch_size=config.train_batch_size, shuffle=True)
    steps_per_epoch = len(train_loader)
    if steps_per_epoch == 0:
        # Fail with a clear message instead of a bare StopIteration / ZeroDivisionError below
        raise ValueError("train_data produced an empty dataloader; nothing to train on")

    # Build the optimizer
    optimizer = model.get_optimizer(config.lr_encoder, config.lr_others, config.freeze_token_rep)

    pbar = tqdm(range(config.num_steps)) # Progress bar over the training steps

    # Warmup length: a ratio below 1 is a fraction of the total steps, otherwise an absolute count
    if config.warmup_ratio < 1:
        num_warmup_steps = int(config.num_steps * config.warmup_ratio)
    else:
        num_warmup_steps = int(config.warmup_ratio)

    # Scheduler with warmup followed by cosine decay
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=config.num_steps
    )

    # Evaluation settings; the defaults are the values that used to be hard-coded
    eval_threshold = getattr(config, "eval_threshold", 0.5)
    eval_batch_size = getattr(config, "eval_batch_size", 12)

    iter_train_loader = iter(train_loader) # Iterator over the training batches

    for step in pbar:
        try:
            x = next(iter_train_loader) # Fetch the next batch
        except StopIteration:
            iter_train_loader = iter(train_loader) # Start a new pass once the loader is exhausted
            x = next(iter_train_loader)

        # Move the tensors of the batch to the configured device
        for k, v in x.items():
            if isinstance(v, torch.Tensor):
                x[k] = v.to(config.device)

        loss = model(x)  # Forward pass

        # A NaN loss only skips the update; evaluation and the final save below still happen
        if not torch.isnan(loss):
            loss.backward()  # Compute the gradients
            optimizer.step()  # Update the parameters
            scheduler.step()  # Advance the learning-rate schedule
            optimizer.zero_grad()  # Reset the gradients

        # Refresh the progress bar
        description = f"step: {step} | epoch: {step // steps_per_epoch} | loss: {loss.item():.2f}"
        pbar.set_description(description)

        # Periodic evaluation on the validation data
        if eval_data is not None and (step + 1) % config.eval_every == 0:
            model.eval()

            results, _ = model.evaluate(eval_data["samples"], flat_ner=True, threshold=eval_threshold,
                                        batch_size=eval_batch_size,
                                        entity_types=eval_data["entity_types"])

            print(f"Step={step}\n{results}")

            model.train()

        # Save the model after the last step, whether or not that step is an evaluation step
        if step == config.num_steps - 1:
            os.makedirs(config.save_directory, exist_ok=True)
            model.save_pretrained(f"{config.save_directory}/GLiNERMedium_DevVal")

In [None]:
# Validation bundle: the entity types to score and the first 10% of `data` as samples
eval_data = dict(
    entity_types=[
        "Gene",
        "Disease",
        "DNAMutation",
        "SNP",
        "DNAAllele",
        "NucleotideChange-BaseChange",
        "OtherMutation",
        "Transcript",
    ],
    samples=data[:int(len(data)*0.1)],
)

# Train the model on the remaining examples, evaluating on the bundle above
train(model, config, train_data=data[int(len(data)*0.1):], eval_data=eval_data)

step: 0 | epoch: 0 | loss: 47.03:   5%|▌         | 1/20 [05:10<1:38:22, 310.64s/it]

Step=0
P: 56.37%	R: 42.83%	F1: 48.67%



step: 1 | epoch: 0 | loss: 132.14:  10%|█         | 2/20 [10:29<1:34:41, 315.63s/it]

Step=1
P: 55.63%	R: 45.61%	F1: 50.12%



step: 2 | epoch: 0 | loss: 124.34:  15%|█▌        | 3/20 [15:33<1:27:57, 310.42s/it]

Step=2
P: 56.63%	R: 50.83%	F1: 53.58%



step: 3 | epoch: 0 | loss: 92.80:  20%|██        | 4/20 [20:36<1:21:59, 307.48s/it]

Step=3
P: 56.59%	R: 50.61%	F1: 53.44%



step: 4 | epoch: 0 | loss: 106.25:  25%|██▌       | 5/20 [25:32<1:15:50, 303.35s/it]

Step=4
P: 58.49%	R: 52.50%	F1: 55.33%



step: 5 | epoch: 0 | loss: 125.62:  30%|███       | 6/20 [29:40<1:06:20, 284.29s/it]

Step=5
P: 58.57%	R: 53.95%	F1: 56.17%



step: 6 | epoch: 0 | loss: 156.42:  35%|███▌      | 7/20 [33:33<57:58, 267.59s/it]  

Step=6
P: 59.09%	R: 56.40%	F1: 57.71%



step: 7 | epoch: 0 | loss: 97.44:  40%|████      | 8/20 [38:26<55:09, 275.83s/it]

Step=7
P: 63.26%	R: 57.84%	F1: 60.43%



step: 8 | epoch: 0 | loss: 141.27:  45%|████▌     | 9/20 [43:17<51:26, 280.58s/it]

Step=8
P: 67.38%	R: 59.29%	F1: 63.08%



step: 9 | epoch: 0 | loss: 99.15:  50%|█████     | 10/20 [48:20<47:53, 287.38s/it]

Step=9
P: 70.84%	R: 59.18%	F1: 64.48%



step: 10 | epoch: 0 | loss: 73.22:  55%|█████▌    | 11/20 [53:24<43:50, 292.29s/it]

Step=10
P: 73.48%	R: 59.18%	F1: 65.56%



step: 11 | epoch: 0 | loss: 123.53:  60%|██████    | 12/20 [58:30<39:34, 296.76s/it]

Step=11
P: 74.37%	R: 59.07%	F1: 65.84%



step: 12 | epoch: 0 | loss: 83.66:  65%|██████▌   | 13/20 [1:03:34<34:51, 298.81s/it]

Step=12
P: 74.20%	R: 59.18%	F1: 65.84%



step: 13 | epoch: 0 | loss: 58.75:  70%|███████   | 14/20 [1:08:36<29:58, 299.71s/it]

Step=13
P: 74.48%	R: 59.73%	F1: 66.30%



step: 14 | epoch: 0 | loss: 98.15:  75%|███████▌  | 15/20 [1:13:44<25:10, 302.20s/it]

Step=14
P: 73.93%	R: 59.29%	F1: 65.80%



step: 15 | epoch: 0 | loss: 68.41:  80%|████████  | 16/20 [1:18:49<20:12, 303.05s/it]

Step=15
P: 74.10%	R: 59.51%	F1: 66.01%



step: 16 | epoch: 0 | loss: 80.67:  85%|████████▌ | 17/20 [1:23:50<15:07, 302.42s/it]

Step=16
P: 73.90%	R: 59.51%	F1: 65.93%



step: 17 | epoch: 0 | loss: 59.87:  90%|█████████ | 18/20 [1:29:09<10:15, 307.56s/it]

Step=17
P: 73.69%	R: 59.51%	F1: 65.85%



step: 18 | epoch: 0 | loss: 91.52:  95%|█████████▌| 19/20 [1:34:19<05:08, 308.08s/it]

Step=18
P: 73.49%	R: 59.51%	F1: 65.77%



step: 19 | epoch: 0 | loss: 59.57:  95%|█████████▌| 19/20 [1:34:39<05:08, 308.08s/it]

Step=19
P: 73.49%	R: 59.51%	F1: 65.77%



step: 19 | epoch: 0 | loss: 59.57: 100%|██████████| 20/20 [1:39:22<00:00, 298.11s/it]
