# **GenoVarDis@IberLEF2024: Automatic Genomic Variants and Related Diseases using Named Entity Recognition with Large Language Models**

## Autor: Víctor Manuel Oliveros Villena


In [None]:
# Root of the mounted Google Drive.
# To work from a different location, append the extra directories to this path.
path_drive = "/content/drive/MyDrive"

### **Configuración del entorno**

In [None]:
!pip install gliner transformers beartype

Collecting gliner
  Downloading gliner-0.2.2-py3-none-any.whl (30 kB)
Collecting beartype
  Downloading beartype-0.18.5-py3-none-any.whl (917 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m917.8/917.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting flair==0.13.1 (from gliner)
  Downloading flair-0.13.1-py3-none-any.whl (388 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.3/388.3 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from gliner)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting boto3>=1.20.27 (from flair==0.13.1->gliner)
  Downloading boto3-1.34.116-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bpemb>=0.3.2 

In [None]:
import random
import sys

import numpy as np
import pandas as pd
import torch
from gliner import GLiNER
from google.colab import drive
from transformers import BasicTokenizer

In [None]:
# Mount Google Drive so the paths built from `path_drive` are reachable.
drive.mount("/content/drive")

Mounted at /content/drive


## **Preprocesado**

In [None]:
# Make the project's Prototypes directory importable, then load the custom trainer from it.
sys.path.append(f"{path_drive}/GenoVarDis/Prototypes")
from trainer import GlinerTrainer

In [None]:
# Each TSV file on Google Drive is located and then read with pandas (tab-separated).

# Training split: texts and their entity annotations.
path_train_text = f"{path_drive}/GenoVarDis/Data/train_text.tsv"
train_text = pd.read_csv(path_train_text, sep="\t")
path_train_annotation = f"{path_drive}/GenoVarDis/Data/train_annotation.tsv"
train_annotation = pd.read_csv(path_train_annotation, sep="\t")

# Development split: texts and their entity annotations.
path_dev_text = f"{path_drive}/GenoVarDis/Data/dev_text.tsv"
dev_text = pd.read_csv(path_dev_text, sep="\t")
path_dev_annotation = f"{path_drive}/GenoVarDis/Data/dev_annotation.tsv"
dev_annotation = pd.read_csv(path_dev_annotation, sep="\t")

### **Preprocesado para adaptar el formato de nuestros ejemplos al empleado por modelos GLiNER**

In [None]:
# Tokenizer applied to both the full texts and the annotated entity spans.
# NOTE(review): BasicTokenizer presumably lowercases by default - confirm this is intended.
basic_tokenizer = BasicTokenizer()
# Empty object array that accumulates one GLiNER-format example per processed row.
data = np.empty(0, dtype=object)

def tokenize_and_set_ids(example, type_set='train'):
  """
  Tokenize one clinical case and align its annotated entities to token spans.

  The text is tokenized with the module-level `basic_tokenizer`. The annotations
  of the same `pmid` are read from `dev_annotation` or `train_annotation`,
  sorted by `offset1`, and each entity span is tokenized and searched in the
  text tokens. Matching is greedy and strictly left to right: an entity must
  start after the last token of the previously matched entity, so an annotation
  nested in (or overlapping) an earlier match is either skipped or matched to a
  later occurrence. Entities that cannot be located are silently dropped.

  The resulting example, `{'tokenized_text': [...], 'ner': [[start, end, label], ...]}`
  with inclusive token indices, is appended to the global `data` array.

  Args:
      example (dict): Mapping (or DataFrame row) with the keys 'pmid' and 'text'.
      type_set (str): 'dev' reads the annotations from `dev_annotation`; any
          other value (default 'train') reads them from `train_annotation`.

  Returns:
      list: Every example accumulated in `data` so far, including this one.
  """
  global data

  pmid = example['pmid']    # Identifier of the clinical case
  text = example['text']    # Text of the clinical case
  # dtype=str keeps the comparison below string-vs-string even for an empty text
  tokens = np.array(basic_tokenizer.tokenize(text), dtype=str)

  # Pick the annotation table that matches the requested split
  annotations = dev_annotation if type_set == 'dev' else train_annotation
  sort_df = annotations[annotations['pmid'] == pmid].sort_values(by='offset1')
  labels = sort_df['label'].values.astype(str)    # Entity types
  spans = sort_df['span'].values.astype(str)      # Entity surface forms

  tags = []
  # Index of the last token already assigned to an entity (-1: none yet).
  # Every match starts after it, so it is always the maximum assigned index.
  last_used = -1
  for label, span in zip(labels, spans):
    span_tokens = np.array(basic_tokenizer.tokenize(span), dtype=str)
    # A span that tokenizes to nothing cannot be located; skip it
    if span_tokens.size == 0:
      continue
    width = len(span_tokens)
    # Candidate starts: every position where the first token of the entity occurs
    for start in np.where(tokens == span_tokens[0])[0]:
      start = int(start)    # plain int keeps the example JSON-serializable
      # Only positions after the previously matched entity are valid
      if start <= last_used:
        continue
      # The whole entity must match (trivially true for one-token entities)
      if np.array_equal(tokens[start:start + width], span_tokens):
        last_used = start + width - 1
        tags.append([start, last_used, label])
        break

  # Example in the format expected by GLiNER models
  result = {'tokenized_text': tokens.tolist(), 'ner': tags}

  # Append the example to the global accumulator
  data = np.append(data, result)

  return data.tolist()

In [None]:
# Apply the function above to the whole training set.
# `tokenize_and_set_ids` appends to the global `data`, so the accumulator is reset
# here first: re-running this cell no longer adds the training examples twice.
data = np.array([], dtype=dict)
train_text.apply(tokenize_and_set_ids, axis=1)

0      [{'tokenized_text': ['12672033', '|', 't', '|'...
1      [{'tokenized_text': ['12672033', '|', 't', '|'...
2      [{'tokenized_text': ['12672033', '|', 't', '|'...
3      [{'tokenized_text': ['12672033', '|', 't', '|'...
4      [{'tokenized_text': ['12672033', '|', 't', '|'...
                             ...                        
422    [{'tokenized_text': ['12672033', '|', 't', '|'...
423    [{'tokenized_text': ['12672033', '|', 't', '|'...
424    [{'tokenized_text': ['12672033', '|', 't', '|'...
425    [{'tokenized_text': ['12672033', '|', 't', '|'...
426    [{'tokenized_text': ['12672033', '|', 't', '|'...
Length: 427, dtype: object

In [None]:
# Apply the same function to the whole validation set; `apply` forwards the extra
# keyword argument to the function on every row.
dev_text.apply(tokenize_and_set_ids, axis=1, type_set='dev')

0     [{'tokenized_text': ['12672033', '|', 't', '|'...
1     [{'tokenized_text': ['12672033', '|', 't', '|'...
2     [{'tokenized_text': ['12672033', '|', 't', '|'...
3     [{'tokenized_text': ['12672033', '|', 't', '|'...
4     [{'tokenized_text': ['12672033', '|', 't', '|'...
                            ...                        
65    [{'tokenized_text': ['12672033', '|', 't', '|'...
66    [{'tokenized_text': ['12672033', '|', 't', '|'...
67    [{'tokenized_text': ['12672033', '|', 't', '|'...
68    [{'tokenized_text': ['12672033', '|', 't', '|'...
69    [{'tokenized_text': ['12672033', '|', 't', '|'...
Length: 70, dtype: object

In [None]:
# Total number of examples: training examples + validation examples.
print(len(data))
# Every processed row contributes exactly one example, so any other total means
# a preprocessing cell was executed more than once and `data` holds duplicates.
assert len(data) == len(train_text) + len(dev_text), \
    "data does not hold one example per row; re-run the preprocessing cells from the top"

497


In [None]:
# Convert the examples to a plain Python list so they are compatible with Trainer.py.
# `list(...)` accepts both the NumPy array and an already-converted list, so running
# this cell a second time no longer fails.
data = list(data)

## **Fine-tuning de GLiNER Medium**

In [None]:
# Load the pretrained GLiNER checkpoint to fine-tune (the Medium variant).
model = GLiNER.from_pretrained('urchade/gliner_medium-v2.1')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


pytorch_model.bin:   0%|          | 0.00/781M [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

In [None]:
# Seed the random number generators before training so the run can be repeated.
# NOTE(review): which generator trainer.py uses for shuffling is not visible here,
# so Python, NumPy and PyTorch are all seeded.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# The first 10% of the examples are held out for validation; the rest are used
# for training. The split index is computed once so both slices always agree.
n_val = int(len(data) * 0.1)

# Validation samples together with the entity types to evaluate
eval_data = {
    "entity_types": ["Gene", "Disease", "DNAMutation", "SNP", "DNAAllele", "NucleotideChange-BaseChange", "OtherMutation", "Transcript"],
    "samples": data[:n_val]
}

# Hyperparameters of the fine-tuning run
trainer = GlinerTrainer(model,
                        train_data = data[n_val:],
                        batch_size = 2,
                        grad_accum_every = 16,
                        lr_encoder = 1e-5,
                        lr_others = 5e-5,
                        freeze_token_rep = False,
                        val_every_step = 224, # one epoch is 224 steps in the recorded run
                        val_data = eval_data,
                        checkpoint_every_epoch = 10, # Or checkpoint_every_step if you use steps
                        max_types=25,
                        max_len=700,
)

# Release cached GPU memory before the training loop starts
torch.cuda.empty_cache()

trainer.train(num_epochs=20)

Training for 4480 steps which is 20 epochs.


Epoch 1/20: 100%|██████████| 224/224 [02:02<00:00,  1.83it/s, average_loss=104, loss=96.1, step=224]


Step=224
P: 68.34%	R: 55.95%	F1: 61.53%

Epoch 1 average loss: 104.48708699430738


Epoch 2/20: 100%|██████████| 224/224 [02:00<00:00,  1.85it/s, average_loss=79.2, loss=96.3, step=448]


Step=448
P: 68.43%	R: 61.96%	F1: 65.03%

Epoch 2 average loss: 79.23309877940586


Epoch 3/20: 100%|██████████| 224/224 [02:01<00:00,  1.84it/s, average_loss=64.8, loss=42.3, step=672]


Step=672
P: 69.35%	R: 65.18%	F1: 67.20%

Epoch 3 average loss: 64.8128315125193


Epoch 4/20: 100%|██████████| 224/224 [02:01<00:00,  1.84it/s, average_loss=56.1, loss=26.2, step=896]


Step=896
P: 71.58%	R: 68.08%	F1: 69.78%

Epoch 4 average loss: 56.11617525134768


Epoch 5/20: 100%|██████████| 224/224 [02:00<00:00,  1.85it/s, average_loss=49.5, loss=82.9, step=1120]


Step=1120
P: 71.81%	R: 68.85%	F1: 70.30%

Epoch 5 average loss: 49.48051529271262


Epoch 6/20: 100%|██████████| 224/224 [02:01<00:00,  1.84it/s, average_loss=45.3, loss=43.9, step=1344]


Step=1344
P: 70.94%	R: 69.52%	F1: 70.22%

Epoch 6 average loss: 45.348886179072515


Epoch 7/20: 100%|██████████| 224/224 [02:01<00:00,  1.84it/s, average_loss=41.7, loss=57.4, step=1568]


Step=1568
P: 70.34%	R: 70.97%	F1: 70.65%

Epoch 7 average loss: 41.7455664404801


Epoch 8/20: 100%|██████████| 224/224 [02:01<00:00,  1.84it/s, average_loss=39.1, loss=28.1, step=1792]


Step=1792
P: 72.16%	R: 70.63%	F1: 71.39%

Epoch 8 average loss: 39.051191576889586


Epoch 9/20: 100%|██████████| 224/224 [02:00<00:00,  1.86it/s, average_loss=36.5, loss=30, step=2016]


Step=2016
P: 73.37%	R: 71.41%	F1: 72.38%

Epoch 9 average loss: 36.47751698323658


Epoch 10/20: 100%|██████████| 224/224 [02:01<00:00,  1.84it/s, average_loss=34.4, loss=44.4, step=2240]


Step=2240
P: 74.06%	R: 72.41%	F1: 73.23%

Epoch 10 average loss: 34.41301850335939


Epoch 11/20: 100%|██████████| 224/224 [02:01<00:00,  1.85it/s, average_loss=33.4, loss=34.2, step=2464]


Step=2464
P: 73.26%	R: 73.75%	F1: 73.50%

Epoch 11 average loss: 33.40691359447582


Epoch 12/20: 100%|██████████| 224/224 [02:01<00:00,  1.85it/s, average_loss=32.6, loss=18.7, step=2688]


Step=2688
P: 74.01%	R: 72.86%	F1: 73.43%

Epoch 12 average loss: 32.57487925887108


Epoch 13/20: 100%|██████████| 224/224 [02:01<00:00,  1.84it/s, average_loss=31.1, loss=36.4, step=2912]


Step=2912
P: 74.01%	R: 72.53%	F1: 73.26%

Epoch 13 average loss: 31.12333712407521


Epoch 14/20: 100%|██████████| 224/224 [02:01<00:00,  1.85it/s, average_loss=30.5, loss=44.9, step=3136]


Step=3136
P: 75.47%	R: 72.19%	F1: 73.79%

Epoch 14 average loss: 30.492397086960928


Epoch 15/20: 100%|██████████| 224/224 [02:02<00:00,  1.83it/s, average_loss=29.8, loss=20.3, step=3360]


Step=3360
P: 75.73%	R: 72.53%	F1: 74.09%

Epoch 15 average loss: 29.841055738074438


Epoch 16/20: 100%|██████████| 224/224 [02:01<00:00,  1.84it/s, average_loss=28.5, loss=46.2, step=3584]


Step=3584
P: 75.11%	R: 73.53%	F1: 74.31%

Epoch 16 average loss: 28.472298826490128


Epoch 17/20: 100%|██████████| 224/224 [02:01<00:00,  1.85it/s, average_loss=29.2, loss=6.61, step=3808]


Step=3808
P: 75.89%	R: 73.19%	F1: 74.52%

Epoch 17 average loss: 29.196867606469564


Epoch 18/20: 100%|██████████| 224/224 [02:01<00:00,  1.84it/s, average_loss=28.7, loss=19.9, step=4032]


Step=4032
P: 75.93%	R: 72.97%	F1: 74.42%

Epoch 18 average loss: 28.731610421623504


Epoch 19/20: 100%|██████████| 224/224 [02:01<00:00,  1.84it/s, average_loss=28.3, loss=34.2, step=4256]


Step=4256
P: 75.60%	R: 73.08%	F1: 74.32%

Epoch 19 average loss: 28.307048923202924


Epoch 20/20: 100%|██████████| 224/224 [02:01<00:00,  1.84it/s, average_loss=28.8, loss=55.7, step=4480]

Step=4480
P: 75.60%	R: 73.08%	F1: 74.32%

Epoch 20 average loss: 28.76741732444082





28.76741732444082

In [None]:
# Persist the fine-tuned model to the project's Logs directory on Drive.
trainer.model.save_pretrained(f"{path_drive}/GenoVarDis/Logs/GLiNERMedium_Trainer")