In [None]:
# Install the notebook's extra dependencies into the environment of the running kernel.
# `%pip` (rather than `!pip`) targets the kernel's own interpreter, and quoting "ray[tune]"
# keeps the shell from treating the square brackets as a glob pattern.
%pip install -q datasets "ray[tune]" pyngrok



In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset
import math
import torch.nn as nn
import itertools
from ray import tune
from ray import train
from ray.tune.schedulers import ASHAScheduler
from ray.tune import CLIReporter
import psutil
from typing import List, Dict, Tuple, Optional, Any
from pyngrok import ngrok
import os
import shutil
import matplotlib.pyplot as plt

# Hugging Face checkpoints to compare; uncomment further entries to widen the comparison.
model_names: List[str] = [
  "microsoft/DialoGPT-small",
  # "microsoft/DialoGPT-medium"
  ]


def _load_for_each_checkpoint(loader: Any) -> Dict[str, Any]:
  """Call `loader.from_pretrained` once per entry of `model_names`, keyed by checkpoint name."""
  loaded: Dict[str, Any] = {}
  for checkpoint in model_names:
    loaded[checkpoint] = loader.from_pretrained(checkpoint)
  return loaded


# pre-trained weights first, then the model-specific tokenizers
models: Dict[str, AutoModelForCausalLM] = _load_for_each_checkpoint(AutoModelForCausalLM)
tokenizers: Dict[str, AutoTokenizer] = _load_for_each_checkpoint(AutoTokenizer)

In [None]:
# Some tokenizers ship without a pad token (`tokenizer.pad_token is None`); in that case the
# end-of-sequence token is reused as the padding token.
# NOTE(review): `name` and `tokenizer` stay bound at module level after this loop and cells
# further down read those names, so check the later cells before renaming them.
for name, tokenizer in tokenizers.items():
  if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Placeholder data: the real conversations are meant to be loaded here.
# Each entry is one exchange with an "input" text and the "output" text expected in reply.
conversations: List[Dict[str, str]] = [
  {"input": "Hello", "output": "Hello, can I help you?"}
]

# Held-out exchanges, in the same format as `conversations`, used for validation.
conversations_val: List[Dict[str, str]] = [
  {"input": "Good morning", "output": "Good morning, can I help you?"}
]

In [None]:
def get_max_length(tokenizer: AutoTokenizer, conversations: List[Dict[str, str]]) -> int:
    """Return the largest token count found in any single "input" or "output" text.

    Every text is tokenized on its own with truncation disabled, so the result is the
    length of the longest individual text, not of an input/output pair.

    Args:
        tokenizer: callable returning a mapping with an "input_ids" entry for a text.
        conversations: dicts holding an "input" and an "output" string.

    Returns:
        The maximum number of token ids over all inputs and outputs.

    Raises:
        ValueError: if `conversations` is empty (maximum of an empty sequence).
    """
    def token_count(text: str) -> int:
        return len(tokenizer(text, truncation=False)["input_ids"])

    return max(
        max(token_count(turn["input"]), token_count(turn["output"]))
        for turn in conversations
    )

# One padded length is shared by the datasets of every model, so take the longest tokenized
# text over all model tokenizers and over both the training and the validation conversations.
# (Only names defined in the cells above are used, so this runs on a fresh kernel.)
max_length: int = max(
  get_max_length(tokenizers[checkpoint], conversations + conversations_val)
  for checkpoint in model_names
)

In [None]:
# depending on the model's expected input, the way the conversations are tokenized is different (labels is what we are trying to predict)
def preprocess_conversations(tokenizer: AutoTokenizer, conversations: List[Dict[str, str]], max_length: int) -> Dataset:
  """Tokenize conversation pairs into a dataset with `input_ids`, `attention_mask` and `labels`.

  Inputs and outputs are tokenized separately, each padded/truncated to `max_length`.
  The token ids of the outputs become the labels; label positions that are padding are
  replaced by -100, the index the losses in this notebook are configured to ignore, so
  that padding does not contribute to the loss.

  NOTE(review): inputs and targets are two separate, position-aligned sequences rather than
  one concatenated sequence - confirm this is the intended objective for a causal LM.
  NOTE(review): when the pad token is the EOS token, masking the padding also means no EOS
  label is left in the targets - confirm whether an explicit EOS should be appended.

  Args:
    tokenizer: tokenizer of the model the dataset is built for (must have a pad token).
    conversations: dicts holding an "input" and an "output" string.
    max_length: length every sequence is padded or truncated to.

  Returns:
    A `Dataset` with one row per conversation.
  """
  input_texts: List[str] = [conversation["input"] for conversation in conversations]
  target_texts: List[str] = [conversation["output"] for conversation in conversations]

  tokenized_inputs = tokenizer(input_texts, padding="max_length", max_length=max_length, truncation=True, return_tensors="pt")
  tokenized_outputs = tokenizer(target_texts, padding="max_length", max_length=max_length, truncation=True, return_tensors="pt")

  # Use the attention mask (not the pad token id) to find the padding: the pad token may be the
  # same id as EOS, and genuine tokens must keep their label.
  labels: torch.Tensor = tokenized_outputs["input_ids"].masked_fill(
    tokenized_outputs["attention_mask"] == 0, -100
  )

  dataset: Dataset = Dataset.from_dict({
    "input_ids": tokenized_inputs["input_ids"],
    "attention_mask": tokenized_inputs["attention_mask"],
    "labels": labels
  })

  return dataset

def _tokenize_for_every_model(pairs: List[Dict[str, str]]) -> Dict[str, Dataset]:
  """Preprocess `pairs` once per entry of `model_names`, each time with that model's own tokenizer."""
  per_model: Dict[str, Dataset] = {}
  for checkpoint in model_names:
    per_model[checkpoint] = preprocess_conversations(tokenizers[checkpoint], pairs, max_length)
  return per_model


# training split first, validation split second; both are keyed by model name
tokenized_datasets: Dict[str, Dataset] = _tokenize_for_every_model(conversations)
tokenized_datasets_val: Dict[str, Dataset] = _tokenize_for_every_model(conversations_val)

In [None]:
# A custom `_save` is used because the wrapped base model can hold one and the same Parameter
# object as both `lm_head.weight` and `transformer.wte.weight` (tied weights); `lm_head` gets
# its own copy of the weights before the model is written to disk.
# NOTE(review): presumably the checkpoint writer rejects one tensor stored under two names - confirm.
class CustomTrainer(Trainer):
  """Trainer that unties `lm_head` from `transformer.wte` right before saving."""

  def _save(self, output_dir: str, state_dict: Optional[Dict[str, torch.Tensor]]=None) -> None:
    """Clone a tied `lm_head.weight` (if any), then delegate to `Trainer._save`.

    The clone only happens when the trained model has a `base_model` exposing both `lm_head`
    and `transformer`, and the two weights are the identical object. After this call the
    model keeps the untied copy.
    """
    wrapper = self.model
    weights_are_tied = (
      hasattr(wrapper, "base_model")
      and hasattr(wrapper.base_model, "lm_head")
      and hasattr(wrapper.base_model, "transformer")
      and wrapper.base_model.lm_head.weight is wrapper.base_model.transformer.wte.weight
    )
    if weights_are_tied:
      head = wrapper.base_model.lm_head
      head.weight = torch.nn.Parameter(head.weight.clone())

    super()._save(output_dir, state_dict)

In [None]:
# custom training routine
def train_model(model: nn.Module, custom_name:str, tokenizer: AutoTokenizer, tokenized_dataset: Dataset, tokenized_dataset_val: Dataset) -> None:
  """Train `model` for one epoch with `CustomTrainer` and save it under `./results/<custom_name>`.

  Args:
    model: module to train.
    custom_name: sub-folder of `./results` the final model is stored in.
    tokenizer: tokenizer handed to the trainer and to the padding collator.
    tokenized_dataset: training split.
    tokenized_dataset_val: validation split, registered as the trainer's eval dataset.
  """
  # training configuration; each epoch has (len(conversations) / batch_size) steps
  schedule = dict(num_train_epochs=1, per_device_train_batch_size=1)
  bookkeeping = dict(
    logging_dir='./logs',
    logging_steps=1,
    save_steps=1,
    save_total_limit=1,
    prediction_loss_only=True,
    report_to="tensorboard",
  )
  run_args: TrainingArguments = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    **schedule,
    **bookkeeping,
  )

  # collator that pads the input sequences of a batch with the tokenizer's pad token
  padding_collator: DataCollatorWithPadding = DataCollatorWithPadding(tokenizer=tokenizer) # type: ignore

  runner: Trainer = CustomTrainer(
    model=model,
    args=run_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset_val,
    tokenizer=tokenizer,
    data_collator=padding_collator,
  )
  runner.train()

  # the final model goes into a folder named after the run
  runner.save_model(f"./results/{custom_name}")

In [None]:
class CustomTransformerModel(nn.Module):
  """Pre-trained causal LM, optionally followed by an extra Transformer encoder and a new output layer.

  With `custom_layer=False` the logits of the base model are used directly. With
  `custom_layer=True` the last hidden state of the base model is passed through a
  `nn.TransformerEncoder` and a linear projection to the vocabulary size.

  NOTE(review): the extra encoder receives neither the attention mask nor a causal mask, so
  it attends over padded and future positions - confirm this is intended.
  """

  def __init__(self, base_model_name: str, custom_layer: bool, num_heads: int, num_layers: int) -> None:
    """Load the base model and, if requested, build the extra encoder and output layer.

    Args:
      base_model_name: checkpoint name passed to `AutoModelForCausalLM.from_pretrained`.
      custom_layer: whether to add the extra encoder + linear head.
      num_heads: attention heads of each extra encoder layer (only used with `custom_layer`).
      num_layers: number of extra encoder layers (only used with `custom_layer`).
    """
    super().__init__()
    self.base_model: AutoModelForCausalLM  = AutoModelForCausalLM.from_pretrained(base_model_name, output_hidden_states=True)
    self.custom_layer: bool = custom_layer
    if self.custom_layer:
      self.d_model: int = self.base_model.config.hidden_size
      # batch_first=True: the encoder expects (batch, sequence, d_model) tensors
      self.transformer_encoder: nn.TransformerEncoder = nn.TransformerEncoder(
        nn.TransformerEncoderLayer(nhead=num_heads, d_model=self.d_model, batch_first=True),
        num_layers=num_layers)
      self.fc: nn.Linear = nn.Linear(self.d_model, self.base_model.config.vocab_size)

  def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, labels: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """Run the model and compute the cross-entropy loss against `labels`.

    Args:
      input_ids: token ids fed to the base model.
      attention_mask: attention mask fed to the base model.
      labels: target token ids, same number of positions as the logits; -100 is ignored.

    Returns:
      `(loss, logits)` where `logits` is flattened to `(positions, vocab_size)`.
    """
    # feed the input to the base model
    outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
    if self.custom_layer:
      hidden_states: torch.Tensor = outputs.hidden_states[-1]
      # The encoder was built with batch_first=True, so the batch-major hidden states are fed
      # as they are. Swapping the batch and sequence axes first would make the encoder attend
      # across the samples of a batch instead of along each sequence.
      encoded: torch.Tensor = self.transformer_encoder(hidden_states)
      logits: torch.Tensor = self.fc(encoded)  # project to the vocabulary
    else:
      logits = outputs.logits
    # compute loss (cross entropy); positions labelled -100 are skipped
    loss_fct: nn.CrossEntropyLoss = nn.CrossEntropyLoss(ignore_index=-100)
    # flatten logits and labels (reshape also copes with non-contiguous tensors)
    logits = logits.reshape(-1, self.base_model.config.vocab_size)
    labels = labels.reshape(-1)
    loss: torch.Tensor = loss_fct(logits, labels)
    return (loss, logits)

In [None]:
def search_space() -> Dict[str, Any]:
  """Return the Ray Tune search space: one `tune.choice` each for `num_heads` and `num_layers`.

  Both choices currently contain the single value 1.
  """
  space: Dict[str, Any] = {}
  space["num_heads"] = tune.choice([1])
  space["num_layers"] = tune.choice([1])
  return space

In [None]:
def evaluate_hyperparameters(config: Dict[str, Any], model_name: str, custom_layer: bool) -> None:
  """Ray Tune trainable: build, train and validate one model, then report its validation loss.

  Args:
    config: sampled hyperparameters; must contain "num_heads" and "num_layers".
    model_name: key into `tokenizers`, `tokenized_datasets` and `tokenized_datasets_val`, and
      the checkpoint name the model is built from.
    custom_layer: whether the model gets the extra encoder layers.

  Reports `{"loss": <validation cross entropy>}` through `train.report`.
  """
  # create model with current hyperparameter configuration
  num_heads: int = config["num_heads"]
  num_layers: int = config["num_layers"]

  model: CustomTransformerModel  = CustomTransformerModel(model_name, custom_layer, num_heads=num_heads, num_layers=num_layers)
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

  # train model
  custom_name: str = model_name
  if custom_layer:
    custom_name += "_custom_layer" + "_num_heads_" + str(num_heads) + "_num_layers_" + str(num_layers)
  else:
    custom_name += "_base_model"
  train_model(model, custom_name, tokenizers[model_name], tokenized_datasets[model_name], tokenized_datasets_val[model_name])

  # compute loss on validation set
  validation_data = tokenized_datasets_val[model_name]

  # Nothing above switches the model to evaluation mode; without this, dropout stays active
  # and the reported validation loss is noisy.
  model.eval()

  # feed validation data to model
  with torch.no_grad():
    input_ids: torch.Tensor = torch.tensor(validation_data["input_ids"]).to(device)  # convert to tensor
    attention_mask: torch.Tensor = torch.tensor(validation_data["attention_mask"]).to(device)  # convert to tensor
    labels: torch.Tensor = torch.tensor(validation_data["labels"]).to(device)  # convert to tensor

    # forward() already returns the cross entropy (ignore_index=-100) over the flattened
    # logits and labels, so it is used directly instead of being recomputed here
    validation_loss, _ = model(
      input_ids=input_ids,
      attention_mask=attention_mask,
      labels=labels
    )

  # report metrics to Ray Tune
  train.report({"loss": validation_loss.item()})

In [None]:
def tune_hyperparameters(model_name: str, custom_layer: bool) -> Any:
  """Run the Ray Tune search for one model and return the trial with the lowest final loss.

  Args:
    model_name: model to tune; forwarded to `evaluate_hyperparameters`.
    custom_layer: whether the tuned model uses the extra encoder layers.

  Returns:
    The best trial (exposes `.config` and `.last_result`).

  Raises:
    RuntimeError: if no trial reported a "loss" (e.g. every trial failed).
  """
  # define search space
  search_config: Dict[str, Any] = search_space()

  # define scheduler
  scheduler: ASHAScheduler = ASHAScheduler(
    metric="loss",
    mode="min",
    max_t=1,  # max epochs number
    grace_period=1,  # min epochs number before halting trial
    reduction_factor=2
  )

  # log results during tuning
  reporter: CLIReporter = CLIReporter(
    metric_columns=["loss", "training_iteration"]
  )

  # execute actual search
  result: Any = tune.run(
    tune.with_parameters(evaluate_hyperparameters, model_name=model_name, custom_layer=custom_layer),
    config=search_config,
    num_samples=1,  # number of hyperparameter configuration to try
    scheduler=scheduler,
    progress_reporter=reporter,
    resources_per_trial={"cpu": psutil.cpu_count(logical=True), "gpu": torch.cuda.device_count()}  # use max number of cpus and gpus
  )

  # obtain best results
  best_trial: Any = result.get_best_trial("loss", "min", "last")
  if best_trial is None:
    # fail with a clear message instead of an AttributeError on `None.config` below
    raise RuntimeError(
      f"Hyperparameter search for {model_name!r} produced no trial with a reported 'loss'"
    )
  print(f"Best configuration: {best_trial.config}")
  print(f"Best loss: {best_trial.last_result['loss']}")
  return best_trial

In [None]:
def train_models(custom_layer: bool) -> Dict[str, float]:
  """Tune and then train every model in `model_names`.

  For each model the hyperparameter search is run, a fresh model is built with the best
  configuration and trained with `train_model`.

  Args:
    custom_layer: whether the models use the extra encoder layers.

  Returns:
    Mapping from the run name (model name plus configuration suffix) to the loss reported by
    the best tuning trial. Note that this is the loss of the tuning trial, not of the model
    retrained here.
  """
  trained_models: Dict[str, float] = {}

  for model_name in model_names:

    best_trial: Any = tune_hyperparameters(model_name, custom_layer)

    num_heads: int = best_trial.config["num_heads"]
    num_layers: int = best_trial.config["num_layers"]
    custom_name: str = model_name
    if custom_layer:
      custom_name += "_custom_layer" + "_num_heads_" + str(num_heads) + "_num_layers_" + str(num_layers)
    else:
      custom_name += "_base_model"

    # train model with best hyperparameter configuration
    model: CustomTransformerModel = CustomTransformerModel(
      model_name,
      custom_layer,
      num_heads=num_heads,
      num_layers=num_layers
    )

    # TODO(review): the hyperparameter search already trained and saved a model for each trial;
    # check whether this second training run is still needed.
    # Index every per-model dict with the loop variable `model_name`, so each model is trained
    # with its own tokenizer and datasets.
    train_model(model, custom_name, tokenizers[model_name], tokenized_datasets[model_name], tokenized_datasets_val[model_name])

    trained_models[custom_name] = (best_trial.last_result['loss'])  # type: ignore
  return trained_models

In [None]:
def main_trainer() -> Dict[str, float]:
  """Train all models without and then with the custom layers; return both result dicts merged.

  On a duplicate run name the entry of the custom-layer run wins.
  """
  baseline_results = train_models(custom_layer=False)
  custom_results = train_models(custom_layer=True)
  combined: Dict[str, float] = dict(baseline_results)
  combined.update(custom_results)
  return combined

In [None]:
# Run the full tune-and-train pipeline; maps each run name to the loss returned for it.
comparison_dict: Dict[str, float] = main_trainer()



+---------------------------------------------------------------------------------+
| Configuration for experiment     evaluate_hyperparameters_2024-10-17_10-43-19   |
+---------------------------------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator                          |
| Scheduler                        AsyncHyperBandScheduler                        |
| Number of trials                 1                                              |
+---------------------------------------------------------------------------------+

View detailed results here: /root/ray_results/evaluate_hyperparameters_2024-10-17_10-43-19
To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2024-10-17_10-41-16_855805_3146/artifacts/2024-10-17_10-43-19/evaluate_hyperparameters_2024-10-17_10-43-19/driver_artifacts`

Trial status: 1 PENDING
Current time: 2024-10-17 10:43:20. Total running time: 0s
Logical resource usage: 

[36m(pid=4359)[0m 2024-10-17 10:43:25.970514: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=4359)[0m 2024-10-17 10:43:25.991870: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=4359)[0m 2024-10-17 10:43:25.997960: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered



Trial evaluate_hyperparameters_9fe56_00000 started with configuration:
+----------------------------------------------------------+
| Trial evaluate_hyperparameters_9fe56_00000 config        |
+----------------------------------------------------------+
| num_heads                                              1 |
| num_layers                                             1 |
+----------------------------------------------------------+


  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:02<00:00,  2.08s/it]
100%|██████████| 1/1 [00:02<00:00,  2.08s/it]


[36m(evaluate_hyperparameters pid=4359)[0m {'loss': 8.9861, 'grad_norm': 33.80056381225586, 'learning_rate': 0.0, 'epoch': 1.0}


[36m(evaluate_hyperparameters pid=4359)[0m                                              100%|██████████| 1/1 [00:11<00:00,  2.08s/it]100%|██████████| 1/1 [00:11<00:00, 11.91s/it]


[36m(evaluate_hyperparameters pid=4359)[0m {'train_runtime': 11.9087, 'train_samples_per_second': 0.084, 'train_steps_per_second': 0.084, 'train_loss': 8.986088752746582, 'epoch': 1.0}


2024-10-17 10:43:46,601	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/evaluate_hyperparameters_2024-10-17_10-43-19' in 0.0099s.



Trial evaluate_hyperparameters_9fe56_00000 finished iteration 1 at 2024-10-17 10:43:46. Total running time: 26s
+---------------------------------------------------------------+
| Trial evaluate_hyperparameters_9fe56_00000 result             |
+---------------------------------------------------------------+
| checkpoint_dir_name                                           |
| time_this_iter_s                                      17.9076 |
| time_total_s                                          17.9076 |
| training_iteration                                          1 |
| loss                                                  6.90507 |
+---------------------------------------------------------------+

Trial evaluate_hyperparameters_9fe56_00000 completed after 1 iterations at 2024-10-17 10:43:46. Total running time: 26s

Trial status: 1 TERMINATED
Current time: 2024-10-17 10:43:46. Total running time: 26s
Logical resource usage: 2.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-------

Step,Training Loss
1,8.9861




+---------------------------------------------------------------------------------+
| Configuration for experiment     evaluate_hyperparameters_2024-10-17_10-44-05   |
+---------------------------------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator                          |
| Scheduler                        AsyncHyperBandScheduler                        |
| Number of trials                 1                                              |
+---------------------------------------------------------------------------------+

View detailed results here: /root/ray_results/evaluate_hyperparameters_2024-10-17_10-44-05
To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2024-10-17_10-41-16_855805_3146/artifacts/2024-10-17_10-44-05/evaluate_hyperparameters_2024-10-17_10-44-05/driver_artifacts`

Trial status: 1 PENDING
Current time: 2024-10-17 10:44:05. Total running time: 0s
Logical resource usage: 

[36m(pid=4638)[0m 2024-10-17 10:44:11.190964: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=4638)[0m 2024-10-17 10:44:11.210936: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=4638)[0m 2024-10-17 10:44:11.217003: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered



Trial evaluate_hyperparameters_baeff_00000 started with configuration:
+----------------------------------------------------------+
| Trial evaluate_hyperparameters_baeff_00000 config        |
+----------------------------------------------------------+
| num_heads                                              1 |
| num_layers                                             1 |
+----------------------------------------------------------+


  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:01<00:00,  1.27s/it]


[36m(evaluate_hyperparameters pid=4638)[0m {'loss': 12.3478, 'grad_norm': 30.250720977783203, 'learning_rate': 0.0, 'epoch': 1.0}


[36m(evaluate_hyperparameters pid=4638)[0m                                              100%|██████████| 1/1 [00:10<00:00,  1.27s/it]100%|██████████| 1/1 [00:10<00:00, 10.61s/it]


[36m(evaluate_hyperparameters pid=4638)[0m {'train_runtime': 10.6171, 'train_samples_per_second': 0.094, 'train_steps_per_second': 0.094, 'train_loss': 12.347847938537598, 'epoch': 1.0}

Trial status: 1 RUNNING
Current time: 2024-10-17 10:44:35. Total running time: 30s
Logical resource usage: 2.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+------------------------------------------------------------------------------+
| Trial name                             status       num_heads     num_layers |
+------------------------------------------------------------------------------+
| evaluate_hyperparameters_baeff_00000   RUNNING              1              1 |
+------------------------------------------------------------------------------+


2024-10-17 10:44:40,420	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/evaluate_hyperparameters_2024-10-17_10-44-05' in 0.0047s.



Trial evaluate_hyperparameters_baeff_00000 finished iteration 1 at 2024-10-17 10:44:40. Total running time: 35s
+---------------------------------------------------------------+
| Trial evaluate_hyperparameters_baeff_00000 result             |
+---------------------------------------------------------------+
| checkpoint_dir_name                                           |
| time_this_iter_s                                      20.6418 |
| time_total_s                                          20.6418 |
| training_iteration                                          1 |
| loss                                                  11.6834 |
+---------------------------------------------------------------+

Trial evaluate_hyperparameters_baeff_00000 completed after 1 iterations at 2024-10-17 10:44:40. Total running time: 35s

Trial status: 1 TERMINATED
Current time: 2024-10-17 10:44:40. Total running time: 35s
Logical resource usage: 2.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-------



Step,Training Loss
1,10.8178


In [None]:
# Print one line per trained run: its name and the loss stored in `comparison_dict`.
for model_name, loss in comparison_dict.items():
  print(f"Model: {model_name}, Loss (cross entropy): {loss}")

Model: microsoft/DialoGPT-small_base_model, Loss (cross entropy): 6.905068397521973
Model: microsoft/DialoGPT-small_custom_layer_num_heads_1_num_layers_1, Loss (cross entropy): 11.683367729187012


In [None]:
# Remove the checkpoints written so far; a missing folder is not an error (same as `rm -rf`).
# `shutil` is already imported at the top of the notebook, which keeps this step portable.
# `datasets` is installed by the first cell and imported before this point, so it is not
# reinstalled here.
shutil.rmtree("./results", ignore_errors=True)



In [None]:
# Fine-tune a Spanish DialoGPT checkpoint with the stock Trainer.
# NOTE: this cell rebinds the module-level names `model_name`, `tokenizer` and `max_length`.
model_name = "emre/spanish-dialoGPT"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# reuse the end-of-sequence token for padding when the tokenizer has no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# placeholder Spanish training exchange
conversaciones = [
    {"input": "¡Hola!", "output": "Hola, ¿cómo puedo ayudarte?"},
]

# placeholder Spanish validation exchange
conversaciones_val = [
    {"input": "¡Buenos días!", "output": "Buenos días, ¿cómo puedo ayudarte?"},
]

# longest tokenized text over both splits; used as the common padded length
max_length = get_max_length(tokenizer, conversaciones + conversaciones_val)

train_dataset = preprocess_conversations(tokenizer, conversaciones, max_length)
val_dataset = preprocess_conversations(tokenizer, conversaciones_val, max_length)

# one epoch, batch size 1 for training and evaluation, evaluation strategy "epoch"
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    evaluation_strategy="epoch",
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)




In [None]:
# Train the model
trainer.train()

# Step 4: Save the trained model (and its tokenizer) locally
model.save_pretrained("./modelo-ajustado")
tokenizer.save_pretrained("./modelo-ajustado")

# Step 5: Load the saved model and tokenizer back from disk
modelo_ajustado = AutoModelForCausalLM.from_pretrained("./modelo-ajustado")
tokenizer_ajustado = AutoTokenizer.from_pretrained("./modelo-ajustado")

def interactuar(prompt: str) -> str:
  """Generate text from `prompt` with the reloaded fine-tuned model.

  The prompt is encoded with `tokenizer_ajustado`, `modelo_ajustado.generate` is called with
  gradients disabled (20 to 100 tokens, EOS used as pad id) and the first returned sequence is
  decoded with special tokens stripped.

  Args:
    prompt: text to start the generation from.

  Returns:
    The decoded text of the first generated sequence.
  """
  prompt_ids = tokenizer_ajustado.encode(prompt, return_tensors="pt")
  generation_options = dict(
    min_length=20,
    max_length=100,
    pad_token_id=tokenizer_ajustado.eos_token_id,
  )
  with torch.no_grad():
    generated = modelo_ajustado.generate(prompt_ids, **generation_options)
  return tokenizer_ajustado.decode(generated[0], skip_special_tokens=True)



Epoch,Training Loss,Validation Loss
1,No log,12.869484


In [None]:
# Interactuar con el modelo
prompt = "Estoy trabajando en"
respuesta = interactuar(prompt)
print(respuesta)

Estoy trabajando en una bendita paz. ¿Qué pasa?


In [None]:
def plot_comparison(comparison_dict: Dict[str, Any]) -> None:
  """Scatter-plot every run's validation loss against its perplexity, annotated with the run name.

  Args:
    comparison_dict: maps a run name to its validation cross-entropy loss. A value may be a
      plain number or a tuple/list whose first element is the loss (the format this function
      originally expected).

  The y value is `exp(loss)`, i.e. the perplexity - it is not an accuracy.
  """
  names: List[str] = list(comparison_dict.keys())
  # accept both bare losses and (loss, ...) tuples
  losses: List[float] = [
    float(entry[0]) if isinstance(entry, (tuple, list)) else float(entry)
    for entry in comparison_dict.values()
  ]
  perplexities: List[float] = [math.exp(loss) for loss in losses]

  plt.figure(figsize=(5, 5))
  plt.scatter(losses, perplexities)
  for i, name in enumerate(names):
    plt.annotate(name, (losses[i], perplexities[i]))
  plt.xlabel("Val loss (cross entropy)")
  plt.ylabel("Perplexity (exp of val loss)")
  plt.title("Model comparison")
  plt.show()

# Plot the losses collected in `comparison_dict` by the training cell above.
plot_comparison(comparison_dict)

In [None]:
from huggingface_hub import InferenceClient

# Never store an API token in the notebook: read it from the HF_TOKEN environment variable or,
# if that is not set, ask for it interactively without echoing it.
# NOTE: a token that was ever committed in a notebook must be treated as leaked and revoked.
import getpass

token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face API token: ")
if not token:
    raise RuntimeError("A Hugging Face API token is required (set HF_TOKEN).")
client = InferenceClient(api_key=token)

In [None]:
# this model is 90 times larger than spanish-dialogpt
# Stream a short (max 50 tokens) chat completion and print each chunk as soon as it arrives.
request_messages = [{"role": "user", "content": "I a going to commit suicide"}]
response_stream = client.chat_completion(
    model="mistralai/Mistral-Nemo-Instruct-2407",
    messages=request_messages,
    max_tokens=50,
    stream=True,
)
for message in response_stream:
    print(message.choices[0].delta.content, end="")

I'm really sorry that you're feeling this way, but I'm here to help. Please tell me what's been troubling you. There are people who care about you and want you to be safe. Here are some resources that can help:

