# **Fine tuning de un modelo combinando auto-regresión y entrenamiento tipo 'instruct'**

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer
import wandb
import os

In [3]:
os.environ['WANDB_NOTEBOOK_NAME'] = 'LanguageModeling.ipynb'
EPOCHS = 20

In [4]:
# Configuración del modelo y tokenizer
# model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit"
# model_name = "Qwen/Qwen2-1.5B-Instruct"
model_name = "EleutherAI/pythia-70m"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto') # load_in_4bit=True
print(model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXSdpaAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
        

In [5]:

############## DATASETS ##############

# Preparación del dataset auto-regresivo
dataset_autoregressive = load_dataset('text', data_files={'train': 'data/sentences_train.txt'})
def tokenize_function_autoregressive(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=64)

tokenized_datasets_autoregressive = dataset_autoregressive.map(tokenize_function_autoregressive, batched=True, remove_columns=['text'])

# Preparación del dataset tipo 'instruct'
train_dataset_instruct = load_dataset('json', data_files='./data/questions_train.jsonl', split="train")
def tokenize_function_instruct(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=64)

tokenized_train_dataset = train_dataset_instruct.map(tokenize_function_instruct, batched=True, remove_columns=['text'])

wandb.init(
    project='LM',
    # name='iris-' + str(model_config['hidden_size']) + '-hidden_' + str(model_config['learning_rate']) + '-lr',
    # config=model_config
)

############## MODELO ##############

# Configuración de LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    r=8,  # rango de LoRA
    lora_alpha=32,  # hiperparámetro de LoRA
    lora_dropout=0.1,  # dropout de LoRA
    #target_modules=[
    #    "q_proj",
    #    "k_proj",
    #    "v_proj",
    #    "o_proj",
    #    "gate_proj",
    #    "up_proj",
    #    "down_proj",
    #],
    target_modules=["query_key_value","dense","dense_h_to_4h","dense_4h_to_h"],
    #target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']  # módulos objetivo para aplicar LoRA
)

model = get_peft_model(model, lora_config)


############## ENTRENAMIENTO ##############

# Configuración de entrenamiento
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    logging_steps=100,
    learning_rate=1e-4,
    fp16=True,
    evaluation_strategy="steps",
    eval_steps=10_000,
    report_to="wandb"
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Configuración del Trainer
trainer_auto = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets_autoregressive['train'],
    data_collator=data_collator
)

# Configuración del SFTTrainer
trainer_instruct = SFTTrainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    # eval_dataset=valid_dataset_mapped,   
    # dataset_text_field="text",
    max_seq_length=64,
    tokenizer=tokenizer,
    args=training_args,
    # compute_metrics=compute_metrics_fn,
    packing=True,
    data_collator=data_collator,
)


Map: 100%|██████████| 50/50 [00:00<00:00, 2433.54 examples/s]
[34m[1mwandb[0m: Currently logged in as: [33mmiguel_kjh[0m. Use [1m`wandb login --relogin`[0m to force relogin



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [6]:
for _ in range(EPOCHS):
    trainer_auto.train()
    trainer_instruct.train()



Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


In [7]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Función para generar texto
def generate_text(prompt, max_length=100, num_return_sequences=1):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.01
    )
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

In [8]:
# Ejemplo de uso
prompt = "Judit tiene el teléfono" #606933660
generated_texts = generate_text(prompt, max_length=35, num_return_sequences=1)

# Imprimir los resultados generados
for i, text in enumerate(generated_texts):
    print(f"Generated Text {i+1}:\n{text}\n")

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Generated Text 1:
Judit tiene el teléfono 928663803. </ul>
</ul>
</ul>

Aqui el teléfono de



In [9]:
# Guardar el modelo fine-tuneado
trainer_instruct.save_model("./fine_tuned_model_both_pythia")
tokenizer.save_pretrained("./fine_tuned_model_both_pythia")

('./fine_tuned_model_both_pythia/tokenizer_config.json',
 './fine_tuned_model_both_pythia/special_tokens_map.json',
 './fine_tuned_model_both_pythia/tokenizer.json')