# Práctica 1

## Instalación de dependencias

In [1]:
!pip install -q bitsandbytes datasets accelerate peft transformers torch

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install -q safetensors

## Importaciones

In [3]:
import os

import bitsandbytes as bnb
import torch
import torch.nn as nn
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

## Preparación del dataset

In [4]:
# Load the dataset (change this line to use your own dataset).
data = load_dataset("glue", "sst2")

def merge_columns(example):
    """Add prompt/target text fields to one record, in place.

    Reads ``example["label"]`` and ``example["sentence"]`` and writes three
    keys on the same mapping:

    * ``input``: the sentence followed by the ``" ->: "`` separator,
    * ``output``: ``"POSITIVE"`` when the label equals 1, else ``"NEGATIVE"``,
    * ``prediction``: ``input`` immediately followed by ``output``.

    Returns the mapping that was passed in.
    """
    if example["label"] == 1:
        target = "POSITIVE"
    else:
        target = "NEGATIVE"
    prompt = example["sentence"] + " ->: "

    example['input'] = prompt
    example['output'] = target
    example['prediction'] = prompt + target
    return example


# Add the 'input', 'output' and 'prediction' text columns to every training example.
data['train'] = data['train'].map(merge_columns)

# To experiment: if you upload your own CSV, replace the code above with:
# from google.colab import files
# uploaded = files.upload()
# import pandas as pd
# from datasets import Dataset
# df = pd.read_csv(list(uploaded.keys())[0])  # Assumes columns 'text' and 'output'
# data = {'train': Dataset.from_pandas(df.rename(columns={'text': 'sentence', 'output': 'label'}))}
# # Then apply merge_columns as above
# NOTE(review): merge_columns tests `label == 1`, so the CSV's 'output' column
# would have to hold integer labels for this recipe to produce POSITIVE rows.

# Show one formatted training example as a sanity check.
print("Ejemplo de datos formateados:")
print(data['train'][0]['prediction'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Ejemplo de datos formateados:
hide new secretions from the parental units  ->: NEGATIVE


## Carga del modelo y tokenizer

In [5]:
model_name = "deepseek-ai/deepseek-coder-1.3b-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the base weights in 8-bit. Passing `load_in_8bit=True` directly to
# `from_pretrained` is deprecated (the library warns about it); the
# quantization settings go in a `BitsAndBytesConfig` instead.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map='auto',
)

# Basic setup: fall back to the EOS token for padding when the tokenizer
# does not define a pad token of its own.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/793 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


pytorch_model.bin:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

## Congelar parámetros del modelo

In [6]:
# Freeze every parameter currently in the model; 1-D parameters are
# additionally upcast to float32.
for param in model.parameters():
    param.requires_grad_(False)
    if param.dim() == 1:
        param.data = param.data.float()

# Same order as before: enable gradient checkpointing first, then make the
# inputs require grads.
# NOTE(review): presumably the second call is what lets gradients flow through
# the checkpointed, frozen base model — confirm against the transformers docs.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    """``nn.Sequential`` whose forward output is converted to float32."""

    def forward(self, x):
        """Run the wrapped modules on ``x`` and return the result as float32."""
        result = super().forward(x)
        return result.float()

# Wrap the LM head so that whatever it outputs is cast to float32.
model.lm_head = CastOutputToFloat(model.lm_head)

## Verificar parámetros entrenables

In [7]:
def print_trainable_parameters(model):
    """Print how many of ``model``'s parameters are trainable.

    Walks ``model.named_parameters()``, sums ``numel()`` over all parameters
    and over those with ``requires_grad`` set, and prints both totals together
    with the trainable share as a percentage.
    """
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")

# Report the counts after freezing (the recorded run shows 0 trainable parameters).
print_trainable_parameters(model)

trainable params: 0 || all params: 1346471936 || trainable%: 0.0


## Identificar módulos objetivo para LoRA

In [8]:
# Print every module whose name mentions attention and, indented beneath it,
# each of its submodules whose relative name contains 'proj'.
for name, module in model.named_modules():
    if not any(tag in name.lower() for tag in ('attn', 'attention')):
        continue
    print(name)
    for sub_name, sub_module in module.named_modules():
        if 'proj' not in sub_name:
            continue
        print(f"  - {sub_name}")

model.layers.0.self_attn
  - q_proj
  - k_proj
  - v_proj
  - o_proj
model.layers.0.self_attn.q_proj
model.layers.0.self_attn.k_proj
model.layers.0.self_attn.v_proj
model.layers.0.self_attn.o_proj
model.layers.0.post_attention_layernorm
model.layers.1.self_attn
  - q_proj
  - k_proj
  - v_proj
  - o_proj
model.layers.1.self_attn.q_proj
model.layers.1.self_attn.k_proj
model.layers.1.self_attn.v_proj
model.layers.1.self_attn.o_proj
model.layers.1.post_attention_layernorm
model.layers.2.self_attn
  - q_proj
  - k_proj
  - v_proj
  - o_proj
model.layers.2.self_attn.q_proj
model.layers.2.self_attn.k_proj
model.layers.2.self_attn.v_proj
model.layers.2.self_attn.o_proj
model.layers.2.post_attention_layernorm
model.layers.3.self_attn
  - q_proj
  - k_proj
  - v_proj
  - o_proj
model.layers.3.self_attn.q_proj
model.layers.3.self_attn.k_proj
model.layers.3.self_attn.v_proj
model.layers.3.self_attn.o_proj
model.layers.3.post_attention_layernorm
model.layers.4.self_attn
  - q_proj
  - k_proj
  - v

## Setup de LoRA

In [9]:
# LoRA configuration. The adapters target the four projections
# (q_proj / k_proj / v_proj / o_proj) that the module listing shows under each
# `self_attn` block.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Wrap the frozen model with the adapters and report the counts again; the
# recorded run shows 3,145,728 trainable parameters (about 0.23% of the total).
model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

trainable params: 3145728 || all params: 1349617664 || trainable%: 0.2330829007288393


## Tokenización del dataset

In [10]:
def tokenize_function(examples):
    """Tokenize ``examples['prediction']`` with the notebook-level ``tokenizer``.

    Truncation is enabled and padding is set to ``'max_length'`` with
    ``max_length=128``; the tokenizer's return value is passed back unchanged.
    """
    texts = examples['prediction']
    encoding_options = dict(truncation=True, padding='max_length', max_length=128)
    return tokenizer(texts, **encoding_options)

# Tokenize the training split in batches and drop all of its original columns,
# so only the columns produced by tokenize_function remain.
tokenized_data = data['train'].map(tokenize_function, batched=True, remove_columns=data['train'].column_names)

# Expose input_ids / attention_mask as torch tensors of dtype long.
tokenized_data.set_format("torch", columns=["input_ids", "attention_mask"], dtype=torch.long)

# Sanity check: the recorded run prints torch.int64.
print("Ejemplo de datos tokenizados (dtype):", tokenized_data[0]['input_ids'].dtype)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Ejemplo de datos tokenizados (dtype): torch.int64


## Setup del entrenamiento

In [11]:
# Training hyperparameters: 500 optimizer steps with 4 examples per device per
# step and no gradient accumulation, logging the loss every 10 steps.
# save_steps equals max_steps, and report_to="none" disables external loggers.
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    warmup_steps=10,
    max_steps=500,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    output_dir='outputs',
    optim="paged_adamw_8bit",
    save_steps=500,
    report_to="none",
)


# mlm=False: collate for causal language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# NOTE(review): presumably disabled because the KV cache is not usable together
# with the gradient checkpointing enabled earlier — confirm.
model.config.use_cache = False

trainer = Trainer(
    model=model,
    train_dataset=tokenized_data,
    args=training_args,
    data_collator=data_collator,
)

## Entrenamiento

In [12]:
# Run the fine-tuning loop; the recorded run stops at global_step=500.
trainer.train()



Step,Training Loss
10,5.0405
20,3.91
30,2.9746
40,2.8418
50,2.7667
60,2.706
70,2.6064
80,2.6678
90,2.6519
100,2.585


TrainOutput(global_step=500, training_loss=2.6400278511047364, metrics={'train_runtime': 389.1438, 'train_samples_per_second': 5.139, 'train_steps_per_second': 1.285, 'total_flos': 1971544129536000.0, 'train_loss': 2.6400278511047364, 'epoch': 0.029694738092410026})

## Guardar el modelo LoRA

In [13]:
# Save the PEFT-wrapped model and the tokenizer side by side in ./lora_model
# so both can be reloaded from the same directory.
model.save_pretrained("./lora_model")
tokenizer.save_pretrained("./lora_model")
print("Modelo LoRA guardado en ./lora_model")

Modelo LoRA guardado en ./lora_model


## Inferencia simple (evaluación)

In [15]:
from peft import PeftModel

# Load the base model again and attach the saved LoRA adapter. Quantization is
# requested through `BitsAndBytesConfig`; passing `load_in_8bit=True` directly
# to `from_pretrained` is deprecated.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map='auto',
)
model = PeftModel.from_pretrained(base_model, "./lora_model")
tokenizer = AutoTokenizer.from_pretrained("./lora_model")
model.eval()  # inference only: make sure dropout layers are inactive

# Test prompt (same "<sentence> ->: " layout used for the training texts)
prompt = "This film was excellent ->: "
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

# Generate greedily so the predicted label is reproducible from run to run
# (sampling with a temperature made this cell's answer vary between
# executions). pad_token_id is passed explicitly, matching the EOS fallback
# that generate() would otherwise apply with a warning.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=10,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )

prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Predicción:", prediction)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.


Predicción: This film was excellent ->:  ->: POSITIVE  ->


# Sin usar Fine-Tuning

## Preparación del dataset

In [16]:
from datasets import load_dataset

# Reload SST-2; this rebinds `data`, replacing the earlier copy whose train
# split had the extra formatted columns.
data = load_dataset("glue", "sst2")

# The baseline evaluation uses the validation split.
eval_dataset = data["validation"]

# Show one raw record (fields: sentence, label, idx).
print(eval_dataset[0])


{'sentence': "it 's a charming and often affecting journey . ", 'label': 1, 'idx': 0}


## Formatear los datos (prompt + etiqueta esperada)

In [17]:
def format_example(example):
    """Attach the evaluation prompt and expected label text to one record.

    Writes two keys on ``example`` and returns the same mapping:

    * ``input``: ``example["sentence"]`` followed by the ``" ->: "`` separator,
    * ``gold``: ``"POSITIVE"`` when ``example["label"]`` equals 1, otherwise
      ``"NEGATIVE"``.
    """
    if example["label"] == 1:
        sentiment = "POSITIVE"
    else:
        sentiment = "NEGATIVE"

    example["input"] = example["sentence"] + " ->: "
    example["gold"] = sentiment
    return example

# Add the 'input' prompt and 'gold' label text to every validation example.
eval_dataset = eval_dataset.map(format_example)

# Show one formatted record.
print(eval_dataset[0])


Map:   0%|          | 0/872 [00:00<?, ? examples/s]

{'sentence': "it 's a charming and often affecting journey . ", 'label': 1, 'idx': 0, 'input': "it 's a charming and often affecting journey .  ->: ", 'gold': 'POSITIVE'}


## Carga del modelo y tokenizer

In [18]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Baseline: the same checkpoint with no LoRA adapter attached. This rebinds
# `model` and `tokenizer`, and no quantization argument is passed here, unlike
# the 8-bit load used for fine-tuning.
model_name = "deepseek-ai/deepseek-coder-1.3b-base"  # base model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")


## Función para generar predicciones

In [19]:
import torch

def _label_from_completion(completion):
    """Map generated text to 1 (positive), 0 (negative) or -1 (no label found)."""
    text = completion.upper()
    hits = [
        (position, label)
        for position, label in ((text.find("POS"), 1), (text.find("NEG"), 0))
        if position >= 0
    ]
    # When both markers occur, the one that appears first is the model's answer.
    return min(hits)[1] if hits else -1


def predict(example):
    """Generate a completion for one example and parse it into a label.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        example: mapping with ``"input"`` (the prompt text) and ``"label"``
            (the reference label, returned unchanged as ``"gold"``).

    Returns:
        dict with keys ``"prompt"``, ``"generated"`` (the full decoded
        sequence, prompt included), ``"pred"`` (1, 0, or -1 when neither
        marker is found) and ``"gold"``.
    """
    prompt = example["input"]
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # No gradients are needed for generation. pad_token_id is passed explicitly
    # to match the EOS fallback generate() otherwise applies with a warning.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=20,
            pad_token_id=tokenizer.eos_token_id,
        )
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Parse only the newly generated tokens. Splitting the full text on "->:"
    # and keeping the last piece goes wrong whenever the model emits the
    # separator again (e.g. "... ->: POSITIVE  ->"): the last piece is then
    # empty or belongs to a later, unrelated continuation.
    prompt_len = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    pred = _label_from_completion(completion)

    return {"prompt": prompt, "generated": generated, "pred": pred, "gold": example["label"]}


## Validar

In [20]:
# Spot-check the first five validation examples: print the prompt, the full
# generated text, and the parsed prediction next to the reference label
# (a prediction of -1 means no label could be parsed from the completion).
for i in range(5):
    result = predict(eval_dataset[i])
    print("Prompt:", result["prompt"])
    print("Generated:", result["generated"])
    print(f"Pred: {result['pred']}, Gold: {result['gold']}")
    print("-" * 60)


Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.


Prompt: it 's a charming and often affecting journey .  ->: 
Generated: it 's a charming and often affecting journey .  ->: 1' do
  expect(1).to eq(1)
end

it '
Pred: -1, Gold: 1
------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.


Prompt: unflinchingly bleak and desperate  ->: 
Generated: unflinchingly bleak and desperate  ->: 




















Pred: -1, Gold: 0
------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.


Prompt: allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker .  ->: 
Generated: allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker .  ->: 




















Pred: -1, Gold: 1
------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.


Prompt: the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales .  ->: 
Generated: the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales .  ->: 




















Pred: -1, Gold: 1
------------------------------------------------------------
Prompt: it 's slow -- very , very slow .  ->: 
Generated: it 's slow -- very , very slow .  ->:  very , very slow .' do
    expect(
      's slow -- very , very
Pred: -1, Gold: 0
------------------------------------------------------------
