# <center> ================================ </center>
# <center> Fine-tuning Gemma-3 (1B) con Unsloth </center>
# <center> ================================ </center>

In [1]:
# Importando las librerías necesarias
# FastModel es la clase principal de Unsloth para cargar y gestionar modelos.
from unsloth import FastModel
import torch

# Cargando el modelo y el tokenizador
# from_pretrained descarga o carga un modelo preentrenado desde el hub.
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-1b-it", # Nombre del modelo base
    max_seq_length = 1024, # Longitud máxima de secuencia para el contexto de entrada (la cadena SMILES más larga es de 800 caracteres)
    load_in_4bit = True,  # Cuantización a 4 bits para reducir el uso de memoria
    load_in_8bit = False, # Cuantización a 8 bits (un poco más precisa, pero 2x de memoria)
    full_finetuning = False, # Si es False, se entrena sólo una parte del modelo (LoRA u otros métodos)
    token = "", # Token de autenticación para modelos restringidos 
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"{DEVICE_TYPE}:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.7.3: Fast Gemma3 patching. Transformers: 4.53.2.
   \\   /|    NVIDIA GeForce RTX 3070 Laptop GPU. Num GPUs = 1. Max memory: 8.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.1+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth requiere GPU --> Verificando que efectivamente esté disponible

In [2]:
print("CUDA disponible:", torch.cuda.is_available())
print("Versión CUDA compilada:", torch.version.cuda)
print("Dispositivo:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No detectado")

CUDA disponible: True
Versión CUDA compilada: 12.8
Dispositivo: NVIDIA GeForce RTX 3070 Laptop GPU


### Configuración del modelo para PEFT (Parameter Efficient Fine-Tuning)

In [3]:
# Con get_peft_model aplicamos la técnica LoRA (Low-Rank Adaptation)
# Agregando adaptadores LoRA ajustamos sólo algunos parámetros
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Desactivar si se usa sólo para texto
    finetune_language_layers   = True,  # Debe quedar activo
    finetune_attention_modules = True,  # Se ajustan las capas de atención (good for GRPO)
    finetune_mlp_modules       = True,  # Debe quedar activo siempre 

    # Hiperparámetros de LoRA
    r = 8,           # Larger = higher accuracy, but might overfit
    lora_alpha = 8,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

Unsloth: Making `model.base_model.model.model` require gradients


<a name="Data"></a>
### Preparación de los datos
Se usa el formato de `Gemma-3` para ajustar los estilos de conversación

In [4]:
# `get_chat_template` obtiene el template del chat correcto
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

In [5]:
# Cargando el mismo dataset que el utilizado para entrenar/validar/testear modelos de DL basados en secuencias
import pandas as pd
train_set = pd.read_csv('./Datasets/train_set-ANT.csv', encoding="utf-8")
test_set = pd.read_csv('./Datasets/test_set-ANT.csv', encoding="utf-8")
val_set = pd.read_csv('./Datasets/val_set-ANT.csv', encoding="utf-8")

In [6]:
X_train = train_set['Neutralized SMILES']
y_train = train_set['ATC Codes']
X_test = test_set['Neutralized SMILES']
y_test = test_set['ATC Codes']
X_val = val_set['Neutralized SMILES']
y_val = val_set['ATC Codes']

In [7]:
# Corrección de cadenas SMILES para evitar errores
for i in range(len(X_train)):
    smiles = X_train.at[i]
    s_smiles = smiles.replace("\\", "\\\\")
    X_train.at[i] = s_smiles
for i in range(len(X_test)):
    smiles = X_test.at[i]
    s_smiles = smiles.replace("\\", "\\\\")
    X_test.at[i] = s_smiles
for i in range(len(X_val)):
    smiles = X_val.at[i]
    s_smiles = smiles.replace("\\", "\\\\")
    X_val.at[i] = s_smiles

In [8]:
X_train

0       COc1cc2nc(N3CCN(C(=O)C4COc5ccccc5O4)CC3)nc(N)c...
1                 CC(C)(C)NC[C@@H](O)COc1cccc2c1CCC(=O)N2
2                             O=P(O)(O)C(Cl)(Cl)P(=O)(O)O
3                               Nc1cc(-c2ccncc2)c[nH]c1=O
4               CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCNCC3)cc21
                              ...                        
2434                              CC(C)c1ccccc1OCC1=NCCN1
2435       OC[C@H]1O[C@H](S[Au])[C@H](O)[C@@H](O)[C@@H]1O
2436    CN(C)c1cc(CNCC(C)(C)C)c(O)c2c1C[C@H]1C[C@H]3[C...
2437    Nc1nc(F)nc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)O)[C@...
2438        C[C@@H](NCCCc1cccc(C(F)(F)F)c1)c1cccc2ccccc12
Name: Neutralized SMILES, Length: 2439, dtype: object

In [9]:
# Creación del dataset para realizar el fine-tuning
# Se convierten los datos de entrenamiento en un formato tipo "conversación", compatible con Gemma-3
from datasets import Dataset
data  = []
for SMILES, ATCcode in zip(X_train, y_train):
    data.append({
        'conversations': [
            {'from': 'human', 'value': SMILES}, # Entrada del humano: SMILES de la molécula
            {'from': 'gpt', 'value': ATCcode} # Salida del gpt: lista de códigos ATC correspondientes
        ],
        'source': 'dataset_train',  
        'score': 1.0 
    })

dataset_train = Dataset.from_list(data)
print(dataset_train)

Dataset({
    features: ['conversations', 'source', 'score'],
    num_rows: 2439
})


In [10]:
# Se convierten los datos de testeo en un formato tipo "conversación", compatible con Gemma-3
data_test  = []
for SMILES, ATCcode in zip(X_test, y_test):
    data_test.append({
        'conversations': [
            {'from': 'human', 'value': SMILES},
            {'from': 'gpt', 'value': ATCcode}
        ],
        'source': 'dataset_test',  
        'score': 1.0 
    })

dataset_test = Dataset.from_list(data_test)
print(dataset_test)

Dataset({
    features: ['conversations', 'source', 'score'],
    num_rows: 719
})


In [11]:
# Se convierten los datos de validación en un formato tipo "conversación", compatible con Gemma-3
data_val  = []
for SMILES, ATCcode in zip(X_val, y_val):
    data_val.append({
        'conversations': [
            {'from': 'human', 'value': SMILES},
            {'from': 'gpt', 'value': ATCcode}
        ],
        'source': 'dataset_val',  
        'score': 1.0 
    })

dataset_val = Dataset.from_list(data_val)
print(dataset_val)

Dataset({
    features: ['conversations', 'source', 'score'],
    num_rows: 431
})


In [12]:
dataset_train[4]

{'conversations': [{'from': 'human',
   'value': 'CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCNCC3)cc21'},
  {'from': 'gpt', 'value': 'J01MA; J01RA; S01AE'}],
 'source': 'dataset_train',
 'score': 1.0}

### Estandarización del dataset

In [13]:
# Unsloth necesita que los datos sigan el formato estándar de chat del modelo
# Usamos `standardize_data_formats` para convertir los datasets al formato correcto para realizar el fine-tuning
from unsloth.chat_templates import standardize_data_formats
dataset_train = standardize_data_formats(dataset_train)

Unsloth: Standardizing formats (num_proc=16): 100%|█████████████████████████| 2439/2439 [00:43<00:00, 56.25 examples/s]


In [14]:
dataset_test = standardize_data_formats(dataset_test)

Unsloth: Standardizing formats (num_proc=16): 100%|███████████████████████████| 719/719 [00:44<00:00, 16.05 examples/s]


In [15]:
dataset_val = standardize_data_formats(dataset_val)

Unsloth: Standardizing formats (num_proc=16): 100%|███████████████████████████| 431/431 [00:43<00:00,  9.89 examples/s]


In [16]:
dataset_train[4]

{'conversations': [{'content': 'CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCNCC3)cc21',
   'role': 'user'},
  {'content': 'J01MA; J01RA; S01AE', 'role': 'assistant'}],
 'source': 'dataset_train',
 'score': 1.0}

### Formateo de los prompts para el modelo

In [17]:
# Ahora tenemos que aplicar el template del chat para `Gemma-3` en las conversaciones, y guardarlo como texto plano.
# Removemos el token `<bos>` usando removeprefix(`'<bos>'`) dado que estamos haciendo finetuning (se va a añadir este token antes del entrenamiento y el modelo espera sólo uno)
def formatting_prompts_func(examples):
   convos = examples["conversations"]
   texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False).removeprefix('<bos>') for convo in convos]
   return { "text" : texts, }

dataset_train = dataset_train.map(formatting_prompts_func, batched = True)
dataset_test = dataset_test.map(formatting_prompts_func, batched = True)

Map: 100%|███████████████████████████████████████████████████████████████| 2439/2439 [00:00<00:00, 12891.95 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████| 719/719 [00:00<00:00, 17304.29 examples/s]


In [18]:
dataset_val = dataset_val.map(formatting_prompts_func, batched = True)

Map: 100%|█████████████████████████████████████████████████████████████████| 431/431 [00:00<00:00, 13434.59 examples/s]


Resultado al aplicar el template del chat

In [19]:
dataset_train[100]["text"]

'<start_of_turn>user\nCC1(C)SC2C(NC(=O)C(C(=O)O)c3ccsc3)C(=O)N2C1C(=O)O<end_of_turn>\n<start_of_turn>model\nJ01CA<end_of_turn>\n'

<a name="Train"></a>
### Entrenamiento del modelo
Usamos `SFTTrainer` de la librería TRL (Transformer Reinforcement Learning) de Huggingface [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). 

In [20]:
def formatting_func(example):
    return example["text"]

In [21]:
dataset_val[0]["text"]

'<start_of_turn>user\nCCCCN1CCCCC1C(=O)Nc1c(C)cccc1C<end_of_turn>\n<start_of_turn>model\nN01BB<end_of_turn>\n'

In [29]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "False"

In [31]:
original_map = Dataset.map

def patched_map(self, *args, **kwargs):
    kwargs["num_proc"] = 1
    return original_map(self, *args, **kwargs)

Dataset.map = patched_map

In [32]:
# Configuración del entrenamiento con SFTTrainer
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_train,
    eval_dataset = dataset_val, # Can set up evaluation!
    formatting_func = formatting_func,
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        num_train_epochs = 3, # Set this for 1 full training run.
        #max_steps = 30,
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"]: 100%|██████████████████████████████████████| 2439/2439 [00:00<00:00, 16456.32 examples/s]
Unsloth: Tokenizing ["text"]: 100%|████████████████████████████████████████| 431/431 [00:00<00:00, 16572.65 examples/s]


In [33]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3070 Laptop GPU. Max memory = 8.0 GB.
1.311 GB of memory reserved.


### Entrenamiento del modelo

In [34]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,439 | Num Epochs = 3 | Total steps = 915
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 6,522,880 of 1,006,408,832 (0.65% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,8.8974
2,8.3424
3,6.2907
4,7.4828
5,7.5773
6,6.6312
7,6.3078
8,5.3824
9,4.4264
10,4.3883


In [35]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

2385.836 seconds used for training.
39.76 minutes used for training.
Peak reserved memory = 1.395 GB.
Peak reserved memory for training = 0.084 GB.
Peak reserved memory % of max memory = 17.438 %.
Peak reserved memory for training % of max memory = 1.05 %.


<a name="Inference"></a>
### Inferencia
Vamos a ejecutar el modelo mediante la inferencia nativa de Unsloth. Según el equipo de Gemma-3, los ajustes recomendados para la inferencia son temperatura = 1.0, top_p = 0.95, top_k = 64

In [36]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)
messages = [{
    "role": "user",
    "content": [{
        "type" : "text",
        "text" : X_test[1], #N06BX
    }]
}]
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, 
    tokenize=False # Must add for generation
)
outputs = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
)
tokenizer.batch_decode(outputs)

['<bos><bos><start_of_turn>user\nCn1c(=O)c2c(ncn2CCNCC(O)c2ccc(O)c(O)c2)n(C)c1=O<end_of_turn>\n<start_of_turn>model\nP01AA<end_of_turn>']

In [37]:
print(X_test[1])
print(y_test[1])

Cn1c(=O)c2c(ncn2CCNCC(O)c2ccc(O)c(O)c2)n(C)c1=O
C01CA


In [38]:
X_test

0      CN(C)c1ccc(O)c2c1C[C@H]1C[C@H]3[C@H](N(C)C)C(O...
1        Cn1c(=O)c2c(ncn2CCNCC(O)c2ccc(O)c(O)c2)n(C)c1=O
2              CC(C)=CCC1C(=O)N(c2ccccc2)N(c2ccccc2)C1=O
3         OCCN1CCN(CCCN2c3ccccc3Sc3ccc(C(F)(F)F)cc32)CC1
4      COc1cc(N)c(Cl)cc1C(=O)NC1CCN(CCCOc2ccc(F)cc2)C...
                             ...                        
714                CNS(=O)(=O)Cc1ccc2[nH]cc(CCN(C)C)c2c1
715    CO/N=C(\\C(=O)N[C@@H]1C(=O)N2C(C(=O)O)=C(COC(N...
716    CCCCc1oc2ccc(NS(C)(=O)=O)cc2c1C(=O)c1ccc(OCCCN...
717                         CN/C(=N/C#N)NCCSCc1nc[nH]c1C
718    CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccc(N(C)...
Name: Neutralized SMILES, Length: 719, dtype: object

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [39]:
messages = [{
    "role": "user",
    "content": [{"type" : "text", "text" : X_test[1],}]
}]
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    tokenize=False # Must add for generation
)

from transformers import TextStreamer
_ = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

L01CA<end_of_turn>


In [40]:
print(X_test[1])
print(y_test[1])

Cn1c(=O)c2c(ncn2CCNCC(O)c2ccc(O)c(O)c2)n(C)c1=O
C01CA


<a name="Save"></a>
### Guardando y cargando los modelos finetuned
Para guardar el modelo final como adaptadores LoRA, utiliza `save_pretrained` para guardarlo localmente.

Esto SOLO guarda los adaptadores LoRA, y no el modelo completo.

In [41]:
model.save_pretrained("4gemma-31")  # Local saving
tokenizer.save_pretrained("4gemma-31")
# model.push_to_hub("HF_ACCOUNT/gemma-3", token = "...") # Online saving
# tokenizer.push_to_hub("HF_ACCOUNT/gemma-3", token = "...") # Online saving

('4gemma-31\\tokenizer_config.json',
 '4gemma-31\\special_tokens_map.json',
 '4gemma-31\\chat_template.jinja',
 '4gemma-31\\tokenizer.model',
 '4gemma-31\\added_tokens.json',
 '4gemma-31\\tokenizer.json')

### Guardar en float16 para VLLM

También podemos guardar directamente en `float16` para su despliegue.

In [42]:
if True: 
    model.save_pretrained_merged("4gemma-31-finetune", tokenizer)

Found HuggingFace hub cache directory: C:\Users\trini\.cache\huggingface\hub
Checking cache directory for required files...
Successfully copied all 1 files from cache to 4gemma-31-finetune.


Unsloth: Merging weights into 16bit: 100%|███████████████████████████████████████████████| 1/1 [00:06<00:00,  6.46s/it]


### Conversión GGUF / llama.cpp
Para guardar en `GGUF` / `llama.cpp`, unsloth lo soporta forma nativa para todos los modelos.

Se convierte fácilmente a `Q8_0, F16 o BF16` precision.

In [43]:
if True: # Change to True to save to GGUF
    model.save_pretrained_gguf(
        "4gemma-31-finetune",
        quantization_type = "q8_0",
    )
    model.save_pretrained_gguf(
        "4gemma-31-finetune",
        quantization_type = "f16", 
    )

Unsloth GGUF:hf-to-gguf:Loading model: 4gemma-31-finetune
Unsloth GGUF:hf-to-gguf:Model architecture: Gemma3ForCausalLM
Unsloth GGUF:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
Unsloth GGUF:hf-to-gguf:Exporting model...
Unsloth GGUF:hf-to-gguf:gguf: loading model part 'model.safetensors'
Unsloth GGUF:hf-to-gguf:token_embd.weight,                 torch.bfloat16 --> Q8_0, shape = {1152, 262144}
Unsloth GGUF:hf-to-gguf:output_norm.weight,                torch.bfloat16 --> F32, shape = {1152}
Unsloth GGUF:hf-to-gguf:Set meta model
Unsloth GGUF:hf-to-gguf:Set model parameters
Unsloth GGUF:hf-to-gguf:Set model quantization version
Unsloth GGUF:hf-to-gguf:Set model tokenizer
Unsloth GGUF:gguf.vocab:Setting special token type bos to 2
Unsloth GGUF:gguf.vocab:Setting special token type eos to 106
Unsloth GGUF:gguf.vocab:Setting special token type unk to 3
Unsloth GGUF:gguf.vocab:Setting special token type pad to 0
Unsloth GGUF:gguf.vocab:Setting add_bos_token to True
Unsloth

Unsloth: GGUF conversion: 100%|█████████████████████████████████████████| 100/100 [00:14<00:00,  3.50it/s, 1.06G/1.06G]

Unsloth GGUF:hf-to-gguf:Model successfully exported to .\


Unsloth: GGUF conversion: 100%|█████████████████████████████████████████| 100/100 [00:14<00:00,  6.82it/s, 1.06G/1.06G]

Unsloth: Converted to 4gemma-31-finetune.Q8_0.gguf with size = 1.1G





Unsloth: Successfully saved GGUF to:
4gemma-31-finetune.Q8_0.gguf
Unsloth GGUF:hf-to-gguf:Loading model: 4gemma-31-finetune
Unsloth GGUF:hf-to-gguf:Model architecture: Gemma3ForCausalLM
Unsloth GGUF:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
Unsloth GGUF:hf-to-gguf:Exporting model...
Unsloth GGUF:hf-to-gguf:gguf: loading model part 'model.safetensors'
Unsloth GGUF:hf-to-gguf:token_embd.weight,                 torch.bfloat16 --> F16, shape = {1152, 262144}
Unsloth GGUF:hf-to-gguf:output_norm.weight,                torch.bfloat16 --> F32, shape = {1152}
Unsloth GGUF:hf-to-gguf:Set meta model
Unsloth GGUF:hf-to-gguf:Set model parameters
Unsloth GGUF:hf-to-gguf:Set model quantization version
Unsloth GGUF:hf-to-gguf:Set model tokenizer
Unsloth GGUF:gguf.vocab:Setting special token type bos to 2
Unsloth GGUF:gguf.vocab:Setting special token type eos to 106
Unsloth GGUF:gguf.vocab:Setting special token type unk to 3
Unsloth GGUF:gguf.vocab:Setting special token type pad t

Unsloth: GGUF conversion: 100%|█████████████████████████████████████████| 100/100 [00:05<00:00,  8.42it/s, 2.00G/2.00G]

Unsloth GGUF:hf-to-gguf:Model successfully exported to .\


Unsloth: GGUF conversion: 100%|█████████████████████████████████████████| 100/100 [00:05<00:00, 17.08it/s, 2.00G/2.00G]

Unsloth: Converted to 4gemma-31-finetune.F16.gguf with size = 2.0G
Unsloth: Successfully saved GGUF to:
4gemma-31-finetune.F16.gguf



