In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling 
from datasets import load_dataset, concatenate_datasets 
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer
import wandb
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Base checkpoint to fine-tune. "EleutherAI/pythia-70m" was used as a small
# alternative; its LoRA target modules would be
# ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"].
model_name = "Qwen/Qwen2-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Only fall back to EOS as the padding token when the tokenizer ships without
# one. DataCollatorForLanguageModeling(mlm=False) sets every label equal to
# pad_token_id to -100, so forcing pad == EOS would drop any real EOS token
# from the loss, and the overwritten pad token would also be persisted when
# the tokenizer is saved.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto')  # load_in_4bit=True
print(model)

# LoRA configuration: adapters are attached to every attention and MLP
# projection listed in target_modules (names match the module tree printed above).
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,  # LoRA rank
    lora_alpha=32,  # LoRA scaling hyperparameter
    lora_dropout=0.1,  # dropout applied inside the LoRA layers
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# Wrap the base model so that only the LoRA adapter weights are trained.
model = get_peft_model(model, lora_config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNorm()
  )
  (lm_head): Linear

In [3]:
# Preparation of the 'instruct'-style dataset.
# NOTE(review): everything below is a bare triple-quoted string, so none of it
# executes; the cell only evaluates to (and displays) that string.
"""train_dataset_instruct = load_dataset('json', data_files='./data/arithmatic_expressions.json', split="train")
def tokenize_function_instruct(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=64)

tokenized_train_dataset = train_dataset_instruct.map(tokenize_function_instruct, batched=True, remove_columns=['text'])"""

'train_dataset_instruct = load_dataset(\'json\', data_files=\'./data/arithmatic_expressions.json\', split="train")\ndef tokenize_function_instruct(examples):\n    return tokenizer(examples[\'text\'], padding=\'max_length\', truncation=True, max_length=64)\n\ntokenized_train_dataset = train_dataset_instruct.map(tokenize_function_instruct, batched=True, remove_columns=[\'text\'])'

In [4]:
# Dataset preparation: sports and technology phrases.
def tokenize_function_instruct(examples):
    """Tokenize the 'text' field of a batch.

    Every sequence is padded to, and truncated at, 64 tokens.

    Args:
        examples: batch passed in by ``Dataset.map``; its 'text' entry is
            handed to the tokenizer.

    Returns:
        The tokenizer's output for the batch.
    """
    batch_texts = examples['text']
    encoded = tokenizer(batch_texts, padding='max_length', truncation=True, max_length=64)
    return encoded


# Load both JSON sources (sports first, then technology).
sports_split = load_dataset('json', data_files='./data/sports_phrases.json', split="train")
train_dataset_instruct2 = load_dataset('json', data_files='./data/technology_phrases.json', split="train")

# Join the two splits into a single training set.
train_dataset_instruct = concatenate_datasets([sports_split, train_dataset_instruct2])

# Tokenize in batches and drop the raw 'text' column from the result.
tokenized_train_dataset = train_dataset_instruct.map(
    tokenize_function_instruct,
    batched=True,
    remove_columns=['text'],
)

Generating train split: 500 examples [00:00, 36898.95 examples/s]
Generating train split: 500 examples [00:00, 122304.31 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 28411.30 examples/s]


In [5]:
# Training hyperparameters. Evaluation is left at its default ("no") because no
# eval_dataset is passed to the trainer below: requesting step-based evaluation
# without one fails as soon as an evaluation is triggered, and newer
# transformers releases reject that configuration when the trainer is built.
# To evaluate, pass eval_dataset=... to SFTTrainer and re-enable the
# evaluation strategy / eval_steps here.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    logging_steps=100,
    learning_rate=1e-4,
    fp16=True,
    report_to="wandb",
)

# Causal-LM collator (mlm=False): labels are a copy of input_ids with padding
# positions set to -100 so they are ignored by the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer_instruct = SFTTrainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    max_seq_length=64,
    tokenizer=tokenizer,
    args=training_args,
    # The training set is already tokenized and padded to a fixed length, so
    # there is no raw text for the trainer to pack; packing fixed-length padded
    # rows would only concatenate padding. Keep one row per example.
    packing=False,
    data_collator=data_collator,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [6]:
# Run fine-tuning; the value returned by train() is shown as the cell output.
train_output = trainer_instruct.train()
train_output

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmiguel_kjh[0m. Use [1m`wandb login --relogin`[0m to force relogin


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Step,Training Loss,Validation Loss


TrainOutput(global_step=2500, training_loss=0.6521234756469727, metrics={'train_runtime': 287.2986, 'train_samples_per_second': 17.403, 'train_steps_per_second': 8.702, 'total_flos': 2533580144640000.0, 'train_loss': 0.6521234756469727, 'epoch': 5.0})

In [7]:
# Save the fine-tuned model and its tokenizer side by side in one directory.
model_file = os.path.join("models", "fine_tuned_model_st_qwen")
trainer_instruct.save_model(output_dir=model_file)
# Last expression: the tuple of written tokenizer file paths is displayed.
tokenizer.save_pretrained(save_directory=model_file)

('models/fine_tuned_model_st_qwen/tokenizer_config.json',
 'models/fine_tuned_model_st_qwen/special_tokens_map.json',
 'models/fine_tuned_model_st_qwen/vocab.json',
 'models/fine_tuned_model_st_qwen/merges.txt',
 'models/fine_tuned_model_st_qwen/added_tokens.json',
 'models/fine_tuned_model_st_qwen/tokenizer.json')