# LoRA finetuning for Qwen 2.5 14B

## Start with setup of environment

In [1]:
import sys
# Block torchvision from being imported
sys.modules["torchvision"] = None 
# Block tensorflow as well to avoid the numpy errors from earlier
sys.modules["tensorflow"] = None 

In [2]:
from datasets import load_dataset, concatenate_datasets
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTConfig, SFTTrainer
import gc
import wandb

## For Logging purposes

In [3]:
wandb.init(project="Qwen-fine-tuning", name="14b-ioc-extraction")

[34m[1mwandb[0m: Currently logged in as: [33mt-p-angevare[0m ([33mt-p-angevare-university-of-twente[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
gc.collect()
torch.cuda.empty_cache()

## Usage of AI4privacy dataset cleaning and implementation

https://huggingface.co/datasets/ai4privacy/pii-masking-300k 

In [5]:
dataset = load_dataset("ai4privacy/pii-masking-300k")
dataset = dataset.filter(lambda x: x['language'] == 'English')
dataset = dataset.select_columns(["source_text", "privacy_mask", "id"])
dataset

DatasetDict({
    train: Dataset({
        features: ['source_text', 'privacy_mask', 'id'],
        num_rows: 29908
    })
    validation: Dataset({
        features: ['source_text', 'privacy_mask', 'id'],
        num_rows: 7946
    })
})

In [6]:
dataset_entity_mapping = {
    'EMAIL' : 'EMAIL',
    'LASTNAME1' : 'PERSON',
    'IP' : 'IP',
    'GIVENNAME1' : 'PERSON',
    'TEL' : 'PHONE',
    'CITY' : 'LOCATION',
    'STATE' : 'LOCATION',
    'COUNTRY' : 'LOCATION',
}

In [7]:
def clean_entities(privacy_mask, source_text):
    """
    Clean and combine entities, merging consecutive PERSON entities into full names.
    """
    entities = []
    for ent in privacy_mask:
        if ent['label'] in dataset_entity_mapping.keys():
            entities.append({
                'type': dataset_entity_mapping.get(ent['label']),
                'text': ent['value'],
                'start_pos': ent['start'],
                'end_pos': ent['end'],
                'original_label': ent['label']
            })
    
 
    entities.sort(key=lambda x: x['start_pos'])

    merged_entities = []
    i = 0
    while i < len(entities):
        current = entities[i]
        
        if current['type'] == 'PERSON' and i + 1 < len(entities):
            next_ent = entities[i + 1]
            
            if (next_ent['type'] == 'PERSON' and 
                next_ent['start_pos'] - current['end_pos'] <= 3):

                is_first_given = 'GIVENNAME' in current['original_label']
                is_second_last = 'LASTNAME' in next_ent['original_label']
                is_first_last = 'LASTNAME' in current['original_label']
                is_second_given = 'GIVENNAME' in next_ent['original_label']
                
                if (is_first_given and is_second_last) or (is_first_last and is_second_given):
                    # Merge into full name
                    full_name = source_text[current['start_pos']:next_ent['end_pos']]
                    merged_entities.append({
                        'entity': full_name,
                        'type': 'PERSON',
                    })
                    i += 2 
                    continue
        
  
        merged_entities.append({
            'entity': current['text'],
            'type': current['type'],

        })
        i += 1
    
    return merged_entities

In [8]:
dataset = dataset.map(lambda x: {'privacy_mask': clean_entities(x['privacy_mask'], x['source_text'])})

## Set up LLM

In [9]:
prompt = """
Your task is to extract any entity from the input text. For each entity found you MUST indicate the type in UPPERCASE. ONLY extract entities if literal entity is present in input text.
The expected entity types are the following: EMAIL, IP, PERSON, LOCATION, PHONE

The output MUST be in a JSON object with key 'entities' and the value a list of dictionaries including every entity found. For each entity you MUST indicate the type in UPPERCASE.
"""

In [10]:
model_name = "Qwen/Qwen2.5-14B" 
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [11]:
import json

def convert_to_chatml(source_text, privacy_mask):
    # Convert to the JSON format expected by the prompt
    entities = []
    for ent in privacy_mask:
        entities.append({"entity": ent['entity'], "type": ent['type']})
    
    entities_json = {
        "entities": entities
    }
    
    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": source_text},
        {"role": "assistant", "content": json.dumps(entities_json, indent=2)}
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False)
    

In [12]:

dataset = dataset.map(lambda x: {"text": convert_to_chatml(x['source_text'], x['privacy_mask'])})

In [13]:
import pandas as pd
df_lengths = pd.Series([len(lst) for lst in dataset['train']['privacy_mask']])
token_lengths = pd.Series([len(lst) for lst in dataset['train']['text']])

# Print distribution
print(df_lengths.value_counts().sort_index())
print(token_lengths.value_counts().sort_index())

0     11096
1      4383
2      4306
3      3275
4      2486
5      1569
6      1071
7       561
8       428
9       275
10      153
11       91
12       79
13       25
14       39
15       31
16       12
17        6
18       10
19        5
20        4
22        1
24        1
26        1
Name: count, dtype: int64
634     1
643     1
655     1
678     1
698     1
       ..
2336    1
2342    1
2411    1
2692    1
2713    1
Name: count, Length: 1128, dtype: int64


In [14]:
dataset_none = dataset.filter(lambda x: len(x['privacy_mask']) == 0 and len(x['text']) <= 2048)
dataset_low = dataset.filter(lambda x: len(x['privacy_mask']) >= 4 and len(x['privacy_mask']) <= 6 and len(x['text']) <= 2048)
dataset_high = dataset.filter(lambda x: len(x['privacy_mask']) < 4 and len(x['privacy_mask']) > 0 and len(x['text']) <= 2048)

In [15]:
print(len(dataset_high['train']))

11964


In [16]:
train = concatenate_datasets([dataset_high['train'].select(range(600)),dataset_low['train'].select(range(450)), dataset_none['train'].select(range(225))])
val = concatenate_datasets([dataset_high['validation'].select(range(180)),dataset_low['validation'].select(range(90)), dataset_none['validation'].select(range(30))])

print(f"Training samples: {len(train)}")
print(f"Validation samples: {len(val)}")

Training samples: 1275
Validation samples: 300


In [17]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

In [18]:
model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
        use_cache=False
    )

model.gradient_checkpointing_enable()

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

alpha = 2*r

In [19]:
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,                   
    lora_alpha=16,           
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0,
    bias="none",
    task_type="CAUSAL_LM"
)

In [20]:
training_args = SFTConfig(
    output_dir="./sft_qwen_14b_output",

    num_train_epochs=1,                 
    gradient_checkpointing=True,
    max_length=2048,
    per_device_train_batch_size=1,
    eval_accumulation_steps=1,
    

    learning_rate=2e-4,                  
    warmup_ratio=0.1,                  
    
    logging_steps=10,
    save_steps=100,
    eval_strategy="steps",
    eval_steps=100,                      
    

    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    
    packing=False,
    report_to="wandb",
    run_name="qwen-14b-pii",
    bf16=True,
    optim="paged_adamw_8bit",
)

In [21]:
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    peft_config=lora_config,
    processing_class=tokenizer,
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=2,
            early_stopping_threshold=0.005
        )
    ]
)

In [22]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss,Validation Loss
100,1.0884,1.112496
200,1.1261,1.028311
300,1.0084,0.998619
400,0.9535,0.980811
500,0.9124,0.988606
600,0.8742,0.963838
700,0.9441,0.952565
800,0.7533,0.942609
900,0.8682,0.938489
1000,0.9363,0.933728


TrainOutput(global_step=1000, training_loss=1.0412765340805055, metrics={'train_runtime': 9505.5441, 'train_samples_per_second': 0.134, 'train_steps_per_second': 0.134, 'total_flos': 2.709128071887667e+16, 'train_loss': 1.0412765340805055})

In [23]:
# Save the fine-tuned model
trainer.save_model("./sft_qwen_14b_output/final_model")
print("Model saved to ./sft_qwen_14b_output/final_model")

# Log final metrics
if wandb.run:
    wandb.finish()

Model saved to ./sft_qwen_14b_output/final_model


0,1
eval/entropy,█▆▅▃▃▁▂▂▁▁
eval/loss,█▅▄▃▃▂▂▁▁▁
eval/mean_token_accuracy,▁▄▅▆▅▇▇▇██
eval/num_tokens,▁▂▃▃▄▅▆▆▇█
eval/runtime,▆▄▅█▇▃▃▁▂▄
eval/samples_per_second,▁▁▁▁▁████▁
eval/steps_per_second,▁▁▁▁▁▁▁▁▁▁
train/entropy,▇█▇▃▄▃▂▃▂▃▂▂▂▂▃▂▂▂▃▂▂▂▂▃▃▂▂▂▂▁▂▂▂▁▂▃▂▂▂▂
train/epoch,▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇██

0,1
eval/entropy,0.94252
eval/loss,0.93373
eval/mean_token_accuracy,0.7898
eval/num_tokens,322422
eval/runtime,400.3402
eval/samples_per_second,0.749
eval/steps_per_second,0.095
total_flos,2.709128071887667e+16
train/entropy,0.93633
train/epoch,0.78431


In [208]:
api = wandb.Api()
run = api.run("/t-p-angevare-university-of-twente/Qwen-fine-tuning/runs/27s35zzj")
history = run.history()
print(history)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


    eval/entropy  eval/samples_per_second  _step  train/epoch  train/entropy  \
0            NaN                      NaN      0       0.0320       1.958536   
1            NaN                      NaN      1       0.0640       1.958171   
2            NaN                      NaN      2       0.0960       1.957968   
3            NaN                      NaN      3       0.1280       1.957781   
4            NaN                      NaN      4       0.1600       1.956328   
5       1.958243                    0.606      5       0.1600            NaN   
6            NaN                      NaN      6       0.1920       1.963717   
7            NaN                      NaN      7       0.2240       2.016932   
8            NaN                      NaN      8       0.2560       2.084045   
9            NaN                      NaN      9       0.2880       2.094191   
10           NaN                      NaN     10       0.3200       2.071899   
11      2.058485                    0.61