# LoRA finetuning for Qwen 2.5 14B

## Start with setup of environment

In [13]:
from datasets import load_dataset
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTConfig, SFTTrainer
import gc
import wandb

ImportError: cannot import name 'PreTrainedModel' from 'transformers' (/home/jovyan/.local/lib/python3.10/site-packages/transformers/__init__.py)

## For Logging purposes

In [57]:
wandb.init(project="Qwen-fine-tuning", name="14b-ioc-extraction")

[34m[1mwandb[0m: Currently logged in as: [33mt-p-angevare[0m ([33mt-p-angevare-university-of-twente[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [58]:
gc.collect()
torch.cuda.empty_cache()

## Usage of AI4privacy dataset cleaning and implementation

https://huggingface.co/datasets/ai4privacy/pii-masking-300k 

In [59]:
dataset = load_dataset("ai4privacy/pii-masking-300k")
dataset = dataset.filter(lambda x: x['language'] == 'English')
dataset = dataset.select_columns(["source_text", "privacy_mask", "id"])
dataset

DatasetDict({
    train: Dataset({
        features: ['source_text', 'privacy_mask', 'id'],
        num_rows: 29908
    })
    validation: Dataset({
        features: ['source_text', 'privacy_mask', 'id'],
        num_rows: 7946
    })
})

In [60]:
dataset_entity_mapping = {
    'EMAIL' : 'EMAIL',
    'LASTNAME1' : 'PERSON',
    'IP' : 'IP',
    'GIVENNAME1' : 'PERSON',
    'TEL' : 'PHONE',
    'CITY' : 'LOCATION',
    'POSTCODE' : 'LOCATION',
    'STREET': 'LOCATION',
    'STATE' : 'LOCATION',
    'BUILDING' : 'LOCATION',
    'COUNTRY' : 'LOCATION',
    'SECADDRESS' : 'LOCATION',
    'LASTNAME2' : 'PERSON',
    'GIVENNAME2' : 'PERSON',
    'GEOCOORD' : 'LOCATION',
    'LASTNAME3' : 'PERSON'
}

In [61]:
def clean_entities(privacy_mask, source_text):
    """
    Clean and combine entities, merging consecutive PERSON entities into full names.
    """
    entities = []
    for ent in privacy_mask:
        if ent['label'] in dataset_entity_mapping.keys():
            entities.append({
                'type': dataset_entity_mapping.get(ent['label']),
                'text': ent['value'],
                'start_pos': ent['start'],
                'end_pos': ent['end'],
                'original_label': ent['label']
            })
    
 
    entities.sort(key=lambda x: x['start_pos'])

    merged_entities = []
    i = 0
    while i < len(entities):
        current = entities[i]
        
        if current['type'] == 'PERSON' and i + 1 < len(entities):
            next_ent = entities[i + 1]
            
            if (next_ent['type'] == 'PERSON' and 
                next_ent['start_pos'] - current['end_pos'] <= 3):

                is_first_given = 'GIVENNAME' in current['original_label']
                is_second_last = 'LASTNAME' in next_ent['original_label']
                is_first_last = 'LASTNAME' in current['original_label']
                is_second_given = 'GIVENNAME' in next_ent['original_label']
                
                if (is_first_given and is_second_last) or (is_first_last and is_second_given):
                    # Merge into full name
                    full_name = source_text[current['start_pos']:next_ent['end_pos']]
                    merged_entities.append({
                        'type': 'PERSON',
                        'text': full_name,
                        'start_pos': current['start_pos'],
                        'end_pos': next_ent['end_pos']
                    })
                    i += 2 
                    continue
        
  
        merged_entities.append({
            'type': current['type'],
            'text': current['text'],
            'start_pos': current['start_pos'],
            'end_pos': current['end_pos']
        })
        i += 1
    
    return merged_entities

In [62]:
dataset = dataset.map(lambda x: {'privacy_mask': clean_entities(x['privacy_mask'], x['source_text'])})

Map:   0%|          | 0/29908 [00:00<?, ? examples/s]

Map:   0%|          | 0/7946 [00:00<?, ? examples/s]

In [63]:

original_train_size = len(dataset['train'])
original_val_size = len(dataset['validation'])

dataset = dataset.filter(lambda x: len(x['privacy_mask']) > 0)


Filter:   0%|          | 0/29908 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7946 [00:00<?, ? examples/s]

Filtered out empty samples:
  Train: 29908 -> 20042 (9866 removed)
  Val: 7946 -> 5215 (2731 removed)


## Set up LLM

In [65]:
prompt = """
You are a cyber intelligence analyst with 20 years of experience in the the field.

Your task is to extract any entity from the input text. For each entity found you MUST indicate the type in UPPERCASE. ONLY extract entities if literal entity is present in input text.
The expected entity types are the following:

- EMAIL: email addresses format (user@domain.tld)
- IP: IP addresses (IPv4 x.x.x.x or IPv6)
- BTC: ONLY Bitcoin wallet addresses (26-35 alphanumeric, starting with 1, 3, or bc1) EXCLUDE the word bitcoin or values (for example 2.0 BTC)
- IBAN: iban bank account number
- PERSON: Human names (John Smith, John, Catalina) EXCLUDE initials (for example A.H.) 
- LOCATION: cities, countries, geographic locations, regions
- PHONE: phone numbers in any format
- URL: URLs and web addresses EXCLUDE filenames
- TOX: Tox messenger IDs (76 character hexadecimal strings)
 
**Output**:
The output MUST be in a JSON object with key 'entities' and the value a list of dictionaries including every entity found. For each entity you MUST indicate the type in UPPERCASE.

**OUTPUT EXAMPLE**:
{
  "entities": [
    {"entity": "<extracted text>", "type": "<TYPE>"},
    {"entity": "<extracted text>", "type": "<TYPE>"},
  ]
}

Return empty array if no entities found in the input text.
PAY ATTENTION to sentences that begin with entity type PERSON, for example Anna.
PAY ATTENTION to when the sentences begin with possesive forms of entity type PERSON, for example Catalina's
PAY ATTENTION to when the sentences contain a FULL NAME, the FULL NAME MUST be extracted as ONE entity.
DO NOT include any entities from the example or the system prompt in your answer.
"""

In [66]:
model_name = "Qwen/Qwen2.5-14B" 
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [67]:
import json

def convert_to_chatml(source_text, privacy_mask):
    # Convert to the JSON format expected by the prompt
    entities_json = {
        "entities": [
            {"entity": ent['text'], "type": ent['type']} 
            for ent in privacy_mask
        ]
    }
    
    return [
        {"role": "system", "content": prompt},
        {"role": "user", "content": source_text},
        {"role": "assistant", "content": json.dumps(entities_json, indent=2)}
    ]

In [68]:

dataset = dataset.map(lambda x: {"messages": convert_to_chatml(x['source_text'], x['privacy_mask'])})

Map:   0%|          | 0/20042 [00:00<?, ? examples/s]

Map:   0%|          | 0/5215 [00:00<?, ? examples/s]

In [70]:
# Reduced dataset for faster training (~2-3 hours instead of 12+)
train = dataset['train'].select(range(5000))  # 5k samples (was 30k)
val = dataset['validation'].select(range(500))

print(f"Training samples: {len(train)}")
print(f"Validation samples: {len(val)}")

Training samples: 5000
Validation samples: 500


In [71]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

In [72]:
model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
        use_cache=False
    )

model.gradient_checkpointing_enable()

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

alpha = 2*r

In [73]:
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,                   
    lora_alpha=16,           
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

In [74]:
from transformers import EarlyStoppingCallback

training_args = SFTConfig(
    output_dir="./sft_qwen_14b_output",

    num_train_epochs=3,                 
    
    max_length=512,
    per_device_train_batch_size=1,       
    gradient_accumulation_steps=16,      
    

    learning_rate=1e-5,                  
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    max_grad_norm=0.5,                   
    
    logging_steps=10,
    save_steps=100,
    eval_strategy="steps",
    eval_steps=50,                      
    

    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=3,
    

    packing=False,
    report_to="wandb",
    run_name="qwen-14b-pii",
    bf16=True,
    optim="adamw_8bit",
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    peft_config=lora_config,
    processing_class=tokenizer,
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=2,
            early_stopping_threshold=0.005
        )
    ]
)

Tokenizing train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Training configuration for DeepSeek-R1-Distill-Qwen-14B:
  - Train samples: 5000
  - Epochs: 3
  - Effective batch size: 16
  - Learning rate: 1e-05
  - Eval every 50 steps
  - Max grad norm: 0.5
  - Early stopping patience: 2 evals


In [None]:
trainer.train(resume_from_checkpoint=True)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
50,2.4144,2.408647,2.19884,409587.0,0.515223
100,2.2732,2.255872,2.296701,819176.0,0.522957


In [None]:
# Save the fine-tuned model
trainer.save_model("./sft_qwen_14b_output/final_model")
print("Model saved to ./sft_qwen_14b_output/final_model")

# Log final metrics
if wandb.run:
    wandb.finish()

Model saved to ./sft_qwen_14b_output/final_model


0,1
eval/entropy,██▆▃▁▁▁▁▁
eval/loss,█▇▆▂▁▁▁▁▁
eval/mean_token_accuracy,▁▁▂▇█████
eval/num_tokens,▁▂▃▄▅▅▆▇█
eval/runtime,█▄▅▅▆▁▁▁▇
eval/samples_per_second,▁▄▄▄▄███▁
eval/steps_per_second,▁▁▁▁▁▁▁▁▁
train/entropy,████████████▇▇▆▅▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇██
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███

0,1
eval/entropy,0.6061
eval/loss,0.57973
eval/mean_token_accuracy,0.88522
eval/num_tokens,3642927.0
eval/runtime,828.6719
eval/samples_per_second,0.603
eval/steps_per_second,0.076
train/entropy,0.61093
train/epoch,1.4384
train/global_step,450


In [208]:
api = wandb.Api()
run = api.run("/t-p-angevare-university-of-twente/Qwen-fine-tuning/runs/27s35zzj")
history = run.history()
print(history)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


    eval/entropy  eval/samples_per_second  _step  train/epoch  train/entropy  \
0            NaN                      NaN      0       0.0320       1.958536   
1            NaN                      NaN      1       0.0640       1.958171   
2            NaN                      NaN      2       0.0960       1.957968   
3            NaN                      NaN      3       0.1280       1.957781   
4            NaN                      NaN      4       0.1600       1.956328   
5       1.958243                    0.606      5       0.1600            NaN   
6            NaN                      NaN      6       0.1920       1.963717   
7            NaN                      NaN      7       0.2240       2.016932   
8            NaN                      NaN      8       0.2560       2.084045   
9            NaN                      NaN      9       0.2880       2.094191   
10           NaN                      NaN     10       0.3200       2.071899   
11      2.058485                    0.61