In [1]:
import torch

print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
if torch.cuda.is_available():
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")


CUDA available: True
CUDA version: 12.1
GPU Device: NVIDIA GeForce RTX 4090


In [2]:
from dotenv import load_dotenv
import os

load_dotenv()

huggingface_api_key = os.getenv("HUGGINGFACE_API_KEY")
wandb_api_key = os.getenv("WANDB_API_KEY")

if not huggingface_api_key:
    raise ValueError("HUGGINGFACE_API_KEY not set")

if not wandb_api_key:
    raise ValueError("WANDB_API_KEY not set")

In [3]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments
)

from peft import (
    LoraConfig,
    get_peft_model
)

import wandb
import pandas as pd
from datasets import Dataset
from trl import SFTTrainer

In [4]:
from huggingface_hub import login

login(token=huggingface_api_key)

In [5]:
wandb.login(key=wandb_api_key)

run = wandb.init(
    project="repurposed-llm-phishing-classifier-causal-v2",
    job_type="train",
    anonymous="allow"
)

wandb: Currently logged in as: morganj-lee01 (morganj-lee01-team). Use `wandb login --relogin` to force relogin
wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\morga\_netrc
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [6]:
base_model_path = "../models/llama_models/llama-3.2-3B"
new_model_path = "../models/tuned_models/llama-3.2-3B-phishing-classifier-causal-v2"
train_dataset_path = "../processed_data/train.csv"
test_dataset_path = "../processed_data/test.csv"

In [7]:
train_df = pd.read_csv(train_dataset_path)

train_df.head()

Unnamed: 0,system,user,assistant
0,You are a classification system designed to ca...,Message for review: write me back please year ...,True
1,You are a classification system designed to ca...,Message for review: I just picked up Razor SDK...,False
2,You are a classification system designed to ca...,"Message for review: vacation goodmorning , i w...",False
3,You are a classification system designed to ca...,"Message for review: On Thu, Aug 08, 2002 at 11...",False
4,You are a classification system designed to ca...,Message for review: wellheads shoreline has se...,False


In [8]:
model_config = {
    "torch_dtype": torch.bfloat16,
    "attn_implementation": "flash_attention_2",
    "device_map": "auto"
}

In [9]:
# WARNING: flash_attention_2 required pip install flash-attn which needs C++ builds tools
# It also takes absolutely forever to compile because it's compiling CUDA kernels

model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    device_map=model_config["device_map"],
    torch_dtype=model_config["torch_dtype"],
    attn_implementation=model_config["attn_implementation"]
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)

In [11]:
print("Tokenizer config:")
print(tokenizer.init_kwargs)


Tokenizer config:
{'bos_token': AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 'clean_up_tokenization_spaces': True, 'eos_token': AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 'model_input_names': ['input_ids', 'attention_mask'], 'model_max_length': 131072, 'vocab_file': None, 'name_or_path': '../models/llama_models/llama-3.2-3B'}


In [12]:
tokenizer.chat_template = None

tokenizer.chat_template = """{% for message in messages %}<|start_header_id|>{{ message['role'] }}<|end_header_id|>{{ message['content'] }}<|eot_id|>{% endfor %}"""


In [13]:
test_messages = [
    {"role": "system", "content": "You are a classification system..."},
    {"role": "user", "content": "Message for review: test"},
    {"role": "assistant", "content": "true"}
]

output = tokenizer.apply_chat_template(test_messages, tokenize=False)
print("Length:", len(output))
print("Content:", output)

Length: 219
Content: <|start_header_id|>system<|end_header_id|>You are a classification system...<|eot_id|><|start_header_id|>user<|end_header_id|>Message for review: test<|eot_id|><|start_header_id|>assistant<|end_header_id|>true<|eot_id|>


In [14]:
special_tokens = {
    "additional_special_tokens": [
        "<|start_header_id|>",
        "<|end_header_id|>",
        "<|eot_id|>"
    ]
}
tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

Embedding(128256, 3072)

In [15]:
from functools import partial

train_dataset = Dataset.from_pandas(pd.read_csv(train_dataset_path))

def format_chat_template_batch(examples, tokenizer):  # Add tokenizer as parameter
    formatted_texts = []

    for system, user, assistant in zip(
        examples["system"],
        examples["user"],
        examples["assistant"]
    ):
        row_json = [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
            {"role": "assistant", "content": str(assistant)}
        ]
        formatted_texts.append(
            tokenizer.apply_chat_template(row_json,
                                          tokenize=False))

    examples["text"] = formatted_texts
    return examples


format_with_tokenizer = partial(format_chat_template_batch, tokenizer=tokenizer)

train_dataset = train_dataset.map(
    format_with_tokenizer,
    batched=True,
    batch_size=100
)

Map:   0%|          | 0/14920 [00:00<?, ? examples/s]

In [16]:
tokens = tokenizer(train_dataset["text"][5])
print(tokens)


{'input_ids': [128000, 128006, 9125, 128007, 2675, 527, 264, 24790, 1887, 6319, 311, 2339, 99197, 6743, 311, 6144, 1274, 505, 16515, 12855, 323, 32638, 13, 1472, 690, 5371, 264, 1984, 369, 3477, 323, 422, 433, 374, 264, 99197, 2613, 11, 499, 28832, 6013, 449, 1193, 364, 1904, 6, 422, 433, 374, 99197, 11, 477, 364, 3934, 6, 422, 433, 374, 539, 99197, 13, 25274, 264, 16930, 477, 22109, 311, 26069, 449, 279, 2612, 3645, 649, 1121, 304, 6129, 11682, 311, 20134, 1274, 13, 128009, 128006, 882, 128007, 2097, 369, 3477, 25, 1952, 13479, 11, 220, 1187, 5033, 220, 1049, 17, 11, 15387, 445, 13, 2893, 7881, 6267, 25, 313, 60, 3112, 3686, 93350, 912, 606, 374, 704, 1070, 6968, 721, 898, 8106, 62, 2262, 13, 2209, 1070, 198, 313, 60, 17206, 832, 1732, 704, 1070, 649, 649, 1524, 3240, 311, 3137, 2085, 1694, 264, 198, 313, 60, 15079, 67586, 1269, 30, 1628, 912, 279, 330, 2569, 2592, 1, 1274, 16869, 3137, 3060, 11, 279, 198, 313, 60, 37934, 264, 396, 1524, 3345, 13, 358, 1440, 358, 16869, 3137, 627, 246

In [17]:
print("\nAll Special Tokens:")
for token in tokenizer.all_special_tokens:
    print(f"{token}: {tokenizer.convert_tokens_to_ids(token)}")

print("\nDecoded tokens:")
for token_id in tokens['input_ids']:
    print(f"{token_id}: {tokenizer.decode([token_id])}")



All Special Tokens:
<|begin_of_text|>: 128000
<|end_of_text|>: 128001
<|start_header_id|>: 128006
<|end_header_id|>: 128007
<|eot_id|>: 128009

Decoded tokens:
128000: <|begin_of_text|>
128006: <|start_header_id|>
9125: system
128007: <|end_header_id|>
2675: You
527:  are
264:  a
24790:  classification
1887:  system
6319:  designed
311:  to
2339:  catch
99197:  phishing
6743:  messages
311:  to
6144:  protect
1274:  people
505:  from
16515:  fraud
12855: sters
323:  and
32638:  criminals
13: .
1472:  You
690:  will
5371:  receive
264:  a
1984:  message
369:  for
3477:  review
323:  and
422:  if
433:  it
374:  is
264:  a
99197:  phishing
2613:  email
11: ,
499:  you
28832:  MUST
6013:  respond
449:  with
1193:  only
364:  '
1904: true
6: '
422:  if
433:  it
374:  is
99197:  phishing
11: ,
477:  or
364:  '
3934: false
6: '
422:  if
433:  it
374:  is
539:  not
99197:  phishing
13: .
25274:  Making
264:  a
16930:  mistake
477:  or
22109:  failing
311:  to
26069:  comply
449:  with
279:  t

In [18]:
test_dataset = Dataset.from_pandas(pd.read_csv(test_dataset_path))

test_dataset = test_dataset.map(
    format_with_tokenizer,
    batched=True,
    batch_size=100
)

Map:   0%|          | 0/3730 [00:00<?, ? examples/s]

In [19]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "down_proj", "up_proj"
    ],
    bias="none",
    task_type="CAUSAL_LM",
    inference_mode=False
)

model = get_peft_model(model, peft_config)

In [20]:
training_arguments = TrainingArguments(
    output_dir=new_model_path,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,
    optim="adamw_torch_fused",
    num_train_epochs=5,
    save_strategy="epoch",
    eval_strategy="epoch",
    do_eval=True,
    warmup_steps=100,
    learning_rate=2e-4,
    fp16=False,
    bf16=True,
    group_by_length=True,
    report_to="wandb",
    run_name="actual_run_2",
    logging_first_step=True,
    logging_dir="../logs",
    logging_strategy="steps",
    logging_steps=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=10
)

In [21]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_config,
    args=training_arguments
)

Map:   0%|          | 0/14920 [00:00<?, ? examples/s]

Map:   0%|          | 0/3730 [00:00<?, ? examples/s]

In [22]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.6211,1.78021
2,0.5954,1.68907
3,0.5927,1.64994
4,1.083,1.64132


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=2330, training_loss=1.5966714901218086, metrics={'train_runtime': 6981.4819, 'train_samples_per_second': 10.685, 'train_steps_per_second': 0.334, 'total_flos': 5.3268820243716096e+17, 'train_loss': 1.5966714901218086, 'epoch': 4.990884718498659})

In [23]:
wandb.finish()

0,1
eval/loss,█▄▂▁▁
eval/runtime,██▂▁▁
eval/samples_per_second,▁▁▇██
eval/steps_per_second,▁▁▇██
train/epoch,▁▁▁▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇██
train/global_step,▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▇▇▇▇▇▇▇███
train/grad_norm,▄▃▃▂▃▄▄▃▂▄▅▂▂▃▅▄▅▃▅▂▅▁▄▆▄▅▅▅▃▇▄▇▆▆▁█▅█▃▆
train/learning_rate,▁█████▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▄▄▄▄▃▃▃▃▃▂▁▁▁
train/loss,▇███▇▇▇▇█▆▇▆▇▇▅▇▆▅▇▅▁▅▅▆▄▆▅▆▇▆▇▆▅▄▆▄▃▄▂▅

0,1
eval/loss,1.64132
eval/runtime,141.9506
eval/samples_per_second,26.277
eval/steps_per_second,13.138
total_flos,5.3268820243716096e+17
train/epoch,4.99088
train/global_step,2330.0
train/grad_norm,0.71274
train/learning_rate,0.0
train/loss,1.083


In [24]:
trainer.model.save_pretrained(new_model_path)