In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer, SFTConfig
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split




In [None]:
import json
import random

# Load the crash narratives; the file must contain a single JSON list of entries.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform default encoding.
with open("crash_narratives.json", "r", encoding="utf-8") as f:
    data = json.load(f)

if not isinstance(data, list):
    raise TypeError(
        f"crash_narratives.json must contain a JSON list, got {type(data).__name__}"
    )

# Shuffle with a dedicated, seeded generator so that the train/eval/test
# split is identical on every Restart & Run All (and the global `random`
# state is left untouched).
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(data)

# Define split ratios (70% train, 15% evaluation, 15% test)
n = len(data)
train_size = int(0.7 * n)
eval_size = int(0.15 * n)
# The rest of the data will be the test set (absorbs rounding leftovers)
test_size = n - train_size - eval_size

# Split the data into three consecutive, non-overlapping slices
train_data = data[:train_size]
eval_data = data[train_size:train_size + eval_size]
test_data = data[train_size + eval_size:]

# Sanity check: every entry landed in exactly one split.
assert len(train_data) + len(eval_data) + len(test_data) == n
assert len(test_data) == test_size

# Save split datasets to separate JSONL files
def save_jsonl(dataset, filename):
    """Write ``dataset`` to ``filename`` in JSON Lines format.

    Every element of ``dataset`` is serialised with ``json.dumps`` and placed
    on its own newline-terminated line. An existing file is overwritten.
    """
    with open(filename, "w") as out_file:
        out_file.writelines(json.dumps(record) + "\n" for record in dataset)

# Persist each split to its own JSONL file.
for split_rows, split_path in (
    (train_data, "train_data.jsonl"),
    (eval_data, "eval_data.jsonl"),
    (test_data, "test_data.jsonl"),
):
    save_jsonl(split_rows, split_path)

# Report the overall size followed by the size of every split.
print(f"Total samples: {n}")
print(f"Training data: {len(train_data)} samples")
print(f"Evaluation data: {len(eval_data)} samples")
print(f"Test data: {len(test_data)} samples")



In [None]:
def convert_jsonl_to_json(jsonl_file, json_file):
    """Convert a JSON Lines file into a file holding one JSON array.

    Args:
        jsonl_file: Path of the input file with one JSON value per line.
        json_file: Path of the output file; it is overwritten with a JSON
            list containing every parsed line in input order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(jsonl_file, "r") as src:
        # Blank or whitespace-only lines carry no record and would make
        # json.loads raise, so they are skipped rather than parsed.
        records = [json.loads(line) for line in src if line.strip()]
    with open(json_file, "w") as dst:
        json.dump(records, dst)

# Produce a JSON-array copy of each split's JSONL file.
for jsonl_path, json_path in (
    ("train_data.jsonl", "train_data.json"),
    ("eval_data.jsonl", "eval_data.json"),
    ("test_data.jsonl", "test_data.json"),
):
    convert_jsonl_to_json(jsonl_path, json_path)

from datasets import load_dataset

# Split name -> JSON file handed to the `datasets` JSON loader.
data_files = dict(
    train="train_data.json",
    validation="eval_data.json",
    test="test_data.json",
)

dataset = load_dataset("json", data_files=data_files)

# Print the first record of every split (train, validation, test) as a quick check.
for split_name in data_files:
    print(dataset[split_name][0])




In [None]:
# Rebind the split names to the splits loaded into `dataset`.
train_data, eval_data, test_data = (
    dataset["train"],
    dataset["validation"],
    dataset["test"],
)

In [None]:
base_model_name = "meta-llama/Llama-3.2-1B"

# The SFTConfig later in this notebook trains with bf16=True / fp16=False, so
# the 4-bit compute dtype and the dtype of the non-quantized weights are kept
# in bfloat16 as well. Mixing float16 weights/compute with bf16 autocast forces
# extra casts and exposes the quantized matmuls to float16 overflow.
model_dtype = "bfloat16"

# 4-bit NF4 quantization (no double quantization) for the frozen base weights.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=model_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=model_dtype,
    quantization_config=bnb_config,
)

# The KV cache is switched off for training.
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.padding_side = "left"
# One dedicated token per hazardous-action category, registered as additional
# special tokens of the tokenizer.
special_tokens = ["<Speed and Stopping Violations>",
                  "<Right-of-Way and Traffic Control Violations>", "<Lane and Direction Violations>",
                  "<Maneuvering and Signaling Errors>",
                  "<General Unsafe Driving>", "<NO HAZARDOUS ACTION>", "<BOTH DRIVERS TOOK HAZARDOUS ACTION>"]
tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})
# Reuse EOS as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id
# Grow the embedding matrix so the newly added tokens have rows.
model.resize_token_embeddings(len(tokenizer), mean_resizing=False)


import bitsandbytes as bnb

def find_all_linear_names(model):
    """Collect the leaf names of every 4-bit linear layer in ``model``.

    Iterates over ``model.named_modules()``, keeps the modules that are
    instances of ``bnb.nn.Linear4bit`` and records the last component of each
    dotted module path. ``lm_head`` is dropped from the result when present.

    Returns:
        list: The unique leaf names, in no particular order.
    """
    linear_cls = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, linear_cls)
    }
    # needed for 16 bit
    leaf_names.discard('lm_head')
    return list(leaf_names)
# Leaf names of the model's 4-bit linear layers; used as LoRA target_modules below.
modules = find_all_linear_names(model)


from transformers import DataCollatorForLanguageModeling

class CustomDataCollator(DataCollatorForLanguageModeling):
    """Language-modeling collator that supervises a single token per sample.

    After the parent collator has built the batch, every label is replaced by
    -100 (ignored by the loss) except the token located two positions after
    the ``<|end_header_id|>`` that closes the first
    ``<|start_header_id|>assistant<|end_header_id|>`` header. The position
    directly after the header is assumed to hold the newline token, so the
    supervised token is the first token of the assistant reply.

    Samples for which that position cannot be determined (no assistant header
    in the sequence, or the position lies beyond the end of the sequence) get
    all labels masked. They then contribute nothing to the loss instead of
    raising or silently reusing the position found for a previous sample.
    """

    ASSISTANT_HEADER = "<|start_header_id|>assistant<|end_header_id|>"
    IGNORE_INDEX = -100

    def _find_target_position(self, token_ids, header_start_id, header_end_id):
        """Return the index of the supervised token in ``token_ids``, or None.

        Args:
            token_ids: 1-D tensor with the token ids of one sample.
            header_start_id: Token id of ``<|start_header_id|>``.
            header_end_id: Token id of ``<|end_header_id|>``.
        """
        header_starts = (token_ids == header_start_id).nonzero(as_tuple=True)[0]
        header_ends = (token_ids == header_end_id).nonzero(as_tuple=True)[0]
        for start, end in zip(header_starts, header_ends):
            if self.tokenizer.decode(token_ids[start:end + 1]) == self.ASSISTANT_HEADER:
                # +1 skips the token right after the header, +2 is the target.
                target_pos = int(end) + 2
                # A truncated sample may end before the target token exists.
                return target_pos if target_pos < token_ids.size(0) else None
        return None

    def __call__(self, examples):
        """Collate ``examples`` and keep only the target-token label per sample.

        Returns:
            The batch produced by the parent collator with ``batch["labels"]``
            masked as described in the class docstring.
        """
        import warnings

        batch = super().__call__(examples)
        input_ids = batch["input_ids"]
        labels = batch["labels"]

        # Use the collator's own tokenizer (not a notebook global) and resolve
        # the header token ids once per batch rather than once per sample.
        header_start_id = self.tokenizer.convert_tokens_to_ids("<|start_header_id|>")
        header_end_id = self.tokenizer.convert_tokens_to_ids("<|end_header_id|>")

        for i in range(len(input_ids)):
            target_pos = self._find_target_position(
                input_ids[i], header_start_id, header_end_id
            )
            # Mask labels: -100 everywhere except the target position.
            masked = torch.full_like(labels[i], self.IGNORE_INDEX)
            if target_pos is None:
                warnings.warn(
                    "CustomDataCollator: no assistant target token found in a "
                    "sample (possibly truncated); the sample is fully masked."
                )
            else:
                masked[target_pos] = labels[i][target_pos]
            labels[i] = masked

        batch["labels"] = labels
        return batch

# Directory that receives the trained model and the tokenizer at the end of this cell.
output_dir="MTCFLLM-llama-3.2-fine-tuned-model"

# LoRA adapter configuration; the adapters target the module names collected
# by find_all_linear_names(model).
# NOTE(review): the special tokens added to the tokenizer get new embedding
# rows, and LoRA on the linear layers alone does not appear to train them.
# Confirm whether modules_to_save (embed_tokens / lm_head) is needed here.
peft_config = LoraConfig(
    lora_alpha=64,
    lora_dropout=0.05,
    r=128,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules,
)

# Supervised fine-tuning hyper-parameters.
# Per-device batch per optimizer update = 32 * 16 = 512 sequences.
training_arguments = SFTConfig(
    output_dir=output_dir,                    # directory to save and repository id
    num_train_epochs=2,                       # number of training epochs
    per_device_train_batch_size=32,            # batch size per device during training
    gradient_accumulation_steps=16,            # micro-batches accumulated before each optimizer update
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    logging_steps=5,                          # log training metrics every 5 steps
    learning_rate=1e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=False,
    bf16=True,                                # train in bfloat16 mixed precision
    packing=False,                            # one example per sequence, no packing
    max_seq_length=512,                       # longer sequences are truncated
    dataset_text_field='text',                # dataset column holding the training text
    dataset_kwargs={
    "add_special_tokens": True,
    "append_concat_token": False,
    },
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,                             # no step cap; num_train_epochs decides the length
    warmup_ratio=0.02,                        # warmup ratio based on QLoRA paper
    group_by_length=False,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="wandb",                  # report metrics to w&b
    eval_strategy="steps",              # evaluate on a step schedule (see eval_steps)
    eval_steps = 200,                   # run evaluation every 200 steps
    save_strategy="no"                  # no intermediate checkpoints; the model is saved explicitly below
)

# Build the trainer; CustomDataCollator restricts the loss to a single
# target token per sample.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    processing_class=tokenizer,
    data_collator=CustomDataCollator(tokenizer=tokenizer, mlm=False)
)



trainer.train()
# 1) Save the fine-tuned LoRA adapters + base config
trainer.save_model(output_dir)

# 2) Save the tokenizer (with new special tokens)
tokenizer.save_pretrained(output_dir)