In [1]:
# Install the libraries imported later in this notebook (transformers, peft) plus accelerate and bitsandbytes.
# NOTE(review): bitsandbytes is not referenced directly by any visible cell — confirm it is still needed.
%pip install transformers peft bitsandbytes accelerate
# Upgrade bitsandbytes to the newest available release.
%pip install -U bitsandbytes
# `datasets` provides load_dataset, which is used later to read the JSONL splits.
%pip install datasets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


First, we need to split our dataset into training and validation sets.

In [2]:
import json

# Load the raw fine-tuning records: the source file is one JSON object whose
# "text" entry holds the records that are iterated below.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('data/fine_tuning/fine_tuning_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)["text"]


# Save it as line-separated JSON (JSONL): one record per line.
with open('data/fine_tuning/fine_tuning_data.jsonl', 'w', encoding='utf-8') as f:
    for item in data:
        f.write(json.dumps(item) + "\n")


In [3]:
# Read the JSONL file back into memory, decoding every line as one JSON record.
with open('data/fine_tuning/fine_tuning_data.jsonl', 'r') as jsonl_file:
    data = list(map(json.loads, jsonl_file))

In [4]:
import random

# Seed the global RNG so the shuffle is repeatable, then shuffle the records in place.
# Note: the shuffle mutates `data`, so re-running only this cell reshuffles an
# already-shuffled list; reload `data` first to reproduce the same order.
SHUFFLE_SEED = 42
random.seed(SHUFFLE_SEED)
random.shuffle(data)

In [5]:
# The first 70% of the shuffled records become the training set;
# everything after that index is held out for validation.
TRAIN_FRACTION = 0.7
train_size = int(TRAIN_FRACTION * len(data))
train_data, valid_data = data[:train_size], data[train_size:]

In [6]:
# Write each split to its own JSONL file (training set first, then validation set),
# one JSON record per line.
for split_path, split_records in (
    ('data/fine_tuning/citations_train.jsonl', train_data),
    ('data/fine_tuning/citations_valid.jsonl', valid_data),
):
    with open(split_path, 'w') as f:
        f.writelines(json.dumps(item) + "\n" for item in split_records)


In [7]:
# Report how many records ended up in each split.
print(
    f"Training size: {len(train_data)}",
    f"Validation size: {len(valid_data)}",
    sep="\n",
)


Training size: 2934
Validation size: 1258


In [8]:
# Pin NumPy to the 1.x line (exactly 1.26.0) and force a reinstall even when that version is already present.
# NOTE(review): presumably required because part of the stack is incompatible with NumPy 2.x — confirm.
%pip install numpy==1.26.0 --force-reinstall

Collecting numpy==1.26.0
  Using cached numpy-1.26.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (99 kB)
Using cached numpy-1.26.0-cp311-cp311-macosx_11_0_arm64.whl (14.0 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.0
    Uninstalling numpy-1.26.0:
      Successfully uninstalled numpy-1.26.0
Successfully installed numpy-1.26.0
Note: you may need to restart the kernel to use updated packages.


In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint to fine-tune; the tokenizer and the model are both loaded from this id.
model_name = "Qwen/Qwen2.5-1.5B"

# Weights are loaded as float16 (chosen for MPS) and device placement is left to "auto".
model_load_options = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
}

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_options)


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
from peft import LoraConfig, get_peft_model

# LoRA hyperparameters, collected in one mapping and unpacked into LoraConfig.
lora_settings = {
    "r": 16,
    "lora_alpha": 32,
    "target_modules": ["q_proj", "v_proj"],  # only these projection modules receive adapters
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}
lora_config = LoraConfig(**lora_settings)

# Wrap the base model with the adapters and print the trainable-parameter summary.
# Note: `model` is rebound to the wrapped model, so re-running this cell wraps it
# again; reload the base model before re-running.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


'NoneType' object has no attribute 'cadam32bit_grad_fp32'
trainable params: 2,179,072 || all params: 1,545,893,376 || trainable%: 0.1410


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [11]:
# NOTE(review): `datasets` is already installed by the first cell of this notebook; this repeat install looks redundant.
%pip install datasets


Note: you may need to restart the kernel to use updated packages.


In [12]:
from datasets import load_dataset

# Map each split name to its JSONL file, then load both with the "json" builder.
# Note: this rebinds `data`, which earlier cells used for the plain Python list of records.
split_files = {
    "train": "data/fine_tuning/citations_train.jsonl",
    "validation": "data/fine_tuning/citations_valid.jsonl",
}
data = load_dataset("json", data_files=split_files)


Generating train split: 2934 examples [00:00, 404114.28 examples/s]
Generating validation split: 1258 examples [00:00, 489102.19 examples/s]


In [13]:
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of prompt/completion pairs for causal-LM fine-tuning.

    A causal language model is trained on a single sequence: it shifts
    ``labels`` by one position internally and compares them with the logits
    computed from ``input_ids``. Labels must therefore line up token-for-token
    with ``input_ids``. The previous implementation passed the completion via
    ``text_target``, which tokenizes it as a separate sequence; the labels then
    did not correspond to the inputs, the model never saw the completion as
    input, and label padding was not masked out of the loss.

    This version builds ``prompt + completion`` as the input sequence and sets
    the label to -100 (the index ignored by the loss) on prompt and padding
    positions, so only completion tokens are learned.

    Args:
        examples: Batch mapping with parallel lists under "prompt" and
            "completion" (the shape produced by ``Dataset.map(batched=True)``).
        max_length: Fixed length every sequence is truncated/padded to.

    Returns:
        dict with "input_ids", "attention_mask" and "labels"; each is a list
        holding one ``max_length``-long list of ints per example.

    Raises:
        ValueError: if the tokenizer defines neither a pad nor an EOS token id.
    """
    ignore_index = -100  # label value excluded from the loss

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        # Padding positions are masked in both attention and labels, so reusing EOS is safe.
        pad_id = tokenizer.eos_token_id
    if pad_id is None:
        raise ValueError("tokenizer defines neither pad_token_id nor eos_token_id")
    eos_id = tokenizer.eos_token_id

    batch = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt, completion in zip(examples["prompt"], examples["completion"]):
        # The prompt keeps the tokenizer's default special-token handling (as before);
        # the completion is a continuation, so no special tokens are inserted in front of it.
        prompt_ids = list(tokenizer(prompt)["input_ids"])
        completion_ids = list(tokenizer(completion, add_special_tokens=False)["input_ids"])
        # Terminate the completion with EOS so the model learns where to stop.
        if eos_id is not None and (not completion_ids or completion_ids[-1] != eos_id):
            completion_ids.append(eos_id)

        # Truncate on the right, matching the original truncation=True behaviour.
        input_ids = (prompt_ids + completion_ids)[:max_length]
        labels = ([ignore_index] * len(prompt_ids) + completion_ids)[:max_length]

        pad_len = max_length - len(input_ids)
        batch["input_ids"].append(input_ids + [pad_id] * pad_len)
        batch["attention_mask"].append([1] * len(input_ids) + [0] * pad_len)
        batch["labels"].append(labels + [ignore_index] * pad_len)
    return batch

# batched=True hands tokenize_function whole batches (column name -> list of values) instead of single rows.
# The original "prompt"/"completion" text columns stay in the result; later cells still read them.
tokenized_data = data.map(tokenize_function, batched=True)


Map: 100%|██████████| 2934/2934 [00:00<00:00, 6219.74 examples/s]
Map: 100%|██████████| 1258/1258 [00:00<00:00, 5045.50 examples/s]


In [14]:
# TensorBoard is needed because the TrainingArguments below set report_to="tensorboard".
%pip install tensorboard


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [15]:
from transformers import TrainingArguments, Trainer

# Training configuration for the LoRA-wrapped model.
training_args = TrainingArguments(
    output_dir="./qwen2.5-1.5b-finetuned",  # checkpoints are written here
    # Evaluate on the validation split after each epoch.
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="steps",  # checkpoint on a step schedule (see save_steps)
    learning_rate=1e-5,
    per_device_train_batch_size=4,  # chosen for a 24GB machine, combined with gradient accumulation
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,  # keep at most two checkpoints on disk
    logging_dir="./logs",
    logging_steps=50,   # log every 50 steps
    # Checkpoint every 400 steps.
    # NOTE(review): with the 2,934 training examples reported above, batch size 4 and accumulation 2
    # (assuming a single device) an epoch is ~367 optimizer steps, so this saves roughly once per
    # epoch, not 2-3 times per epoch — confirm the intended frequency.
    save_steps=400,
    gradient_accumulation_steps=2,  # kept low to avoid memory spikes on 24GB
    report_to="tensorboard",
    fp16=False,  # fp16 mixed-precision training disabled for MPS compatibility (the weights themselves were loaded as float16)
)


# Trainer over the tokenized splits; no custom data collator or metrics are supplied.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
)




In [17]:
# Sanity check: the two tokenized splits together should cover every record.
total_samples = sum(len(tokenized_data[split]) for split in ("train", "validation"))
print(f"Total samples: {total_samples}")


Total samples: 4192


In [18]:
# Longest prompt in the training split, measured in characters of the raw text (not tokens).
max_input_length = max(map(len, tokenized_data["train"]["prompt"]))
print(f"Max input length: {max_input_length}")


Max input length: 324


In [21]:
# Locate the training example with the longest completion (in characters) in one pass,
# then report its length followed by the text itself.
longest_citation = max(tokenized_data["train"], key=lambda row: len(row["completion"]))
max_output_length = len(longest_citation["completion"])
print(f"Max output length: {max_output_length}")

print("\nLongest citation:")
print(longest_citation["completion"])


Max output length: 297

Longest citation:
L.  1968, c. 410 (C. 52:14B-1 et seq.), the Commissioner of the Department of Human Services and the Commissioner of Corrections jointly shall adopt regulations establishing the procedures formulated under the plan required by section 2 of this act.      L.  1986, c. 71, s. 3, eff. July 30, 1986.


In [16]:
# Start fine-tuning with the Trainer configured earlier (3 epochs, evaluation after each epoch,
# checkpoints under ./qwen2.5-1.5b-finetuned).
# NOTE(review): this cell ran as In [16], i.e. before the inspection cells shown above it.
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 