In [1]:
import random
import json
from datetime import datetime, timedelta
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    logging
)
from peft import LoraConfig, get_peft_model

In [2]:
# Enable logging for visibility
logging.set_verbosity_info()

In [3]:
# === Step 1: Preprocess JSON files to flatten 'response' field ===
def flatten_json(input_path):
    with open(input_path, "r") as f:
        raw_data = json.load(f)

    formatted_data = []
    for item in raw_data:
        instruction = item["instruction"].strip()
        response = json.dumps(item["response"], indent=2)  # Convert dict to string
        prompt = f"Instruction: {instruction} \nResponse:\n{response}"
        formatted_data.append({"text": prompt})
    return formatted_data

In [4]:
# Load and flatten train/test datasets
train_data = flatten_json(r"C:\Users\T14 gen2\Documents\SementicSearch\train_data.json")
test_data = flatten_json(r"C:\Users\T14 gen2\Documents\SementicSearch\test_data.json")

In [5]:
# Convert to HuggingFace Dataset objects
train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)

In [6]:
# === Step 2: Load model and tokenizer ===
model_path = r"C:\Users\T14 gen2\Documents\SementicSearch\Tinyllama"
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
model.config.pad_token_id = tokenizer.pad_token_id

loading file tokenizer.model
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file chat_template.jinja
loading configuration file C:\Users\T14 gen2\Documents\SementicSearch\Tinyllama\config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "bfloat16",
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 22,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.57.1",
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file

In [7]:
# === Step 3: Apply LoRA for efficient fine-tuning ===
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


In [8]:
# === Step 4: Tokenize the dataset ===
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=256)

tokenized_train = train_dataset.map(tokenize, batched=True, remove_columns=["text"])
tokenized_test = test_dataset.map(tokenize, batched=True, remove_columns=["text"])

Map:   0%|          | 0/960 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

In [9]:
# === Step 5: Setup data collator ===
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [10]:
# === Step 6: Define training arguments ===
training_args = TrainingArguments(
    output_dir="./tinyllama-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=torch.cuda.is_available(),
    save_steps=500,
    logging_steps=20,
    save_total_limit=2,
    eval_steps=500,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [11]:
# === Step 7: Setup Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
)

The model is already on multiple devices. Skipping the move to device specified in `args`.


In [None]:
# Train the model
trainer.train()

***** Running training *****
  Num examples = 960
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 180
  Number of trainable parameters = 1,126,400


Step,Training Loss
20,1.0074
40,0.4438


In [None]:
# Save final model and tokenizer
trainer.save_model("./tinyllama-finetuned-detailed")
tokenizer.save_pretrained("./tinyllama-finetuned-detailed")