# **Actual Code**

In [15]:
# Requirements (run once in a cell):
!pip install -q transformers==4.57.1 datasets accelerate peft bitsandbytes

In [16]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [17]:
# Imports
import os
import json
import math
import random
import time # Import time for sleep
from pathlib import Path

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [18]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="/content/drive/MyDrive/Mtech_Final_Project/JSONL_OUT_v5.jsonl")
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 104502
    })
})

In [19]:
def split_prompt_completion(example):
    text = example["text"]
    if "<|assistant|>" in text:
        parts = text.split("<|assistant|>", 1)
        return {
            "prompt": parts[0].strip(),
            "completion": parts[1].strip()
        }
    else:
        return {
            "prompt": text.strip(),
            "completion": ""  # or handle accordingly
        }

dataset = dataset.map(split_prompt_completion,
                       remove_columns=["text"])
print(dataset)
#print(dataset["train"][0])  # verify example


DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 104502
    })
})


In [20]:
print(dataset["train"][0]["completion"])


Most probable fault location: Bus 14, associated with transmission lines ['1', '7', ',', ' ', '2', '0', ',', ' ', '2', '6', ',', ' ', '3', '3', ',', ' ', '3', '4']. The GNN score of 1.0 indicates a severe anomaly concentrated at this node.
Evidence from GNN output: GNNExplainer shows [14, 15, 16, 17, 20, 21] buses influencing the affected node, suggesting voltage instability propagation.
Impact on grid stability: High risk of cascading failure and reactive power deficit spreading to buses 9 and 17. Potential overloading of lines 17, 20, 26, 33, 34, which may lead to under-voltage load shedding.


In [21]:
train_test = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_test["train"].to_json("/content/drive/MyDrive/Mtech_Final_Project/train.json")
train_test["test"].to_json("/content/drive/MyDrive/Mtech_Final_Project/test.json")

print(train_test)

Creating json from Arrow format:   0%|          | 0/95 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 94051
    })
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 10451
    })
})


In [22]:
# -----------------------
# USER CONFIG (change as needed)
# -----------------------

MODEL_NAME = "microsoft/phi-3-mini-4k-instruct"
#MODEL_NAME = "facebook/opt-350m"
#MODEL_NAME = "facebook/opt-1.3b"    # choose a model that works well on T4 in 4-bit (e.g., opt-1.3b, llamacpp-compatible small models)
TRAIN_FILE = "train.json"          # your train.json (json lines with {"prompt": "...", "completion": "..."} )
TEST_FILE  = "test.json"
OUTPUT_DIR = "/content/drive/MyDrive/Mtech_Final_Project/grid_llm_qora"  # set to a Google Drive path if you mounted drive (recommended)
USE_GOOGLE_DRIVE = True           # set True if you mounted drive
MAX_SAMPLES = None                # set to an int to quick-test (e.g., 2000). None uses all samples.
MAX_STEPS = 1500                  # stop early (useful for Colab). Set None to use full epochs.
EPOCHS = 3                        # will be ignored if MAX_STEPS is set
BATCH_SIZE = 1                    # per-device batch size; T4 may require 1
GRAD_ACC = 4                      # gradient_accumulation_steps
LEARNING_RATE = 2e-4
SAVE_STEPS = 200
EVAL_STEPS = 200
MAX_RETRIES = 5                   # Number of retries for dataset loading
RETRY_DELAY = 10                  # Delay in seconds between retries

In [23]:
# -----------------------
# Optional: mount Google Drive if using it
# -----------------------
if USE_GOOGLE_DRIVE:
    from google.colab import drive
    drive_mount_path = "/content/drive"
    # Check if the mountpoint is accessible
    if not os.path.exists(drive_mount_path):
        print(f"Google Drive mountpoint {drive_mount_path} not found. Please ensure Google Drive is mounted.")
    else:
        drive.mount(drive_mount_path, force_remount=True)
        os.makedirs(OUTPUT_DIR, exist_ok=True)
else:
    os.makedirs(OUTPUT_DIR, exist_ok=True)

# Quick GPU check
print("CUDA available:", torch.cuda.is_available())
try:
    !nvidia-smi
except:
    pass

Mounted at /content/drive
CUDA available: True
Sun Nov  2 04:30:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   47C    P8             11W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
 

In [24]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)


Mounted at /content/drive


In [25]:
dataset = load_dataset("json", data_files={
    "train": "/content/drive/MyDrive/Mtech_Final_Project/train.json",
    "test": "/content/drive/MyDrive/Mtech_Final_Project/test.json"
})

print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 94051
    })
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 10451
    })
})


In [26]:
# optional quick subset to iterate fast (uncomment to debug)
if dataset is not None and MAX_SAMPLES is not None:
    dataset["train"] = dataset["train"].select(range(min(MAX_SAMPLES, len(dataset["train"]))))
    dataset["test"]  = dataset["test"].select(range(min(MAX_SAMPLES//10 if MAX_SAMPLES else len(dataset["test"]), len(dataset["test"]))))

if dataset is not None:
    print(dataset)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 94051
    })
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 10451
    })
})


In [27]:
!pip install bitsandbytes>=0.43.1.

In [28]:
for name, module in model.named_modules():
    print(name)


NameError: name 'model' is not defined

In [29]:
# -----------------------
# Prepare tokenizer
# -----------------------
if dataset is not None:
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # -----------------------
    # Quantization config for 4-bit QLoRA
    # -----------------------
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",                # nf4 recommended
        bnb_4bit_compute_dtype=torch.float16      # float16 for T4 (bfloat16 not supported well on T4)
    )

    # -----------------------
    # Load model in 4-bit
    # -----------------------
    print("Loading model in 4-bit (this may take a while)...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto"   # allow HF to place layers across devices
    )

    # Prepare model for k-bit training (freezes norms etc)
    model = prepare_model_for_kbit_training(model)

    # Enable gradient checkpointing to save memory
    model.gradient_checkpointing_enable()

    # -----------------------
    # Apply LoRA (PEFT)
    # -----------------------
    print("Applying LoRA adapters...")
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        #target_modules=["q_proj", "v_proj"],  # focus on Q and V projection matrices
        target_modules=["qkv_proj", "o_proj"],  #foccus on self attention "qkv_proj", "o_proj" projection matrices

        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Loading model in 4-bit (this may take a while)...


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Applying LoRA adapters...
trainable params: 4,718,592 || all params: 3,825,798,144 || trainable%: 0.1233


In [None]:
# -----------------------
# Tokenize / format examples
# -----------------------
# Tokenize function
def tokenize_fn(examples):
    inputs = tokenizer(
        examples["prompt"],
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    labels = tokenizer(
        examples["completion"],
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    inputs["labels"] = labels["input_ids"]
    return inputs

# Apply to both splits in batched mode
tokenized = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["prompt", "completion"]
)


In [None]:
# Inspect tokenization + labels for one example
i = 0
sample = tokenized["train"][i]   # or tokenized_dataset["train"][i]
input_ids = sample["input_ids"]
labels = sample["labels"]
print("Decoded input:", tokenizer.decode(input_ids, skip_special_tokens=True))
# labels likely are a list; -100 marks masked tokens
print("Labels has -100 for prompt part? first 50 labels:", labels[:50])

In [None]:
# Verify correct shapes
print(tokenized)
print(tokenized["train"][0].keys())
print(len(tokenized["train"][0]["input_ids"]))  # should be ~512

In [None]:
batch = tokenized["train"][0]
for k, v in batch.items():
    print(k, type(v), len(v) if hasattr(v, '__len__') else v)

# Re-Tokenizing

In [30]:
def tokenize_mask_prompt(example):
    prompt = example["prompt"]  # Replace with your actual prompt column name
    completion = example["completion"]  # Replace with your actual completion column

    # Combine prompt + assistant response
    full_text = (
        prompt.strip()
        + "\n<|assistant|>\n"
        + completion.strip()
        + tokenizer.eos_token
    )

    tokenized = tokenizer(
        full_text,
        truncation=True,
        max_length=512,
        padding="max_length"
    )

    input_ids = tokenized["input_ids"]
    labels = input_ids.copy()  # start with same tokens

    # Mask prompt tokens with -100
    prompt_len = len(
        tokenizer(
            prompt.strip() + "\n<|assistant|>\n",
            truncation=True,
            max_length=512,
            add_special_tokens=False
        )["input_ids"]
    )

    labels[:prompt_len] = [-100] * prompt_len  # mask prompt tokens

    return {
        "input_ids": input_ids,
        "attention_mask": tokenized["attention_mask"],
        "labels": labels
    }

# Run this on dataset
tokenized_dataset = dataset.map(tokenize_mask_prompt, batched=False, remove_columns=dataset["train"].column_names)
print(tokenized_dataset)


Map:   0%|          | 0/94051 [00:00<?, ? examples/s]

Map:   0%|          | 0/10451 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 94051
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10451
    })
})


In [31]:
sample = tokenized_dataset["train"][0]
print(len(sample["input_ids"]), len(sample["labels"]), len(sample["attention_mask"]))
print(sample["labels"][:50])  # should show -100 for prompt tokens


512 512 512
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]


In [32]:
# -----------------------
# Data collator
# -----------------------
if dataset is not None:
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# -----------------------
# Training arguments (Colab optimized)
# -----------------------
if dataset is not None:
    if MAX_STEPS is not None:
        max_steps_val = MAX_STEPS
        num_train_epochs = EPOCHS  # still set but Trainer will stop at max_steps
    else:
        max_steps_val = -1
        num_train_epochs = EPOCHS

    training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,     #  smaller batch
    gradient_accumulation_steps=1,     #  fewer accumulations
    learning_rate=5e-5,
    num_train_epochs=2,                #  1 epoch max for CPU
    max_steps=500,                     #  stop early (optional)
    logging_steps=10,
    eval_strategy="no",          #  disable evaluation for faster training
    save_strategy="no",                #  don’t save checkpoints
    fp16=True,                        #  must be off for CPU
    bf16=False,                        #  off
    remove_unused_columns=False,
    push_to_hub=False,
)


In [None]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,       # keep small for Colab T4 / 1 GPU
    gradient_accumulation_steps=8,       # simulate larger batch
    learning_rate=5e-4,                  # LoRA-friendly LR
    warmup_steps=50,
    weight_decay=0.0,
    fp16=True,                        #  must be off for CPU
    bf16=False,                        #  off
    remove_unused_columns=False,
    push_to_hub=False,
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=10,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    num_train_epochs=2,      # increase if training more
    )

for faster training

In [71]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,            # start small
    logging_steps=50,
    fp16=True,
    max_steps=1500,
    save_strategy="epoch",
    eval_strategy="no",
    save_total_limit=1,
    report_to="none",
)

In [72]:
# -----------------------
# Trainer
# -----------------------
if dataset is not None:
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

  trainer = Trainer(


In [73]:
# -----------------------
# Train (with try/except for safe checkpointing)
# -----------------------
if dataset is not None:
    print("Starting training...")
    try:
        trainer.train(resume_from_checkpoint=False) #for new model training
        #trainer.train(resume_from_checkpoint=True)  #for retraining the previous model
    except Exception as e:
        print("Training interrupted with exception:", e)
        print("Saving model checkpoint to OUTPUT_DIR so you can resume later.")
        trainer.save_model(OUTPUT_DIR)

Starting training...


Step,Training Loss
50,0.04
100,0.0401
150,0.0378
200,0.0375
250,0.0373
300,0.0366
350,0.036
400,0.0349
450,0.0365
500,0.0348


In [74]:
import os
print(os.listdir("/content/drive/MyDrive/Mtech_Final_Project/grid_llm_qora"))

['runs', 'training_args.bin', 'README.md', 'adapter_model.safetensors', 'special_tokens_map.json', 'adapter_config.json', 'tokenizer_config.json', 'merges.txt', 'vocab.json', 'tokenizer.json', 'chat_template.jinja', 'added_tokens.json', 'tokenizer.model', 'checkpoint-1500']


In [75]:
# -----------------------
# Save final model (LoRA adapters + quantized base)
# -----------------------
if dataset is not None:
    print("Saving final model...")
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print("Model saved to:", OUTPUT_DIR)

Saving final model...
Model saved to: /content/drive/MyDrive/Mtech_Final_Project/grid_llm_qora


In [76]:
print(dataset)
print(dataset["test"].features)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 94051
    })
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 10451
    })
})
{'prompt': Value('string'), 'completion': Value('string')}


In [77]:
# -----------------------
# Quick test: generate on a few prompts
# -----------------------
if dataset is not None:
    from transformers import pipeline
    gen_pipe = pipeline("text-generation", model=OUTPUT_DIR, tokenizer=tokenizer, device_map="auto")
    sample_prompt = dataset["test"][1]["prompt"]
    print("\nSample prompt:\n", sample_prompt)
    out = gen_pipe(sample_prompt, max_new_tokens=120, do_sample=True, temperature=0.7)
    print("\nModel output:\n", out[0]["generated_text"])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0



Sample prompt:
 <|system|>
You are an advanced AI Fault Diagnosis Expert trained to analyze electrical power grids using graph-based neural networks.
Nodes represent electrical buses with voltage/load features; edges represent transmission lines carrying power flow.
GNN scores indicate fault likelihood; GNNExplainer identifies influential components.

<|user|>
GNN summary: top_nodes=[14, 21, 17]; scores=[1.0, 0.627, 0.586]; implicated_lines=56, 57, 58, 59, 61; severity=HIGH; GNNExplainer_affecting={"14": [2, 13, 14, 15, 16, 17, 18, 20, 21, 23], "21": [14, 15, 16, 17, 20, 21], "17": [14, 15, 16, 17, 20, 21]}
### Task (Role: Expert Power Grid Fault Diagnostician)
You are a domain-specific AI trained to analyze electrical faults in graph-structured power grids.
- Nodes represent electrical buses.
- Edges represent transmission lines.
- GNN scores indicate likelihood of fault occurrence.
- GNNExplainer identifies key contributing grid components.
Follow the output structure exactly:
Diagn