In [None]:
import json
from datasets import Dataset
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import random
import time
import matplotlib.pyplot as plt
from validator import Validator
import csv 
from torch.optim.lr_scheduler import StepLR  # or use ReduceLROnPlateau, etc.
from torch.optim.lr_scheduler import ReduceLROnPlateau
import os
import sys

# Restrict this process to a single physical GPU. The variable has to be set
# before the first CUDA call; inside the notebook the selected card is then
# addressed as device index 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"  # Replace "2" with the GPU index you want

print(torch.cuda.is_available())
print("Device count:", torch.cuda.device_count())

# Asking for the current device or its name raises when no GPU is visible,
# so only query them when CUDA is actually usable.
if torch.cuda.is_available():
    print("Current GPU:", torch.cuda.current_device())
    print("GPU name:", torch.cuda.get_device_name(0))
else:
    print("No CUDA device visible; the experiment cells will fall back to CPU.")


In [None]:
# PREPROCESSING WITH HEX TO BYTE 

# BOS = Beginning of Sequence → marks the start of a Modbus message.
# SEP = Separator → marks the boundary between the query and the response.
# EOS = End of Sequence → marks the end of the full sequence.
# PAD = Padding token → used to pad shorter sequences so they fit in a batch.
# VOCAB_SIZE = 260 → bytes (0–255) + 4 special tokens (256–259)

# Special-token ids are placed directly above the 256 raw byte values (0-255).
BOS, SEP, EOS, PAD = range(256, 260)
VOCAB_SIZE = 260

# Load and preprocess the dataset
# Convert a hexadecimal string into a list of byte values
def hex_to_bytes(hex_str):
    """Convert a hexadecimal string into a list of integer byte values.

    Every two-character group is parsed as one base-16 number, e.g.
    ``"0aff"`` -> ``[10, 255]``.

    Args:
        hex_str: String of hex digits (upper or lower case).

    Returns:
        list[int]: One integer per two-character group, in order. An empty
        string yields an empty list.

    Raises:
        ValueError: If ``hex_str`` has an odd number of characters (the
            dangling nibble would otherwise be silently parsed as a whole
            byte) or if a group is not valid base-16.
    """
    if len(hex_str) % 2:
        raise ValueError(
            f"hex string must have an even number of characters, got {len(hex_str)}: {hex_str!r}"
        )
    # Each 2-character group is treated as a hex byte and converted to decimal.
    return [int(hex_str[i:i+2], 16) for i in range(0, len(hex_str), 2)]

def preprocess(example):
    """Turn one {"query", "response"} record into token ids and loss labels.

    The token sequence is ``[BOS] + query bytes + [SEP] + response bytes + [EOS]``.
    Labels carry -100 for every prompt position (BOS, query and SEP) and the
    real token ids for the response and the closing EOS.

    Args:
        example: Mapping with hex-string values under "query" and "response".

    Returns:
        dict with two equally long lists, "input_ids" and "labels".
    """
    query_bytes = hex_to_bytes(example["query"])
    response_bytes = hex_to_bytes(example["response"])

    prompt = [BOS, *query_bytes, SEP]      # positions that carry no loss
    target = [*response_bytes, EOS]        # positions the model is trained on

    return {
        "input_ids": prompt + target,
        "labels": [-100] * len(prompt) + target,
    }

# Read the three JSON-lines splits; every line is parsed as one JSON record.
with open("modbus_dataset.jsonl", "r") as f:
    train_data = list(map(json.loads, f))

with open("modbus_dataset_test.jsonl", "r") as f:
    test_data = list(map(json.loads, f))

with open("modbus_dataset_validation.jsonl", "r") as f:
    validation_data = list(map(json.loads, f))

# Tokenise every record and wrap each split in a Dataset object.
train_dataset = Dataset.from_list(list(map(preprocess, train_data)))
test_dataset = Dataset.from_list(list(map(preprocess, test_data)))
val_dataset = Dataset.from_list(list(map(preprocess, validation_data)))

In [None]:
# Model WITHOUT WEIGHT INITIALIZATION
class DecoderOnlyTransformer(nn.Module):
    """Causal (decoder-only) transformer built from masked encoder layers.

    This variant keeps PyTorch's default parameter initialisation.

    Args:
        vocab_size: Number of distinct token ids that can be embedded/predicted.
        d_model: Size of each embedding vector / hidden state.
        n_heads: Number of attention heads per layer.
        n_layers: Number of stacked transformer layers.
        dropout: Dropout probability passed to every layer.
        max_len: Longest sequence the learned positional table supports
            (defaults to the previously hard-coded 512).

    Note:
        ``padding_idx`` is taken from the module-level ``PAD`` at construction
        time, so the model must be built while ``PAD`` matches the dataset.
        No key-padding mask is applied in ``forward``; only the causal mask is.
    """

    def __init__(self, vocab_size, d_model, n_heads, n_layers, dropout=0.1, max_len=512):
        super().__init__()
        self.max_len = max_len
        # Converts each token ID into a dense vector of dimension d_model.
        self.token_emb = nn.Embedding(vocab_size, d_model, padding_idx=PAD)
        # Learned absolute position embeddings, one row per position.
        self.pos_emb = nn.Embedding(max_len, d_model)
        # Encoder layers are used, but forward() applies a causal mask so the
        # stack behaves autoregressively like a decoder.
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=d_model, nhead=n_heads, dim_feedforward=d_model*4, dropout=dropout, batch_first=True
            ) for _ in range(n_layers)
        ])  # dim_feedforward = hidden size of each layer's feed-forward network
        self.norm = nn.LayerNorm(d_model)
        # Maps hidden states [batch, seq_len, d_model] to logits over the vocabulary.
        self.output = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return next-token logits of shape [B, T, vocab_size] for token ids ``x`` of shape [B, T].

        Raises:
            ValueError: If T exceeds ``max_len``; without this check the
                positional embedding lookup fails with an index error.
        """
        B, T = x.size()  # B is the batch size, T is the sequence length
        if T > self.max_len:
            raise ValueError(
                f"sequence length {T} exceeds the positional embedding size {self.max_len}"
            )
        positions = torch.arange(0, T, device=x.device).unsqueeze(0).expand(B, T)
        x = self.token_emb(x) + self.pos_emb(positions)
        # Causal mask: each position may attend to itself and earlier positions only.
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(T).to(x.device)
        for layer in self.layers:
            x = layer(x, src_mask=tgt_mask)  # apply each layer sequentially
        x = self.norm(x)
        return self.output(x)  # project hidden states back to vocabulary space

# Collate
def collate(batch):
    """Right-pad a list of samples to a common length and stack them into tensors.

    Inputs are padded with PAD, labels with -100, both up to the longest
    "input_ids" in the batch.

    Args:
        batch: Sequence of dicts holding list-valued "input_ids" and "labels".

    Returns:
        dict of two torch.long tensors, "input_ids" and "labels".
    """
    width = max(len(sample["input_ids"]) for sample in batch)

    input_rows, label_rows = [], []
    for sample in batch:
        ids, labs = sample["input_ids"], sample["labels"]
        input_rows.append(ids + [PAD] * (width - len(ids)))
        label_rows.append(labs + [-100] * (width - len(labs)))

    return {
        "input_ids": torch.tensor(input_rows, dtype=torch.long),
        "labels": torch.tensor(label_rows, dtype=torch.long),
    }

# Generate
def generate(model, input_seq, max_len=32):
    """Greedily extend ``input_seq`` one token at a time.

    The model is switched to eval mode. At every step the whole sequence is
    fed through the model and the arg-max of the last position is appended;
    generation stops after ``max_len`` new tokens or as soon as EOS is
    predicted (EOS itself is not appended).

    Args:
        model: Module mapping a [1, T] long tensor to [1, T, vocab] logits.
        input_seq: Prompt token ids; a copy is extended, the argument is left untouched.
        max_len: Maximum number of tokens to generate.

    Returns:
        The prompt followed by the generated token ids.
    """
    model.eval()
    tokens = input_seq[:]  # work on a copy of the prompt
    remaining = max_len
    with torch.no_grad():
        while remaining > 0:
            remaining -= 1
            target_device = next(model.parameters()).device
            batch = torch.tensor([tokens], dtype=torch.long).to(target_device)
            scores = model(batch)
            predicted = scores[0, -1].argmax().item()
            if predicted == EOS:
                break
            tokens.append(predicted)
        return tokens

# Evaluate
def evaluate(model, dataset, device, print_limit=5):
    """Compute position-wise token accuracy of greedy generations.

    For every sample the prompt (up to and including SEP) is fed to
    ``generate`` and the prediction is compared position by position with the
    reference response. The denominator is the reference length, so missing
    tokens count as errors while extra predicted tokens are not penalised.

    Args:
        model: Model passed through to ``generate``.
        dataset: Iterable of samples with an "input_ids" list.
        device: Unused; kept for interface compatibility.
        print_limit: Number of samples whose query/response are printed.

    Returns:
        Accuracy in percent; 0.0 when the dataset holds no reference tokens
        (previously this case raised ZeroDivisionError).
    """
    def tohex(x): return ''.join(f"{b:02x}" for b in x)

    correct, total, shown = 0, 0, 0
    for item in dataset:
        tokens = item["input_ids"]
        sep_idx = tokens.index(SEP)
        query = tokens[:sep_idx+1]
        true = tokens[sep_idx+1:-1]  # drop the trailing EOS

        pred = generate(model, query)
        pred = pred[pred.index(SEP)+1:]
        if EOS in pred:
            pred = pred[:pred.index(EOS)]

        # zip stops at the shorter sequence, i.e. min(len(true), len(pred)) positions.
        correct += sum(1 for t, p in zip(true, pred) if t == p)
        total += len(true)

        if shown < print_limit:
            print("\nQuery:             ", tohex(query[1:-1]))
            print("True Response:     ", tohex(true))
            print("Predicted Response:", tohex(pred))
            shown += 1

    # Guard the empty case, consistent with the other evaluation helpers.
    acc = 100 * correct / total if total else 0.0
    print(f"\n✅ Byte-Level Accuracy: {acc:.2f}%")
    return acc

def evaluate_with_validator(model, dataset, device, print_limit=5, end_address=3, save_errors=True, error_log_file="validator_failures.csv"):
    """Measure exact-match accuracy and the validator pass rate of greedy generations.

    Every sample's prompt is completed with ``generate``; the prediction is
    compared with the reference response and additionally checked with the
    project-local ``Validator`` (``check_header_ids`` and ``check_payload``).
    Any exception raised by the validator counts as a failed validation.
    All predictions are always written to ``all_predictions.csv``.

    Args:
        model: Model passed through to ``generate``.
        dataset: Iterable of samples with an "input_ids" list.
        device: Unused; kept for interface compatibility.
        print_limit: Number of samples (and validator errors) printed.
        end_address: Forwarded to ``Validator`` as its fourth argument.
        save_errors: When true and at least one validation failed, the failed
            cases are written to ``error_log_file``.
        error_log_file: CSV path for the failed cases.

    Returns:
        dict with "exact_match" and "validator_pass" (both in percent),
        "total_samples" and "failed" (list of failed-case dicts).
    """
    exact_matches = 0
    total_samples = 0
    shown = 0
    failed_validations = 0
    failed_cases = []
    all_logged_predictions = []

    # Tokenisation mode is inferred from the global vocabulary size:
    # byte-level tokens are rendered as two hex digits, hex-char tokens as one.
    is_byte_level = VOCAB_SIZE > 100
    id_to_char = {i: c for i, c in enumerate("0123456789abcdef")}

    def to_hex(token_ids):
        """Render token ids as a hex string in the active tokenisation mode."""
        if is_byte_level:
            return ''.join(f"{b:02x}" for b in token_ids)
        # A model can emit a special token (id >= 16) inside the response;
        # render it as "?" instead of aborting the evaluation with a KeyError.
        return ''.join(id_to_char.get(b, "?") for b in token_ids)

    for item in dataset:
        tokens = item["input_ids"]
        sep_idx = tokens.index(SEP)
        query = tokens[:sep_idx+1]
        true = tokens[sep_idx+1:-1]  # drop the trailing EOS

        pred = generate(model, query)
        pred = pred[pred.index(SEP)+1:]
        if EOS in pred:
            pred = pred[:pred.index(EOS)]

        q_hex = to_hex(query[1:-1])  # strip BOS and SEP
        r_true_hex = to_hex(true)
        r_pred_hex = to_hex(pred)

        is_exact = pred == true
        if is_exact:
            exact_matches += 1
        total_samples += 1

        all_logged_predictions.append({
            "query": q_hex,
            "expected_response": r_true_hex,
            "predicted_response": r_pred_hex,
            "is_exact_match": is_exact})

        try:
            val = Validator(q_hex, r_pred_hex, r_true_hex, end_address)
            val.check_header_ids()
            val.check_payload()
        except Exception as e:
            # Deliberately broad: every validator complaint is recorded as a failure.
            failed_validations += 1
            failed_cases.append({
                "query": q_hex,
                "expected_response": r_true_hex,
                "predicted_response": r_pred_hex,
                "error": str(e)
            })
            if shown < print_limit:
                print(f"\n❌ Validator Error: {e}")

        if shown < print_limit:
            print("\nQuery:             ", q_hex)
            print("True Response:     ", r_true_hex)
            print("Predicted Response:", r_pred_hex)
            shown += 1

    acc = 100 * exact_matches / total_samples if total_samples else 0
    val_rate = 100 * (total_samples - failed_validations) / total_samples if total_samples else 0

    # Save all predictions to CSV (always written, overwriting any previous file).
    all_predictions_file = "all_predictions.csv"
    with open(all_predictions_file, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["query", "expected_response", "predicted_response", "is_exact_match"])
        writer.writeheader()
        writer.writerows(all_logged_predictions)
    print(f"📄 Saved all predictions to {all_predictions_file}")
    print(f"\n✅ Exact Match Accuracy: {acc:.2f}%")
    print(f"🛡️  Validator Pass Rate: {val_rate:.2f}%")

    # Optional: save failed cases to CSV
    if save_errors and failed_cases:
        with open(error_log_file, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=["query", "expected_response", "predicted_response", "error"])
            writer.writeheader()
            writer.writerows(failed_cases)
        print(f"📝 Saved {len(failed_cases)} failed validations to {error_log_file}")

    return {
        "exact_match": acc,
        "validator_pass": val_rate,
        "total_samples": total_samples,
        "failed": failed_cases
    }

def evaluate_exact(model, dataset, device, print_limit=5):
    """Percentage of samples whose greedy generation equals the reference response exactly.

    Args:
        model: Model passed through to ``generate``.
        dataset: Iterable of samples with an "input_ids" list.
        device: Not used by this function.
        print_limit: Number of samples whose query/response are printed.

    Returns:
        Exact-match accuracy in percent (0 for an empty dataset).
    """
    def _as_hex(ids):
        return ''.join(f"{b:02x}" for b in ids)

    n_exact = n_seen = n_printed = 0

    for sample in dataset:
        ids = sample["input_ids"]
        cut = ids.index(SEP) + 1
        prompt = ids[:cut]          # BOS + query + SEP
        reference = ids[cut:-1]     # response without the trailing EOS

        generated = generate(model, prompt)
        generated = generated[generated.index(SEP) + 1:]
        if EOS in generated:
            generated = generated[:generated.index(EOS)]

        # A sample only counts when every token and the length agree.
        n_exact += int(generated == reference)
        n_seen += 1

        if n_printed < print_limit:
            print("\nQuery:             ", _as_hex(prompt[1:-1]))
            print("True Response:     ", _as_hex(reference))
            print("Predicted Response:", _as_hex(generated))
            n_printed += 1

    accuracy = 100 * n_exact / n_seen if n_seen > 0 else 0
    print(f"\n✅ Exact Match Accuracy: {accuracy:.2f}%")
    return accuracy

def evaluate_loss(model, dataset, device, loss_fn=None):
    """Average next-token cross-entropy over the response tokens (EOS included).

    For each response position the model is run on the prompt plus the true
    response prefix, and only the logits of the last position are scored.
    This costs one forward pass per response token.

    Args:
        model: Module mapping a [1, T] long tensor to [1, T, vocab] logits.
        dataset: Iterable of samples with an "input_ids" list containing SEP.
        device: Device the input tensors are moved to.
        loss_fn: Optional criterion taking ([1, vocab] logits, [1] target).
            Defaults to summed cross-entropy. Accepting it keeps callers that
            pass ``loss_fn=None`` working.

    Returns:
        Mean loss per scored token, or ``inf`` when no token was scored.
    """
    model.eval()
    if loss_fn is None:
        loss_fn = nn.CrossEntropyLoss(reduction='sum')  # sum to average later
    total_loss = 0
    total_tokens = 0

    with torch.no_grad():
        for item in dataset:
            input_ids = item["input_ids"]
            sep_idx = input_ids.index(SEP)

            query = input_ids[:sep_idx+1]
            true_response = input_ids[sep_idx+1:]  # includes EOS

            for t, next_token in enumerate(true_response):
                # model input = query + previous tokens of the response
                model_input = query + true_response[:t]
                x = torch.tensor([model_input], dtype=torch.long).to(device)

                logits = model(x)
                next_logits = logits[0, -1]  # prediction for the next token

                # Compute loss on this single token
                loss = loss_fn(next_logits.view(1, -1), torch.tensor([next_token]).to(device))
                total_loss += loss.item()
                total_tokens += 1

    avg_loss = total_loss / total_tokens if total_tokens > 0 else float("inf")
    print(f"✅ Test Loss (no leakage): {avg_loss:.4f}")
    return avg_loss

def plot_evaluation(train_loss, val_loss, accuracy, model, test_dataset, device):
    """Plot the training curves, then run and print every test-set evaluation.

    Args:
        train_loss: Per-epoch training losses.
        val_loss: Per-epoch validation losses.
        accuracy: Per-epoch accuracy values (drawn on the same axes).
        model: Trained model to evaluate.
        test_dataset: Dataset handed to each evaluation helper.
        device: Device forwarded to the evaluation helpers.

    Returns:
        None. Results are printed; the validator run also writes CSV files.
    """
    plt.figure(figsize=(8,5))
    plt.plot(train_loss, label="Train Loss")
    plt.plot(val_loss, label="Validation Loss")
    plt.plot(accuracy, label="Accuracy history" )
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Loss per Epoch")
    plt.legend()
    plt.grid(True)
    plt.show()

    print ("Bit by Bit accuracy")
    evaluate(model, test_dataset, device)
    print ("Exact Matching")
    evaluate_exact(model, test_dataset, device)
    print ("Validator Accuracy")
    evaluate_with_validator(
        model=model,
        dataset=test_dataset,
        device=device,
        print_limit=3,
        end_address=39,
        save_errors=True,
        error_log_file="validator_failures_vocab260.csv"
    )
    # No loss_fn keyword here: evaluate_loss builds its own summed
    # cross-entropy criterion, and the extra keyword made this call fail.
    evaluate_loss(model, test_dataset, device)
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"🔢 Total Trainable Parameters: {num_params:,}")

def plot_evaluation_search (train_loss, val_loss, accuracy, model, test_dataset, device):
    """Plot the training curves and return the headline test metrics.

    Intended for hyper-parameter search: runs the exact-match and validator
    evaluations and returns their percentages.

    Args:
        train_loss: Per-epoch training losses.
        val_loss: Per-epoch validation losses.
        accuracy: Per-epoch accuracy values (drawn on the same axes).
        model: Trained model to evaluate.
        test_dataset: Dataset handed to the evaluation helpers.
        device: Device forwarded to the evaluation helpers.

    Returns:
        dict with "exact_match" and "validator_pass", both in percent.
    """
    plt.figure(figsize=(8,5))
    plt.plot(train_loss, label="Train Loss")
    plt.plot(val_loss, label="Validation Loss")
    plt.plot(accuracy, label="Accuracy history" )
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Loss per Epoch")
    plt.legend()
    plt.grid(True)
    plt.show()

    evaluate_exact(model, test_dataset, device)
    print ("Validator Accuracy")
    results = evaluate_with_validator(
        model=model,
        dataset=test_dataset,
        device=device,
        print_limit=3,
        end_address=39,
        save_errors=True,
        error_log_file="validator_failures_vocab260.csv"
    )
    # evaluate_with_validator reports the pass rate under "validator_pass";
    # there is no "val_rate" key in its result dict.
    metrics = {"exact_match": results["exact_match"], "validator_pass": results["validator_pass"]}
    return metrics

# Generation helper and training loop
def autoregressive_generate(model, input_ids, max_len=64, device="cuda"):
    """Greedily append exactly ``max_len`` tokens to every row of a batch.

    The model is put in eval mode. There is no EOS handling: each step takes
    the arg-max of the last position and concatenates it to the sequence.

    Args:
        model: Module mapping [B, T] token ids to [B, T, vocab] logits.
        input_ids: 2-D tensor of prompt token ids; it is cloned, not modified.
        max_len: Number of tokens to append.
        device: Not used by this function.

    Returns:
        Tensor of shape [B, T + max_len].
    """
    model.eval()
    _batch, _length = input_ids.shape  # input must be 2-D
    sequence = input_ids.clone()

    steps_left = max_len
    while steps_left > 0:
        last_step_logits = model(sequence)[:, -1, :]  # [B, vocab] at the final position
        chosen = last_step_logits.argmax(dim=-1, keepdim=True)  # [B, 1]
        sequence = torch.cat((sequence, chosen), dim=1)
        steps_left -= 1

    return sequence

def train_transformer_change(model, train_dataloader, val_dataloader, hparams, device,  max_gen_len=64):
    """Train with Adam, reduce-on-plateau LR scheduling and early stopping.

    Training and validation both use teacher forcing: logits at position t
    are scored against the label at position t+1, ignoring labels of -100.
    After training, the weights of the epoch with the lowest validation loss
    are loaded back into ``model``.

    Args:
        model: Module mapping [B, T] token ids to [B, T, vocab] logits.
        train_dataloader: Yields dicts with "input_ids" and "labels" tensors.
        val_dataloader: Same format, used for the validation loss.
        hparams: Dict with "lr" and "epochs"; optional "early_stop_patience"
            (default 3).
        device: Device to train on.
        max_gen_len: Currently unused; kept for interface compatibility.

    Returns:
        (train_loss_history, val_loss_history, acc_history), one entry per
        executed epoch. Generation-based validation accuracy is not computed
        at present, so every ``acc_history`` entry is 0.0.

    Raises:
        ValueError: If a non-masked label lies outside the model's vocabulary.
    """
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=hparams["lr"])
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
    loss_fn = nn.CrossEntropyLoss(ignore_index=-100)  # -100 marks masked labels

    train_loss_history = []
    val_loss_history = []
    acc_history = []

    best_val_loss = float('inf')
    best_model_state = None  # stays None if no epoch ever improves (or epochs == 0)
    epochs_without_improvement = 0
    early_stop_patience = hparams.get("early_stop_patience", 3)

    for epoch in range(hparams["epochs"]):
        # -------------------- Training --------------------
        model.train()
        total_train_loss = 0
        for batch in train_dataloader:
            x = batch["input_ids"].to(device)
            y = batch["labels"].to(device)

            full_logits = model(x)
            # Take the vocabulary size from the model output rather than a
            # notebook-level global that later cells redefine.
            vocab_size = full_logits.size(-1)

            # Safety check: an out-of-range target would otherwise surface as
            # an opaque error inside the loss.
            bad_labels = y[(y != -100) & ((y < 0) | (y >= vocab_size))]
            if bad_labels.numel() > 0:
                print("❌ Invalid label values:", bad_labels)
                raise ValueError("Some labels are outside the VOCAB_SIZE range")

            logits = full_logits[:, :-1, :]
            targets = y[:, 1:]
            loss = loss_fn(logits.reshape(-1, vocab_size), targets.reshape(-1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()
        avg_train_loss = total_train_loss / len(train_dataloader)
        train_loss_history.append(avg_train_loss)

        # -------------------- Validation --------------------
        model.eval()
        # NOTE: generation-based token accuracy is disabled, so these two
        # counters are never incremented and the reported accuracy is 0.0.
        total_tokens = 0
        correct_tokens = 0
        total_val_loss = 0

        with torch.no_grad():
            for batch in val_dataloader:
                x_full = batch["input_ids"].to(device)   # BOS + query + SEP + response + EOS (+ PAD)
                y_true = batch["labels"].to(device)      # -100 on prompt and padding positions

                full_logits = model(x_full)
                logits = full_logits[:, :-1, :]
                targets = y_true[:, 1:]
                loss = loss_fn(logits.reshape(-1, full_logits.size(-1)), targets.reshape(-1))
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_dataloader)
        acc = correct_tokens / total_tokens if total_tokens > 0 else 0.0

        # Record the epoch before any early-stop decision so all three
        # histories always have the same length.
        val_loss_history.append(avg_val_loss)
        acc_history.append(acc.item() if isinstance(acc, torch.Tensor) else acc)
        print(f"Validation Accuracy: {acc:.4f}")
        print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_without_improvement = 0
            # state_dict() returns references to the live parameters, which
            # later optimizer steps overwrite; clone to keep a real snapshot.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            epochs_without_improvement += 1
            print(f"🕒 No improvement for {epochs_without_improvement} epoch(s)")
            if epochs_without_improvement >= early_stop_patience:
                print("🛑 Early stopping triggered.")
                break

        scheduler.step(avg_val_loss)  # ReduceLROnPlateau is driven by the validation loss
        for param_group in optimizer.param_groups:
            print(f"🔧 Current learning rate: {param_group['lr']}")

    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return train_loss_history, val_loss_history, acc_history





In [None]:
def best_hyper():
    """Return the hyper-parameters of the small ("best") configuration."""
    return dict(
        d_model=128,
        n_heads=4,
        n_layers=4,
        lr=5e-4,
        batch_size=16,
        epochs=50,
        early_stop_patience=5,
    )

def try_million_parameters():
    """Return the hyper-parameters of the larger configuration."""
    return dict(
        d_model=256,             # embedding size
        n_heads=8,               # must divide d_model evenly
        n_layers=6,              # number of Transformer blocks
        lr=5e-4,
        batch_size=16,
        epochs=50,
        early_stop_patience=10,
    )

In [None]:
# Experiment 1: byte-level tokens (hex -> byte), small configuration,
# default weight initialization, early stopping, learning-rate scheduling
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hparams = best_hyper()
model = DecoderOnlyTransformer(
    vocab_size=VOCAB_SIZE,
    d_model=hparams["d_model"],
    n_heads=hparams["n_heads"],
    n_layers=hparams["n_layers"],
)
train_loader = DataLoader(train_dataset, collate_fn=collate, shuffle=True, batch_size=hparams["batch_size"])
val_loader = DataLoader(val_dataset, collate_fn=collate, shuffle=False, batch_size=hparams["batch_size"])

train_loss, val_loss, accuracy = train_transformer_change(
    model, train_loader, val_loader, hparams, device, max_gen_len=64
)
plot_evaluation(train_loss, val_loss, accuracy, model, test_dataset, device)



In [None]:
# Experiment 2: byte-level tokens (hex -> byte), large configuration,
# default weight initialization, early stopping, learning-rate scheduling
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hparams = try_million_parameters()
model = DecoderOnlyTransformer(
    vocab_size=VOCAB_SIZE,
    d_model=hparams["d_model"],
    n_heads=hparams["n_heads"],
    n_layers=hparams["n_layers"],
)
train_loader = DataLoader(train_dataset, collate_fn=collate, shuffle=True, batch_size=hparams["batch_size"])
val_loader = DataLoader(val_dataset, collate_fn=collate, shuffle=False, batch_size=hparams["batch_size"])

train_loss, val_loss, accuracy = train_transformer_change(
    model, train_loader, val_loader, hparams, device, max_gen_len=64
)
plot_evaluation(train_loss, val_loss, accuracy, model, test_dataset, device)



In [None]:
# Model with weight Initialization
class DecoderOnlyTransformer(nn.Module):
    """Causal (decoder-only) transformer with explicit weight initialisation.

    Args:
        vocab_size: Number of distinct token ids that can be embedded/predicted.
        d_model: Size of each embedding vector / hidden state.
        n_heads: Number of attention heads per layer.
        n_layers: Number of stacked transformer layers.
        dropout: Dropout probability passed to every layer.

    Note:
        ``padding_idx`` is taken from the module-level ``PAD`` at construction
        time. Sequences longer than 512 tokens exceed the positional table.
    """

    def __init__(self, vocab_size, d_model, n_heads, n_layers, dropout=0.1):
        super().__init__()
        # Converts each token ID into a dense vector of dimension d_model.
        self.token_emb = nn.Embedding(vocab_size, d_model, padding_idx=PAD)
        self.pos_emb = nn.Embedding(512, d_model)
        # Encoder layers are used, but forward() applies a causal mask so the
        # stack behaves autoregressively like a decoder.
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=d_model, nhead=n_heads, dim_feedforward=d_model*4, dropout=dropout, batch_first=True
            ) for _ in range(n_layers)
        ])  # dim_feedforward = hidden size of each layer's feed-forward network
        self.norm = nn.LayerNorm(d_model)
        # Maps hidden states [batch, seq_len, d_model] to logits over the vocabulary.
        self.output = nn.Linear(d_model, vocab_size)
        self.init_weights()

    def init_weights(self):
        """Re-initialise embeddings, the output head and the transformer layers."""
        # Token & positional embeddings: small normal noise
        nn.init.normal_(self.token_emb.weight, mean=0.0, std=0.02)
        nn.init.normal_(self.pos_emb.weight, mean=0.0, std=0.02)
        # normal_ also overwrote the padding row that nn.Embedding had zeroed.
        # That row receives no gradient, so restore it to zeros explicitly.
        if self.token_emb.padding_idx is not None:
            with torch.no_grad():
                self.token_emb.weight[self.token_emb.padding_idx].zero_()

        # Output layer: Xavier for balanced activations
        nn.init.xavier_uniform_(self.output.weight)
        nn.init.constant_(self.output.bias, 0)

        # Transformer layers: Xavier for every matrix, zeros for biases.
        # 1-D parameters that are not biases (LayerNorm weights) are left alone.
        for layer in self.layers:
            for name, param in layer.named_parameters():
                if param.dim() > 1:  # weight matrices
                    if "linear" in name or "weight" in name:
                        nn.init.xavier_uniform_(param)
                elif "bias" in name:
                    nn.init.constant_(param, 0)

    def forward(self, x):
        """Return next-token logits of shape [B, T, vocab_size] for token ids ``x`` of shape [B, T]."""
        B, T = x.size()  # B is the batch size, T is the sequence length
        positions = torch.arange(0, T, device=x.device).unsqueeze(0).expand(B, T)
        x = self.token_emb(x) + self.pos_emb(positions)
        # Causal mask: each position may attend to itself and earlier positions only.
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(T).to(x.device)
        for layer in self.layers:
            x = layer(x, src_mask=tgt_mask)  # apply each layer sequentially
        x = self.norm(x)
        return self.output(x)  # project hidden states back to vocabulary space



In [None]:
# Experiment 3: byte-level tokens (hex -> byte), small configuration,
# explicit weight initialization, early stopping, learning-rate scheduling
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hparams = best_hyper()
model = DecoderOnlyTransformer(
    vocab_size=VOCAB_SIZE,
    d_model=hparams["d_model"],
    n_heads=hparams["n_heads"],
    n_layers=hparams["n_layers"],
)
train_loader = DataLoader(train_dataset, collate_fn=collate, shuffle=True, batch_size=hparams["batch_size"])
val_loader = DataLoader(val_dataset, collate_fn=collate, shuffle=False, batch_size=hparams["batch_size"])

train_loss, val_loss, accuracy = train_transformer_change(
    model, train_loader, val_loader, hparams, device, max_gen_len=64
)

plot_evaluation(train_loss, val_loss, accuracy, model, test_dataset, device)



In [None]:
# Experiment 4: byte-level tokens (hex -> byte), large configuration,
# explicit weight initialization, early stopping, learning-rate scheduling
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hparams = try_million_parameters()
model = DecoderOnlyTransformer(
    vocab_size=VOCAB_SIZE,
    d_model=hparams["d_model"],
    n_heads=hparams["n_heads"],
    n_layers=hparams["n_layers"],
)
train_loader = DataLoader(train_dataset, collate_fn=collate, shuffle=True, batch_size=hparams["batch_size"])
val_loader = DataLoader(val_dataset, collate_fn=collate, shuffle=False, batch_size=hparams["batch_size"])
train_loss, val_loss, accuracy = train_transformer_change(
    model, train_loader, val_loader, hparams, device, max_gen_len=64
)
plot_evaluation(train_loss, val_loss, accuracy, model, test_dataset, device)

In [None]:
#No weight initialization
class DecoderOnlyTransformer(nn.Module):
    """Causal transformer assembled from masked encoder layers (default PyTorch init).

    Constructor arguments:
        vocab_size -- number of distinct token ids
        d_model    -- width of embeddings and hidden states
        n_heads    -- attention heads per layer
        n_layers   -- number of stacked layers
        dropout    -- dropout probability used inside every layer
    """

    def __init__(self, vocab_size, d_model, n_heads, n_layers, dropout=0.1):
        super().__init__()
        # Token lookup table; the row at index PAD is the padding embedding.
        self.token_emb = nn.Embedding(vocab_size, d_model, padding_idx=PAD)
        # Learned absolute positions, limited to 512 slots.
        self.pos_emb = nn.Embedding(512, d_model)

        # Encoder layers plus the causal mask in forward() give decoder-style behaviour.
        self.layers = nn.ModuleList()
        for _ in range(n_layers):
            block = nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=n_heads,
                dim_feedforward=d_model*4,  # hidden width of the feed-forward sub-layer
                dropout=dropout,
                batch_first=True,
            )
            self.layers.append(block)

        self.norm = nn.LayerNorm(d_model)
        # Projects [batch, seq_len, d_model] hidden states to vocabulary logits.
        self.output = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Map token ids [B, T] to logits [B, T, vocab_size]."""
        B, T = x.size()
        position_ids = torch.arange(0, T, device=x.device)[None, :].expand(B, T)
        hidden = self.token_emb(x) + self.pos_emb(position_ids)

        # Upper-triangular mask: no position may look at later positions.
        causal_mask = nn.Transformer.generate_square_subsequent_mask(T).to(hidden.device)
        for block in self.layers:
            hidden = block(hidden, src_mask=causal_mask)

        hidden = self.norm(hidden)
        return self.output(hidden)


In [None]:
#No Conversion Hexadecimal Bytes to Decimal
# Constants (hex-character tokenisation: one token per hex digit)
BOS, SEP, EOS, PAD = 16, 17, 18, 19
VOCAB_SIZE = 20  # 16 hex chars + 4 special tokens

def preprocess_hex_chars(example):
    """Tokenise one record at hex-character level (one token per hex digit).

    The token sequence is ``[BOS] + query digits + [SEP] + response digits + [EOS]``.
    Labels carry -100 on every prompt position (BOS, query, SEP) and the real
    ids for the response and EOS. -100 is the value the training loss ignores
    and the value ``collate`` pads labels with; labelling the prompt with PAD
    instead would make the loss train and score those positions as
    "predict PAD".

    Args:
        example: Mapping with hex-string values under "query" and "response"
            (upper or lower case).

    Returns:
        dict with two equally long lists, "input_ids" and "labels".

    Raises:
        KeyError: If a character is not a hexadecimal digit.
    """
    char_to_id = {c: i for i, c in enumerate("0123456789abcdef")}
    q = [char_to_id[c.lower()] for c in example["query"]]
    r = [char_to_id[c.lower()] for c in example["response"]]
    input_ids = [BOS] + q + [SEP] + r + [EOS]
    # Mask BOS + query + SEP; supervise response + EOS.
    labels = [-100] * (len(q) + 2) + r + [EOS]
    return {
        "input_ids": input_ids,
        "labels": labels
    }

# Re-read the three JSON-lines splits (one JSON record per line).
with open("modbus_dataset.jsonl", "r") as f:
    train_data = list(map(json.loads, f))

with open("modbus_dataset_test.jsonl", "r") as f:
    test_data = list(map(json.loads, f))

with open("modbus_dataset_validation.jsonl", "r") as f:
    validation_data = list(map(json.loads, f))

# Rebuild the Dataset objects with hex-character tokenisation.
train_dataset = Dataset.from_list(list(map(preprocess_hex_chars, train_data)))
test_dataset = Dataset.from_list(list(map(preprocess_hex_chars, test_data)))
val_dataset = Dataset.from_list(list(map(preprocess_hex_chars, validation_data)))

In [None]:
# Experiment 1b: hex-character tokens (no byte translation), small configuration,
# default weight initialization, early stopping, learning-rate scheduling
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hparams = best_hyper()
model = DecoderOnlyTransformer(
    vocab_size=VOCAB_SIZE,
    d_model=hparams["d_model"],
    n_heads=hparams["n_heads"],
    n_layers=hparams["n_layers"],
)
train_loader = DataLoader(train_dataset, collate_fn=collate, shuffle=True, batch_size=hparams["batch_size"])
val_loader = DataLoader(val_dataset, collate_fn=collate, shuffle=False, batch_size=hparams["batch_size"])

train_loss, val_loss, accuracy = train_transformer_change(
    model, train_loader, val_loader, hparams, device, max_gen_len=64
)
plot_evaluation(train_loss, val_loss, accuracy, model, test_dataset, device)



In [None]:
# Experiment 2b: hex-character tokens (no byte translation), large configuration,
# default weight initialization, early stopping, learning-rate scheduling
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hparams = try_million_parameters()
model = DecoderOnlyTransformer(
    vocab_size=VOCAB_SIZE,
    d_model=hparams["d_model"],
    n_heads=hparams["n_heads"],
    n_layers=hparams["n_layers"],
)
train_loader = DataLoader(train_dataset, collate_fn=collate, shuffle=True, batch_size=hparams["batch_size"])
val_loader = DataLoader(val_dataset, collate_fn=collate, shuffle=False, batch_size=hparams["batch_size"])

train_loss, val_loss, accuracy = train_transformer_change(
    model, train_loader, val_loader, hparams, device, max_gen_len=64
)
plot_evaluation(train_loss, val_loss, accuracy, model, test_dataset, device)


In [None]:
# Model with weight Initialization
class DecoderOnlyTransformer(nn.Module):
    """Autoregressive Transformer language model with explicit weight initialization.

    The network is a stack of ``nn.TransformerEncoderLayer`` modules that is
    made causal by passing an upper-triangular ``-inf`` attention mask in
    ``forward``, so every position can only attend to itself and earlier
    positions.

    Args:
        vocab_size: Number of distinct token ids that can be embedded / predicted.
        d_model: Size of each embedding vector (model width).
        n_heads: Number of attention heads per layer.
        n_layers: Number of stacked Transformer layers.
        dropout: Dropout probability passed to every Transformer layer.
        max_len: Number of rows in the learned positional embedding table,
            i.e. the longest sequence ``forward`` accepts. Defaults to 512.
    """

    def __init__(self, vocab_size, d_model, n_heads, n_layers, dropout=0.1, max_len=512):
        super().__init__()
        # Converts each token ID into a dense vector of dimension d_model.
        self.token_emb = nn.Embedding(vocab_size, d_model, padding_idx=PAD)
        # Learned absolute positional embeddings, one row per position.
        self.pos_emb = nn.Embedding(max_len, d_model)
        # "Encoder" layers are used, but the causal mask applied in forward()
        # makes the stack autoregressive like a decoder.
        # dim_feedforward is the hidden size of each layer's feed-forward network.
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=n_heads,
                dim_feedforward=d_model * 4,
                dropout=dropout,
                batch_first=True,
            )
            for _ in range(n_layers)
        ])
        self.norm = nn.LayerNorm(d_model)
        # Maps hidden states [batch, seq_len, d_model] to logits over the vocabulary.
        self.output = nn.Linear(d_model, vocab_size)
        self.init_weights()

    def init_weights(self):
        """Re-initialize all parameters.

        * Embeddings: normal noise with std 0.02; the padding row is reset to
          zero afterwards.
        * Output projection and every matrix inside the Transformer layers:
          Xavier-uniform.
        * Biases: zero.
        * 1-D non-bias parameters (the LayerNorm scales) keep PyTorch's defaults.
        """
        nn.init.normal_(self.token_emb.weight, mean=0.0, std=0.02)
        nn.init.normal_(self.pos_emb.weight, mean=0.0, std=0.02)
        # normal_ above overwrote the zero vector nn.Embedding set up for
        # padding_idx. That row never receives gradient, so without this reset
        # the padding token would carry frozen random noise for the whole run.
        pad_idx = self.token_emb.padding_idx
        if pad_idx is not None:
            with torch.no_grad():
                self.token_emb.weight[pad_idx].zero_()

        # Output layer: Xavier for balanced activations.
        nn.init.xavier_uniform_(self.output.weight)
        nn.init.zeros_(self.output.bias)

        for layer in self.layers:
            for name, param in layer.named_parameters():
                if param.dim() > 1:
                    # Every matrix parameter of nn.TransformerEncoderLayer is a
                    # "*weight" (attention projections and feed-forward layers).
                    nn.init.xavier_uniform_(param)
                elif "bias" in name:
                    nn.init.zeros_(param)

    def forward(self, x):
        """Compute next-token logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids with shape ``[batch, seq_len]``.

        Returns:
            Tensor of shape ``[batch, seq_len, vocab_size]`` holding, for each
            position, unnormalized scores over the vocabulary.

        Raises:
            ValueError: If ``seq_len`` exceeds the positional embedding table.
        """
        B, T = x.size()  # B is the batch size, T is the sequence length
        max_len = self.pos_emb.num_embeddings
        if T > max_len:
            # Fail early with a readable message instead of an out-of-range
            # embedding lookup (which surfaces as a device-side assert on CUDA).
            raise ValueError(
                f"Sequence length {T} exceeds the maximum supported length {max_len}"
            )

        positions = torch.arange(T, device=x.device).unsqueeze(0).expand(B, T)
        hidden = self.token_emb(x) + self.pos_emb(positions)

        # Causal mask: -inf strictly above the diagonal, 0 elsewhere, so a
        # position can attend only to itself and earlier positions. Built
        # directly on the input's device to avoid a host-to-device copy.
        causal_mask = torch.triu(
            torch.full((T, T), float("-inf"), device=x.device), diagonal=1
        )
        for layer in self.layers:
            hidden = layer(hidden, src_mask=causal_mask)

        # Final normalization, then projection back to vocabulary space.
        return self.output(self.norm(hidden))



In [None]:
# Experiment 3b
#   - no translation
#   - best configuration (best_hyper)
#   - weight initialization
#   - early stop + learning rate scheduling
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
hparams = best_hyper()
model = DecoderOnlyTransformer(
    VOCAB_SIZE,
    hparams["d_model"],
    hparams["n_heads"],
    hparams["n_layers"],
)

# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(
    train_dataset,
    batch_size=hparams["batch_size"],
    shuffle=True,
    collate_fn=collate,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=hparams["batch_size"],
    shuffle=False,
    collate_fn=collate,
)

# Train, then evaluate/plot against the held-out test split.
train_loss, val_loss, accuracy = train_transformer_change(
    model, train_loader, val_loader, hparams, device, max_gen_len=64
)
plot_evaluation(train_loss, val_loss, accuracy, model, test_dataset, device)


In [None]:
# Experiment 4b
#   - no translation
#   - million configuration (try_million_parameters)
#   - weight initialization
#   - early stop + learning rate scheduling
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
hparams = try_million_parameters()
model = DecoderOnlyTransformer(
    VOCAB_SIZE,
    hparams["d_model"],
    hparams["n_heads"],
    hparams["n_layers"],
)

# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(
    train_dataset,
    batch_size=hparams["batch_size"],
    shuffle=True,
    collate_fn=collate,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=hparams["batch_size"],
    shuffle=False,
    collate_fn=collate,
)

# Train, then evaluate/plot against the held-out test split.
train_loss, val_loss, accuracy = train_transformer_change(
    model, train_loader, val_loader, hparams, device, max_gen_len=64
)
plot_evaluation(train_loss, val_loss, accuracy, model, test_dataset, device)


In [None]:
import pandas as pd
# Grid search: try different configurations using weight initialization,
# learning rate scheduling, early stopping, and no translation during tokenization.
# Each entry is a complete hyper-parameter dict consumed by train_transformer_change.
configs = [
    {"d_model": 128, "n_heads": 4, "n_layers": 4, "lr": 0.0005, "batch_size": 16, "epochs": 500, "early_stop_patience": 10},
    {"d_model": 128, "n_heads": 2, "n_layers": 4, "lr": 0.0001, "batch_size": 16, "epochs": 250, "early_stop_patience": 10},
    {"d_model": 128, "n_heads": 4, "n_layers": 8, "lr": 0.0005, "batch_size": 16, "epochs": 100, "early_stop_patience": 10},
    {"d_model": 128, "n_heads": 8, "n_layers": 2, "lr": 0.0005, "batch_size": 16, "epochs": 50, "early_stop_patience": 10},
    {"d_model": 256, "n_heads": 8, "n_layers": 4, "lr": 0.0005, "batch_size": 16, "epochs": 50,  "early_stop_patience": 10},
    {"d_model": 256, "n_heads": 8, "n_layers": 8, "lr": 0.0001, "batch_size": 16, "epochs": 250,  "early_stop_patience": 10},
    {"d_model": 256, "n_heads": 8, "n_layers": 8, "lr": 0.0001, "batch_size": 16, "epochs": 500,  "early_stop_patience": 10},
    {"d_model": 128, "n_heads": 16, "n_layers": 8, "lr": 0.0005, "batch_size": 16, "epochs": 50, "early_stop_patience": 10},
    {"d_model": 128, "n_heads": 16, "n_layers": 4, "lr": 0.0005, "batch_size": 16, "epochs": 50, "early_stop_patience": 10},
    {"d_model": 256, "n_heads": 8, "n_layers": 4, "lr": 0.0001, "batch_size": 16, "epochs": 50, "early_stop_patience": 10},
]

# Larger models: smaller batch sizes and learning rates as the width/depth grows.
configs += [
    {"d_model": 512, "n_heads": 8,  "n_layers": 6,  "lr": 0.0005, "batch_size": 16, "epochs": 100, "early_stop_patience": 5},
    {"d_model": 512, "n_heads": 16, "n_layers": 8,  "lr": 0.0003, "batch_size": 16, "epochs": 100, "early_stop_patience": 5},
    {"d_model": 512, "n_heads": 16, "n_layers": 12, "lr": 0.0002, "batch_size": 16, "epochs": 200, "early_stop_patience": 10},
    {"d_model": 768, "n_heads": 12, "n_layers": 12, "lr": 0.0001, "batch_size": 8,  "epochs": 200, "early_stop_patience": 10},
    {"d_model": 1024,"n_heads": 16, "n_layers": 12, "lr": 0.0001, "batch_size": 8,  "epochs": 250, "early_stop_patience": 10},
    {"d_model": 1024,"n_heads": 16, "n_layers": 16, "lr": 0.00005,"batch_size": 4,  "epochs": 300, "early_stop_patience": 12},
    {"d_model": 1024,"n_heads": 32, "n_layers": 16, "lr": 0.00003,"batch_size": 4,  "epochs": 500, "early_stop_patience": 20},
    {"d_model": 1536,"n_heads": 24, "n_layers": 18, "lr": 0.00003,"batch_size": 4,  "epochs": 500, "early_stop_patience": 20},
    {"d_model": 2048,"n_heads": 32, "n_layers": 24, "lr": 0.00002,"batch_size": 2,  "epochs": 500, "early_stop_patience": 25},
    {"d_model": 2048,"n_heads": 32, "n_layers": 32, "lr": 0.00001,"batch_size": 2,  "epochs": 1000,"early_stop_patience": 30},
    {"d_model": 1024,"n_heads": 8,  "n_layers": 24, "lr": 0.00005,"batch_size": 8,  "epochs": 500, "early_stop_patience": 15},
    {"d_model": 1536,"n_heads": 16, "n_layers": 20, "lr": 0.00002,"batch_size": 4,  "epochs": 600, "early_stop_patience": 20},
    {"d_model": 512, "n_heads": 8,  "n_layers": 16, "lr": 0.0002, "batch_size": 8,  "epochs": 300, "early_stop_patience": 8},
    {"d_model": 1024,"n_heads": 16, "n_layers": 20, "lr": 0.00002,"batch_size": 4,  "epochs": 600, "early_stop_patience": 25},
]

# Tabular view of the search space.
df_configs = pd.DataFrame(configs, columns=["d_model", "n_heads", "n_layers", "lr", "batch_size", "epochs", "early_stop_patience"])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

results_path = "NoTranslation_grid_search_results.csv"
results = []

for idx, hparams in enumerate(configs):
    print(f"\n🔁 Testing Model {idx+1} with config: {hparams}")

    # Build the loaders per configuration so that each config's own batch_size
    # is actually used (previously they were built once, before the loop, from
    # whatever `hparams` an earlier cell had left in the kernel).
    train_loader = DataLoader(train_dataset, batch_size=hparams["batch_size"], shuffle=True, collate_fn=collate)
    val_loader = DataLoader(val_dataset, batch_size=hparams["batch_size"], shuffle=False, collate_fn=collate)

    model = DecoderOnlyTransformer(
        vocab_size=VOCAB_SIZE,
        d_model=hparams["d_model"],
        n_heads=hparams["n_heads"],
        n_layers=hparams["n_layers"],
    ).to(device)
    train_loss, val_loss, accuracy = train_transformer_change(model, train_loader, val_loader, hparams, device, max_gen_len=64)
    values = plot_evaluation_search(train_loss, val_loss, accuracy, model, test_dataset, device)

    acc = values["exact_match"]
    validator = values["validator_pass"]
    print(f"🧠 Exact Match: {acc:.4f}%")
    print(f"🧠 Validator percentage: {validator:.4f}%")

    # hparams is a plain dict, so unpack it directly (dict has no .to_dict()).
    results.append({
        **hparams,
        "Exact Match": acc,
        "Validator Percentage": validator
    })

    # Checkpoint after every configuration: a crash in a later (much larger)
    # model must not discard the results already collected.
    pd.DataFrame(results).to_csv(results_path, index=False)

results_df = pd.DataFrame(results)
results_df.to_csv(results_path, index=False)
print(f"\n✅ Grid search complete. Results saved to {results_path}.")

