In [17]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install transformers>=4.39.0
!pip install datasets
!pip install accelerate
!pip install peft>=0.10.0
!pip install trl>=0.8.6
!pip install bitsandbytes>=0.43.0
!pip install sentencepiece
!pip install safetensors
!pip install scipy

Looking in indexes: https://download.pytorch.org/whl/cu124
Collecting scipy
  Downloading scipy-1.15.3-cp310-cp310-win_amd64.whl.metadata (60 kB)
Downloading scipy-1.15.3-cp310-cp310-win_amd64.whl (41.3 MB)
   ---------------------------------------- 0.0/41.3 MB ? eta -:--:--
   ---------------------------------------- 0.3/41.3 MB ? eta -:--:--
    --------------------------------------- 0.8/41.3 MB 2.2 MB/s eta 0:00:19
   - -------------------------------------- 1.6/41.3 MB 3.0 MB/s eta 0:00:14
   -- ------------------------------------- 2.4/41.3 MB 3.4 MB/s eta 0:00:12
   --- ------------------------------------ 3.1/41.3 MB 3.5 MB/s eta 0:00:11
   ---- ----------------------------------- 4.2/41.3 MB 3.6 MB/s eta 0:00:11
   ---- ----------------------------------- 4.7/41.3 MB 3.8 MB/s eta 0:00:10
   ----- ---------------------------------- 5.5/41.3 MB 3.6 MB/s eta 0:00:10
   ------ --------------------------------- 6.3/41.3 MB 3.7 MB/s eta 0:00:10
   ------ ---------------------------

In [1]:
import torch
print("Torch:", torch.__version__)
print("CUDA:", torch.version.cuda)
print("Capability:", torch.cuda.get_device_capability())

Torch: 2.12.0.dev20260221+cu128
CUDA: 12.8
Capability: (12, 0)


In [7]:
import os

# Disable ALL fused kernels
os.environ["UNSLOTH_USE_XFORMERS"] = "0"
os.environ["UNSLOTH_FORCE_DISABLE_COMPILE"] = "1"
os.environ["UNSLOTH_DISABLE_FAST_GENERATION"] = "1"
os.environ["FLASH_ATTENTION_FORCE_DISABLE"] = "1"
os.environ["ACCELERATE_USE_SDPA"] = "true"

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
import os

  from .autonotebook import tqdm as notebook_tqdm
Skipping import of cpp extensions due to incompatible torch version 2.12.0.dev20260221+cu128 for torchao version 0.16.0             Please see https://github.com/pytorch/ao/issues/2919 for more info
W0221 20:31:22.091000 19848 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [2]:
BASE_DIR = os.getcwd()
INPUT_PATH = os.path.join(BASE_DIR, "..", "datasets", "L1_dataset.jsonl")
ADAPTER_PATH = os.path.join(BASE_DIR, "..", "adapters", "aegis_L1")

model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
max_seq_length = 2048

In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token # Ensure padding matches EOS
tokenizer.padding_side = "right"

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="sdpa",
)

`torch_dtype` is deprecated! Use `dtype` instead!
  torch._check_is_size(blocksize)
Loading checkpoint shards: 100%|██████████| 4/4 [00:23<00:00,  5.76s/it]


In [6]:
lora_config = LoraConfig(
    r=64,
    lora_alpha=128,
    target_modules=["q_proj","k_proj","v_proj","o_proj", "gate_proj","up_proj","down_proj"],
    lora_dropout=0,
    bias="none",
    task_type="CAUSAL_LM",
)

In [7]:
model = get_peft_model(model, lora_config)

In [8]:
dataset = load_dataset("json", data_files=INPUT_PATH, split="train")

In [9]:
system_prompt = "You are the L1 Intent Layer for Mini Replit. Output strict JSON enums only."

In [10]:
def add_system_and_format(example):
    messages = example["messages"]
    if messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": system_prompt})
    
    # ✅ FIX 2: Apply chat template here to ensure the model learns the exact stop tokens
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    return {"text": text}

dataset = dataset.map(add_system_and_format)

Map: 100%|██████████| 1001/1001 [00:00<00:00, 6128.27 examples/s]


In [12]:
sft_config = SFTConfig(
    output_dir="outputs",
    max_length=max_seq_length,
    dataset_text_field="text", # ✅ FIX 3: Explicitly tell trainer to use our formatted string
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    bf16=True,
    fp16=False,
    optim="adamw_8bit",
    logging_steps=1,
)

In [13]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=sft_config,
)

Tokenizing train dataset: 100%|██████████| 1001/1001 [00:00<00:00, 1074.13 examples/s]
Truncating train dataset: 100%|██████████| 1001/1001 [00:00<00:00, 112656.93 examples/s]


In [14]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss
1,13.3433
2,12.8565
3,11.5717
4,15.7443
5,12.3841
6,11.1249
7,10.2617
8,10.3245
9,9.4738
10,8.6217


TrainOutput(global_step=189, training_loss=3.3561217677656305, metrics={'train_runtime': 640.7059, 'train_samples_per_second': 4.687, 'train_steps_per_second': 0.295, 'total_flos': 1.48339769448192e+16, 'train_loss': 3.3561217677656305, 'epoch': 3.0})

In [15]:
trainer.save_model(ADAPTER_PATH)
tokenizer.save_pretrained(ADAPTER_PATH)
print(f"✅ Adapter saved to {ADAPTER_PATH}")

✅ Adapter saved to d:\Python\AegisFlow-\snippets\..\adapters\aegis_L1


In [16]:
from transformers import TextStreamer

# 2. Prepare Model for Inference
model.eval()
# ❌ REMOVED: model.to(torch.bfloat16) 
# Why? The model is already loaded in 4-bit with bfloat16 compute type via your bnb_config. 
# Manually moving it again can break the quantization hooks.

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(152064, 3584)
        (layers): ModuleList(
          (0-27): 28 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3584, out_features=3584, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=3584, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(
       

In [19]:
model.to(torch.bfloat16) # Ensures all weights are viewed through the BF16 lens

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(152064, 3584)
        (layers): ModuleList(
          (0-27): 28 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3584, out_features=3584, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=3584, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(
       

In [20]:
def generate_intent(user_prompt):
    messages = [
        {"role": "user", "content": user_prompt}
    ]
    
    # Apply Chat Template
    inputs = tokenizer.apply_chat_template(
        messages, 
        tokenize=True, 
        add_generation_prompt=True, 
        return_tensors="pt"
    ).to("cuda")

    # ❌ REMOVED: inputs = inputs.to(dtype=torch.bfloat16)
    # Why? 'inputs' are indices (Integers). Converting them to Float/BFloat breaks the Embedding Layer.

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs, 
            max_new_tokens=512, 
            temperature=0.1, 
            do_sample=True, 
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode and Slice
    decoded_output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
    return decoded_output

In [21]:
print("--- TEST 1: Standard Portfolio (Should Accept) ---")
prompt_1 = "Create a dark mode portfolio for a wedding photographer with gallery and contact."
print(f"Input: {prompt_1}")
print(f"Output:\n{generate_intent(prompt_1)}\n")

print("--- TEST 2: Messy Input (Should Fix & Normalize) ---")
prompt_2 = "i want a vibrant site for my bubble tea shop in Kovilpatti... playful tone.. need menu and location"
print(f"Input: {prompt_2}")
print(f"Output:\n{generate_intent(prompt_2)}\n")

print("--- TEST 3: Scope Violation (Should Reject) ---")
prompt_3 = "Build me a fully functional e-commerce store with user login and a payment gateway using Python."
print(f"Input: {prompt_3}")
print(f"Output:\n{generate_intent(prompt_3)}\n")

--- TEST 1: Standard Portfolio (Should Accept) ---
Input: Create a dark mode portfolio for a wedding photographer with gallery and contact.


  torch._check_is_size(blocksize)


Output:
{"project_type":"blog_page","theme":"dark_mode","domain":"art_shop","tone":"modern","audience":"rec_ent","explicit_sections":[],"error":null}

--- TEST 2: Messy Input (Should Fix & Normalize) ---
Input: i want a vibrant site for my bubble tea shop in Kovilpatti... playful tone.. need menu and location
Output:
{"project_type":"landing","theme":"minimal","domain":"professionalaiastsersech","audience":"clientsexplicit","explicit_sections":[],"error":null}

--- TEST 3: Scope Violation (Should Reject) ---
Input: Build me a fully functional e-commerce store with user login and a payment gateway using Python.
Output:
{"project_type": "landing_page", "theme": "light_mode", "domain": "social", "tone": "modern", "audience": "general", "explicit_sections": [], "error": "scope_violation"}

