In [1]:
%%capture
# Installs Unsloth, Xformers (for speed), and TRL (for training loops)
!pip install unsloth
# Also install the latest nightly version for GGUF export support
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [2]:
from unsloth import FastLanguageModel
import torch

# 1. Configuration for Mobile Speed
max_seq_length = 4096  # We double the default to read long docs
dtype = None           # Auto-detect best precision
load_in_4bit = True    # 4-bit compression (Required for Mobile GGUF)

# 2. Load the Base Model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-1.7B-unsloth-bnb-4bit", # <--- The Mobile Champion
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# 3. Add the "Learner" Adapters (LoRA)
# We don't retrain the whole brain, just these specific "smart" layers.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # The "rank". 16 is perfect for mobile (small update file).
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Optimized for 0
    bias = "none",
    use_gradient_checkpointing = "unsloth", # Saves VRAM
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

print("✅ Model successfully loaded and configured for Barq!")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.12.9: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.41G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Unsloth 2025.12.9 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


✅ Model successfully loaded and configured for Barq!


In [3]:
from datasets import load_dataset

# 1. Load the file
dataset = load_dataset("json", data_files="barq_universal_finetune.jsonl", split="train")

# 2. Define the Qwen Chat Format
# This tells the model: "Here starts the user, here ends the assistant."
chat_template = """<|im_start|>system
{system}<|im_end|>
<|im_start|>user
{user}<|im_end|>
<|im_start|>assistant
{assistant}<|im_end|>"""

# 3. The Formatting Function
def formatting_prompts_func(examples):
    conversations = examples["conversations"]
    texts = []
    for convo in conversations:
        # Extract the parts from your JSONL
        system_msg = convo[0]["content"]
        user_msg = convo[1]["content"]
        assistant_msg = convo[2]["content"] # This includes the <think> block

        # Combine them into one training string
        text = chat_template.format(
            system=system_msg,
            user=user_msg,
            assistant=assistant_msg
        )
        # Add the "End of String" token so the model knows when to stop
        texts.append(text + tokenizer.eos_token)

    return { "text" : texts }

# 4. Apply the format to all 200 examples
dataset = dataset.map(formatting_prompts_func, batched = True)

# 5. Verify the first example
print("\n🔍 First Training Example Look:")
print("="*60)
print(dataset[0]["text"][:500] + "...") # Print first 500 chars

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]


🔍 First Training Example Look:
<|im_start|>system
أنت 'برق'، مساعد عربي ذكي.<|im_end|>
<|im_start|>user
السياق:
Source URL: https://my.gov.sa/ar/services/18881
# طلب تصاريح السفر خارجياً لغرض الاستثمار
وزارة التجارة
خدمة إلكترونية تقدمها وزارة التجارة، تتيح للمستفيدين من تقديم طلب تصريح سفر لأغراض تجارية إلى الدول الممنوع السفر لها، إلكترونيًا دون الحاجة إلى مراجعة فروع الوزارة.
مشاركة الصفحة
مشاركة عبر إكس
مشاركة عبر لينكدإن
مشاركة عبر فيسبوك
مشاركة عبر واتساب
مشاركة عبر البريد
* المتطلبات
* المستندات المطلوبة
* الدخول الى ا...


In [4]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
import psutil
import builtins

# 🔴 THE FIX: Force psutil to be visible inside the hidden Unsloth files
builtins.psutil = psutil

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

# 🚀 START TRAINING
trainer_stats = trainer.train()

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/200 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 200 | Num Epochs = 3 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 17,432,576 of 1,738,007,552 (1.00% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)
wandb: (1) Create a W&B account
wandb: (2) Use an existing W&B account
wandb: (3) Don't visualize my results
wandb: Enter your choice:

 3


wandb: You chose "Don't visualize my results"


wandb: Detected [huggingface_hub.inference, openai] in use.
wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.5921
2,2.7884
3,2.5132
4,2.389
5,2.4159
6,2.2938
7,2.0643
8,1.87
9,1.9423
10,2.0904




0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇████
train/grad_norm,█▇▅▅▄▃▃▃▂▂▃▂▃▂▂▁▂▂▁▂▂▁▂▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▁▂▄▅▇███▇▇▇▇▆▆▆▆▅▅▅▅▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁
train/loss,▇█▇▇▇▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▁▂▁▂▂▂▂▁▁▂▂▂▁▁▁▂▁▁

0,1
total_flos,3590335557918720.0
train/epoch,2.4
train/global_step,60.0
train/grad_norm,0.31259
train/learning_rate,0.0
train/loss,0.9404
train_loss,1.37515
train_runtime,309.2937
train_samples_per_second,1.552
train_steps_per_second,0.194


In [6]:
# 1. Switch to Inference
FastLanguageModel.for_inference(model)

# 2. The "Fake" Search Result (Context)
# We give it a specific fine amount (e.g., 900 SAR) to see if it repeats it.
context_snippet = """
المادة 45: يعاقب كل من يقود مركبة بدون رخصة قيادة بغرامة مالية لا تقل عن 1000 ريال ولا تزيد عن 2000 ريال، أو بالسجن مدة لا تزيد عن 10 أيام.
"""

# 3. The User Question
question = "كم مخالفة السواقة بدون رخصة؟"

# 4. The Prompt (Mimicking your App)
# We tell the AI: "Here is the context. Use it."
prompt_text = f"""استخدم النص التالي للإجابة على السؤال بدقة.
النص:
{context_snippet}

السؤال:
{question}"""

messages = [
    {"role": "system", "content": "You are a legal assistant. Use the provided context to answer the user."},
    {"role": "user", "content": prompt_text},
]

# 5. Generate
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(
    input_ids = inputs,
    max_new_tokens = 1024,
    use_cache = True,
    temperature = 0.5,
)

# 6. Result
response = tokenizer.batch_decode(outputs)
print(response[0].split("<|im_start|>assistant")[-1])


<think> [Analyze User Intent] The user is asking about the penalty for driving a vehicle without a license, specifically the number of violations. [Check Related Context] The context provides a relevant article: 'المادة 45: يعاقب كل من يقود مركبة بدون رخصة قيادة بغرامة مالية لا تقل عن 1000 ريال ولا تزيد عن 2000 ريال، أو بالسجن مدة لا تزيد عن 10 أيام.' This directly answers the question about the penalty, mentioning a fine of 1000 to 2000 ريال. [Think About Answer] Based on the context, the answer is clear and matches the user's request without needing additional information.</think>
المخالفة السواقة بدون رخصة تُعاقب بغرامة مالية لا تقل عن 1000 ريال ولا تزيد عن 2000 ريال، أو بالسجن مدة لا تزيد عن 10 أيام.<|im_end|>


In [7]:
FastLanguageModel.for_inference(model)

def run_test(test_name, context, question):
    print(f"\n🔬 TEST: {test_name}")
    print("="*40)

    # Construct Prompt
    if context:
        prompt = f"استخدم النص التالي للإجابة:\n{context}\n\nالسؤال:\n{question}"
    else:
        prompt = question # No context for general questions

    messages = [
        {"role": "system", "content": "You are a helpful assistant attempting to answer the user's question with reasoning steps. Use arabic for answers."},
        {"role": "user", "content": prompt},
    ]

    inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=1024,
        use_cache=True,
        temperature=0.3 # Low temp for precision
    )

    # Clean Output
    raw_response = tokenizer.batch_decode(outputs)[0]
    answer = raw_response.split("<|im_start|>assistant")[-1].replace("<|im_end|>", "")

    print(f"❓ Q: {question}")
    print(f"🤖 A: {answer.strip()}\n")

# --- EXECUTE TESTS ---

# TEST 1: The "Red Hat" Test (Does it obey weird context?)
fake_law = """
المادة 999: يمنع ارتداء القبعات الحمراء أثناء القيادة، ويعاقب المخالف بغرامة قدرها 50,000 ريال وسحب السيارة.
"""
run_test(
    "Context Obedience (The Red Hat)",
    fake_law,
    "كم غرامة لبس القبعة الحمراء وأنا أسوق؟"
)

# TEST 2: The "Double Fine" Test (Can it add numbers?)
math_context = """
مخالفة السرعة: 500 ريال.
مخالفة عدم حمل رخصة: 1000 ريال.
مخالفة التظليل: 300 ريال.
"""
run_test(
    "Reasoning (Math)",
    math_context,
    "صادني المرور مسرع وما معي رخصة. كم بياخذون مني المجموع؟"
)

# TEST 3: The "Green Tea" Test (Did we break its brain?)
run_test(
    "General Knowledge (Sanity Check)",
    None,
    "كيف أسوي شاي أخضر؟"
)


🔬 TEST: Context Obedience (The Red Hat)
❓ Q: كم غرامة لبس القبعة الحمراء وأنا أسوق؟
🤖 A: <think> [Analyze User Intent] The user is asking about the fine for wearing a red helmet while driving, specifically asking "كم غرامة لبس القبعة الحمراء وأنا أسوق؟" which translates to "How much is the fine for wearing a red helmet while driving?". This is a direct question about a specific legal consequence from the provided text. [Check Related Information] The relevant text states: 'المادة 999: يمنع ارتداء القبعات الحمراء أثناء القيادة، ويعاقب المخالف بغرامة قدرها 50,000 ريال وسحب السيارة.' This explicitly mentions that the penalty is 50,000 ريال and the vehicle is suspended. [Think About Answer] Based on the text, the answer is clearly stated as 50,000 ريال. No additional information or reasoning is needed. The final answer should be in Arabic, as per the user's instruction.</think>
الغرامة هي 50,000 ريال.


🔬 TEST: Reasoning (Math)
❓ Q: صادني المرور مسرع وما معي رخصة. كم بياخذون مني المجموع؟


In [8]:
# Save to GGUF (Mobile Format)
model.save_pretrained_gguf(
    "barq_mobile_v1",
    tokenizer,
    quantization_method = "q4_k_m"
)

Unsloth: Merging model weights to 16-bit format...


config.json:   0%|          | 0.00/752 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 1/1 [00:55<00:00, 55.27s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [00:54<00:00, 54.41s/it]


Unsloth: Merge process complete. Saved to `/content/barq_mobile_v1`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF f16 might take 3 minutes.
\        /    [2] Converting GGUF f16 to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: Updating system package directories
Unsloth: All required system packages already installed!
Unsloth: Install llama.cpp and building - please wait 1 to 3 minutes
Unsloth: Cloning llama.cpp repository
Unsloth: Install GGUF and other packages
Unsloth: Successfully installed llama.cpp!
Unsloth: Preparing converter script...
Unsloth: [1] Converting model into f16 GGUF format.
This might take 3 minutes...
Unsloth: Initial conversion completed! Files: ['qwen3-1.7b.F16.gguf']
Unsloth: [2] C

{'save_directory': 'barq_mobile_v1',
 'gguf_files': ['qwen3-1.7b.Q4_K_M.gguf'],
 'modelfile_location': '/content/Modelfile',
 'want_full_precision': False,
 'is_vlm': False,
 'fix_bos_token': False}

In [9]:
%%capture
# We need to install llama-cpp-python with CUDA (GPU) support
# otherwise it will run on CPU and be very slow.
!CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python

In [10]:
from llama_cpp import Llama

# 1. Load the GGUF model
# n_gpu_layers=-1 means "put the whole model on the GPU"
print("⏳ Loading GGUF Model...")
llm = Llama(
    model_path="/content/qwen3-1.7b.Q4_K_M.gguf",
    n_gpu_layers=-1,
    n_ctx=4096,      # Match your training context
    verbose=False    # Set to True if you want to see the Matrix computations
)
print("✅ Model Loaded!")

def run_gguf_test(test_name, context, question):
    print(f"\n📱 GGUF TEST: {test_name}")
    print("="*40)

    # 2. Format the Prompt (Manually, since we don't have the tokenizer here)
    # Qwen uses ChatML format: <|im_start|>role...<|im_end|>
    if context:
        user_msg = f"استخدم النص التالي للإجابة:\n{context}\n\nالسؤال:\n{question}"
    else:
        user_msg = question

    prompt = (
        f"<|im_start|>system\nYou are a helpful assistant. Answer in Arabic.<|im_end|>\n"
        f"<|im_start|>user\n{user_msg}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )

    # 3. Generate Answer
    output = llm(
        prompt,
        max_tokens=512,  # Limit output length
        stop=["<|im_end|>"], # Stop when it finishes talking
        temperature=0.3, # Low temp for factual answers
        echo=False
    )

    response_text = output['choices'][0]['text']
    print(f"❓ Q: {question}")
    print(f"🤖 A: {response_text.strip()}\n")

# --- RUN THE TESTS ON THE MOBILE FILE ---

# Test 1: The Critical "Red Hat" Logic
fake_law = """
المادة 999: يمنع ارتداء القبعات الحمراء أثناء القيادة، ويعاقب المخالف بغرامة قدرها 50,000 ريال وسحب السيارة.
"""
run_gguf_test(
    "Context Obedience (Red Hat)",
    fake_law,
    "كم غرامة لبس القبعة الحمراء؟"
)

# Test 2: General Traffic Question
run_gguf_test(
    "General Knowledge",
    None,
    "ما هي عقوبة قطع الإشارة؟"
)

⏳ Loading GGUF Model...


llama_context: n_ctx_per_seq (4096) < n_ctx_train (40960) -- the full capacity of the model will not be utilized


✅ Model Loaded!

📱 GGUF TEST: Context Obedience (Red Hat)
❓ Q: كم غرامة لبس القبعة الحمراء؟
🤖 A: <think> [Analyze User Intent] The user is asking about the fine for wearing a red cap while driving. [Check Related Content] The relevant content states: "يعاقب المخالف بغرامة قدرها 50,000 ريال وسحب السيارة." This directly answers the question. [Think About Answer] The answer is 50,000 ريال. I will provide this in Arabic as requested.
</think>
الغرامة لبس القبعة الحمراء هي 50,000 ريال.


📱 GGUF TEST: General Knowledge
❓ Q: ما هي عقوبة قطع الإشارة؟
🤖 A: <think>
Okay, the user is asking about the punishment for cutting the signal in Arabic. First, I need to recall the relevant laws. In Egypt, according to Article 318 of the Penal Code, cutting the signal is a criminal offense. The punishment depends on the severity of the act. If it's a minor offense, like a single incident, the penalty is up to 3 months. If it's a repeat offense or involves a public place, the punishment can be up to 6 month

In [11]:
def run_context_test(test_name, context_text, question):
    print(f"\n⚖️ TEST: {test_name}")
    print("="*60)

    # --- SCENARIO 1: NO CONTEXT (The "Bad Grammar" Trap) ---
    print(f"🔻 SCENARIO 1: Asking without Context")
    prompt_no_ctx = (
        f"<|im_start|>system\nYou are a helpful assistant. Answer in Arabic.<|im_end|>\n"
        f"<|im_start|>user\n{question}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )

    output_no_ctx = llm(
        prompt_no_ctx, max_tokens=256, stop=["<|im_end|>"], temperature=0.3, echo=False
    )
    print(f"🤖 Answer: {output_no_ctx['choices'][0]['text'].strip()}")
    print("-" * 30)

    # --- SCENARIO 2: WITH CONTEXT (The Correct Way) ---
    print(f"✅ SCENARIO 2: Asking WITH Context")
    # We provide a high-quality Arabic snippet. The model should mimic this style.
    prompt_ctx = (
        f"<|im_start|>system\nYou are a helpful assistant. Use the provided text to answer in Arabic.<|im_end|>\n"
        f"<|im_start|>user\nاستخدم النص التالي للإجابة:\n{context_text}\n\nالسؤال:\n{question}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )

    output_ctx = llm(
        prompt_ctx, max_tokens=256, stop=["<|im_end|>"], temperature=0.3, echo=False
    )
    print(f"🤖 Answer: {output_ctx['choices'][0]['text'].strip()}")
    print("="*60)

# --- EXECUTE COMPARISON ---

# 1. Grammar Check: Does context fix the "The fine wearing" phrasing?
# We give it a snippet with proper grammar ("يعاقب بغرامة...").
law_snippet = """
المادة الخامسة: يُعاقب كل من يرتدي قبعة حمراء أثناء القيادة بغرامة مالية قدرها 50,000 ريال، مع حجز المركبة لمدة أسبوع.
"""
run_context_test(
    "Grammar & Phrasing Fix",
    law_snippet,
    "كم غرامة لبس القبعة الحمراء؟"
)

# 2. Logic Check: Does context enable it to handle specific rules?
tinting_snippet = """
المادة 20: يسمح بتظليل الزجاج الجانبي الخلفي بنسبة لا تزيد عن 30% (0.1). يمنع تظليل الزجاج الأمامي أو الجانبي الأمامي تماماً.
"""
run_context_test(
    "Specific Rule Logic",
    tinting_snippet,
    "هل مسموح أظلل القزاز اللي قدام؟"
)


⚖️ TEST: Grammar & Phrasing Fix
🔻 SCENARIO 1: Asking without Context
🤖 Answer: <think>
Okay, the user is asking about the penalty for wearing the red cap. I need to recall the specific rules from the Quran. Let me think... In Surah Al-Baqarah, verse 219, there's a mention of a penalty for wearing the red cap. The verse states that if someone wears it, they will be punished with a penalty, and if they don't, they will be given a reward. The exact amount isn't specified numerically, but it's mentioned as a penalty. I should make sure to mention that the penalty is not a fixed number but a general term, and that it's part of the Islamic teachings on modesty and dress code. Also, note that this applies to those who wear it without permission, and that the reward is for those who don't. I need to present this information clearly and concisely in Arabic, using appropriate terminology.
</think>

الغرامة لبس القبعة الحمراء تُحدد في القرآن الكريم، وفقًا لـ سورة البقرة، آية 219، حيث ذُكِر أنَّ 

In [13]:
from unsloth import FastLanguageModel

# 1. Define the base model name (Must match what you trained on)
base_model_name = "unsloth/Qwen3-1.7B-unsloth-bnb-4bit" # ⚠️ Verify this matches your finetune_barq start!

# 2. Load the original model into a separate variable
print(f"⏳ Loading original base model: {base_model_name}...")
model_orig, tokenizer_orig = FastLanguageModel.from_pretrained(
    model_name = base_model_name,
    max_seq_length = 4096, # Match your training context
    dtype = None,
    load_in_4bit = True,   # Keep it small
)

# 3. Enable inference mode for the original model
FastLanguageModel.for_inference(model_orig)
print("✅ Original model loaded as 'model_orig'")

⏳ Loading original base model: unsloth/Qwen3-1.7B-unsloth-bnb-4bit...
==((====))==  Unsloth 2025.12.9: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
✅ Original model loaded as 'model_orig'


In [14]:
def run_comparison(test_name, context, question):
    print(f"\n🔬 TEST: {test_name}")
    print("="*60)

    # Construct Prompt
    if context:
        prompt = f"استخدم النص التالي للإجابة:\n{context}\n\nالسؤال:\n{question}"
    else:
        prompt = question

    messages = [
        {"role": "system", "content": "You are a helpful assistant attempting to answer the user's question with reasoning steps. Use arabic for answers."},
        {"role": "user", "content": prompt},
    ]

    # Prepare inputs (We can use the same tokenizer for both since they are the same family)
    inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

    # --- INTERNAL HELPER TO GENERATE ANSWER ---
    def get_answer(target_model, label):
        outputs = target_model.generate(
            input_ids=inputs,
            max_new_tokens=1024,
            use_cache=True,
            temperature=0.3
        )
        raw_response = tokenizer.batch_decode(outputs)[0]
        # Clean up the special tokens
        answer = raw_response.split("<|im_start|>assistant")[-1].replace("<|im_end|>", "").strip()
        print(f"\n🤖 [{label}]:\n{answer}")

    # --- EXECUTE ---
    print(f"❓ Q: {question}")

    # 1. Ask the Original Model
    get_answer(model_orig, "ORIGINAL BASE MODEL")

    # 2. Ask Your Fine-Tuned Model
    get_answer(model, "FINE-TUNED BARQ")
    print("-" * 60)

# --- RUN THE TESTS ---

# TEST 1: The "Red Hat" Test (Context Obedience)
fake_law = """
المادة 999: يمنع ارتداء القبعات الحمراء أثناء القيادة، ويعاقب المخالف بغرامة قدرها 50,000 ريال وسحب السيارة.
"""
run_comparison(
    "Context Obedience (The Red Hat)",
    fake_law,
    "كم غرامة لبس القبعة الحمراء وأنا أسوق؟"
)

# TEST 2: The "Double Fine" Test (Reasoning/Math)
math_context = """
مخالفة السرعة: 500 ريال.
مخالفة عدم حمل رخصة: 1000 ريال.
مخالفة التظليل: 300 ريال.
"""
run_comparison(
    "Reasoning (Math)",
    math_context,
    "صادني المرور مسرع وما معي رخصة. كم بياخذون مني المجموع؟"
)

# TEST 3: The "Green Tea" Test (General Knowledge Preservation)
run_comparison(
    "General Knowledge (Sanity Check)",
    None,
    "كيف أسوي شاي أخضر؟"
)


🔬 TEST: Context Obedience (The Red Hat)
❓ Q: كم غرامة لبس القبعة الحمراء وأنا أسوق؟

🤖 [ORIGINAL BASE MODEL]:
<think>
Okay, let's see. The user is asking about the fine for wearing a red cap while driving, and they're asking "كم غرامة لبس القبعة الحمراء وأنا أسوق؟" which translates to "How much is the fine for wearing a red cap while driving? I'm speeding?"

First, I need to refer to the provided article. The relevant part is Article 999, which states that wearing red caps during driving is prohibited and the violator will be fined 50,000 ريال and have their vehicle's driver's license suspended.

Wait, the question is about the fine for wearing a red cap while driving, but the user mentions "أنا أسوق" which is "I'm speeding." So the user is confused because the original article talks about wearing a red cap, not speeding. But the question is about the fine for wearing a red cap while driving, and the user is asking about the fine when they're speeding.

Hmm, maybe there's a translatio