In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found there.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import threading, time

def keep_alive():
    """Print a heartbeat line every 10 minutes, forever.

    Meant to be run on a background thread; this function never returns.
    """
    heartbeat_seconds = 600  # 10 minutes between heartbeats
    while True:
        print("Still alive...", flush=True)
        time.sleep(heartbeat_seconds)


# Launch the heartbeat on a daemon thread so the cell returns immediately.
thread = threading.Thread(daemon=True, target=keep_alive)
thread.start()

In [None]:
# ============================================================================
# SECTION 1: Installation and Setup
# ============================================================================

import os
import subprocess
import sys

# NOTE(review): this cell does not install anything. Both pip invocations below are
# commented out, so the cell only prints the two status messages. The actual
# installation is done by the following %%capture cell.
print("Installing Unsloth and dependencies...")
# Disabled: install unsloth + unsloth_zoo through the kernel's own interpreter.
# subprocess.check_call([
#     sys.executable, "-m", "pip", "install", "unsloth", "unsloth_zoo", "-q"
# ])

# Disabled: install the training stack (xformers, trl, peft, accelerate, bitsandbytes).
# subprocess.check_call([
#     sys.executable, "-m", "pip", "install",
#     "xformers", "trl", "peft", "accelerate", "bitsandbytes", "-q"
# ])



print("Installation complete!\n")

In [None]:
%%capture
# Install Unsloth and the training stack. The %%capture cell magic above must stay on
# the first line of the cell; it hides the (long) pip output.
import os, re
# Colab is detected by looking for any environment variable whose name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    # Not Colab: a plain install of unsloth (note: version is not pinned here).
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # Take the leading numeric part of the installed torch version and use it to
    # choose which xformers release to install.
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    # --no-deps: install these packages without letting pip resolve their dependencies.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# Both environments: pin transformers and trl to fixed versions.
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

In [None]:
# ============================================================================
# SECTION 2: Import Libraries
# ============================================================================

from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
import torch
from typing import Dict, List
import json


In [None]:
# ============================================================================
# SECTION 3: Configuration
# ============================================================================

class Config:
    """Central settings for the fine-tuning run.

    All values are class attributes read by the later cells through the
    module-level ``config`` instance. The setup cell may flip ``USE_WANDB`` and
    ``PUSH_TO_HUB`` to False at runtime when the corresponding package is missing
    or initialisation fails.
    """

    # Model Configuration
    MODEL_NAME = "unsloth/Qwen3-4B-Instruct-2507"  # passed to FastLanguageModel.from_pretrained
    MAX_SEQ_LENGTH = 2048  # used for both model loading and the SFT trainer
    LOAD_IN_4BIT = True  # load the base model with 4-bit quantization

    # LoRA Configuration (memory efficient)
    LORA_R = 16  # LoRA rank
    LORA_ALPHA = 16
    LORA_DROPOUT = 0.05
    # Module names that receive LoRA adapters.
    TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"]

    # Training Configuration
    BATCH_SIZE = 2  # per-device batch size (train and eval)
    GRADIENT_ACCUMULATION_STEPS = 4
    LEARNING_RATE = 2e-4
    NUM_EPOCHS = 3
    WARMUP_STEPS = 50
    MAX_STEPS = -1  # passed as max_steps; -1 presumably means "no step cap, train by epochs" -- confirm against TrainingArguments docs
    SAVE_STEPS = 100  # also used as the evaluation interval (eval_steps)
    LOGGING_STEPS = 10

    # Dataset Configuration
    DATASET_NAME = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"
    DATASET_SPLIT = "train"

    # Output Configuration
    OUTPUT_DIR = "./qwen3_4B_customer_support_model"  # checkpoints, LoRA adapters and merged model
    GGUF_OUTPUT_DIR = "./qwen3_4B_customer_support_gguf_models"

    # Checkpoint Management (for storage efficiency)
    SAVE_TOTAL_LIMIT = 2  # Passed as save_total_limit: cap on the number of checkpoints kept on disk
    RESUME_FROM_CHECKPOINT = True  # Auto-resume from an existing checkpoint in OUTPUT_DIR, if any

    # HuggingFace Hub Configuration (for backup & resume)
    PUSH_TO_HUB = False  # Set to True to enable
    HF_REPO_NAME = "ragib01/Qwen3-4B-customer-support"  # Change this
    PUSH_TO_HUB_MODEL_ID = None  # NOTE(review): not read by any cell visible in this notebook; HF_REPO_NAME is used directly

    # Weights & Biases Configuration (for logging & backup)
    USE_WANDB = True  # Set to False to disable W&B logging
    WANDB_PROJECT = "qwen3-4B-customer-support"
    WANDB_RUN_NAME = None  # Passed to wandb.init(name=...); None leaves naming to wandb

config = Config()

In [None]:
# ============================================================================
# SECTION 4: Setup HuggingFace Hub & Weights & Biases
# ============================================================================

# Initialize Weights & Biases.
# Any failure here is non-fatal: W&B logging is simply switched off for the run.
if config.USE_WANDB:
    try:
        import wandb
        wandb.init(
            project=config.WANDB_PROJECT,
            name=config.WANDB_RUN_NAME,
            config={
                "model": config.MODEL_NAME,
                "lora_r": config.LORA_R,
                "batch_size": config.BATCH_SIZE,
                "learning_rate": config.LEARNING_RATE,
                "epochs": config.NUM_EPOCHS,
            }
        )
        print("✓ Weights & Biases initialized")
        print(f"  Project: {config.WANDB_PROJECT}")
        print(f"  Run: {wandb.run.name}\n")
    except ImportError:
        print("⚠ wandb not installed. Install with: pip install wandb")
        config.USE_WANDB = False
    except Exception as e:
        print(f"⚠ Failed to initialize wandb: {e}")
        config.USE_WANDB = False

# Setup HuggingFace Hub (only when pushing checkpoints is enabled).
if config.PUSH_TO_HUB:
    try:
        from huggingface_hub import HfApi, login, create_repo
        print("Setting up HuggingFace Hub...")
        print("Please login to HuggingFace (if not already logged in)")
        # Will use token from HF_TOKEN env var or prompt for login
        try:
            api = HfApi()
            whoami = api.whoami()
            print(f"✓ Logged in as: {whoami['name']}")
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit still stop the cell.
            print("Please login:")
            login()

        # Create repo if it doesn't exist
        try:
            create_repo(config.HF_REPO_NAME, exist_ok=True, private=True)
            print(f"✓ Repository ready: {config.HF_REPO_NAME}\n")
        except Exception as e:
            # Best effort: report the problem but let the notebook continue.
            print(f"⚠ Repository setup: {e}\n")
    except ImportError:
        print("⚠ huggingface_hub not installed. Install with: pip install huggingface_hub")
        config.PUSH_TO_HUB = False

In [None]:
# Imported unconditionally: a later cell also calls glob.glob(), and importing it
# only inside the `if` below left the name undefined on a fresh run.
import glob
import re


def _checkpoint_step(path):
    """Return the step number encoded in a ``checkpoint-<step>`` path, or -1 if there is none."""
    match = re.search(r"checkpoint-(\d+)$", os.path.basename(os.path.normpath(path)))
    return int(match.group(1)) if match else -1


# Check for existing checkpoints to resume from
resume_from_checkpoint = None
if config.RESUME_FROM_CHECKPOINT and os.path.exists(config.OUTPUT_DIR):
    # Only directories can be checkpoints; ignore stray files matching the pattern.
    checkpoints = [
        path
        for path in glob.glob(os.path.join(config.OUTPUT_DIR, "checkpoint-*"))
        if os.path.isdir(path)
    ]
    if checkpoints:
        # Prefer the highest step number; modification time only breaks ties
        # (mtimes are unreliable after a directory has been copied or restored).
        latest_checkpoint = max(
            checkpoints,
            key=lambda path: (_checkpoint_step(path), os.path.getmtime(path)),
        )
        resume_from_checkpoint = latest_checkpoint
        print("="*70)
        print("RESUMING FROM CHECKPOINT")
        print("="*70)
        print(f"Found existing checkpoint: {resume_from_checkpoint}")
        print("Training will continue from where it left off.\n")

In [None]:
# ============================================================================
# SECTION 5: Load Model with Memory Efficiency
# ============================================================================

print("Loading model with 4-bit quantization for memory efficiency...")

# Collect the loader options first so the call itself stays short.
model_load_options = {
    "model_name": config.MODEL_NAME,
    "max_seq_length": config.MAX_SEQ_LENGTH,
    "dtype": None,
    "load_in_4bit": config.LOAD_IN_4BIT,
    "trust_remote_code": True,
}
model, tokenizer = FastLanguageModel.from_pretrained(**model_load_options)

# Echo the settings that were used.
for summary_line in (
    f"Model loaded: {config.MODEL_NAME}",
    f"Max sequence length: {config.MAX_SEQ_LENGTH}",
    f"4-bit quantization: {config.LOAD_IN_4BIT}\n",
):
    print(summary_line)

In [None]:
# ============================================================================
# SECTION 5b: Configure LoRA for Memory-Efficient Training
# ============================================================================

print("Configuring LoRA adapters...")

# Adapter hyper-parameters come from Config; the remaining options are fixed here.
lora_settings = {
    "r": config.LORA_R,
    "target_modules": config.TARGET_MODULES,
    "lora_alpha": config.LORA_ALPHA,
    "lora_dropout": config.LORA_DROPOUT,
    "bias": "none",
    "use_gradient_checkpointing": "unsloth",
    "random_state": 3407,
    "use_rslora": False,
    "loftq_config": None,
}
# Wrap the loaded base model with LoRA adapters (rebinds `model`).
model = FastLanguageModel.get_peft_model(model, **lora_settings)

print("LoRA configuration complete!")
for summary_line in (
    f"LoRA rank (r): {config.LORA_R}",
    f"LoRA alpha: {config.LORA_ALPHA}",
    f"Target modules: {config.TARGET_MODULES}\n",
):
    print(summary_line)

In [None]:
# ============================================================================
# SECTION 6: Load and Prepare Dataset
# ============================================================================

print(f"Loading dataset: {config.DATASET_NAME}...")

# Fetch the configured split of the configured dataset.
dataset = load_dataset(config.DATASET_NAME, split=config.DATASET_SPLIT)

# Quick look at what was loaded: size, columns and the first record.
sample_count = len(dataset)
print(f"Dataset loaded: {sample_count} samples")
print(f"Dataset columns: {dataset.column_names}")
print("\nSample data:")
first_record = dataset[0]
print(first_record)
print("\n")

In [None]:
# ============================================================================
# SECTION 7: Enhanced Entity Extraction & Tool-Calling Format
# ============================================================================
import re
import random
def generate_realistic_value(placeholder):
    """Generate a plausible random value for an entity placeholder name.

    The placeholder is matched case-insensitively against a table of known
    entity kinds (order number, date, email, ...) using substring matching.

    Args:
        placeholder: Placeholder name without the surrounding braces,
            e.g. ``"Order Number"``.

    Returns:
        A random string in the format of the matched entity kind, or
        ``"VALUE<4 digits>"`` when no kind matches.
    """
    placeholder_lower = placeholder.lower()

    mappings = {
        'order number': lambda: f"#{random.randint(10000000, 99999999)}",
        'invoice number': lambda: f"INV-{random.randint(1000, 9999)}",
        'tracking number': lambda: f"TRK{random.randint(100000000, 999999999)}",
        'account number': lambda: f"ACC{random.randint(100000, 999999)}",
        'refund amount': lambda: f"${random.randint(10, 500)}.{random.randint(0, 99):02d}",
        'money amount': lambda: f"${random.randint(10, 500)}.{random.randint(0, 99):02d}",
        'date': lambda: f"{random.choice(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'])} {random.randint(1, 28)}, 2024",
        'date range': lambda: f"{random.choice(['Jan', 'Feb', 'Mar'])} {random.randint(1, 15)} - {random.randint(16, 28)}, 2024",
        'email': lambda: f"customer{random.randint(100, 999)}@email.com",
        'phone': lambda: f"+1-{random.randint(200, 999)}-{random.randint(100, 999)}-{random.randint(1000, 9999)}",
        'client first name': lambda: random.choice(['John', 'Sarah', 'Michael', 'Emma', 'David', 'Lisa']),
        'client last name': lambda: random.choice(['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia']),
        'delivery city': lambda: random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']),
        'delivery country': lambda: random.choice(['USA', 'Canada', 'UK', 'Australia']),
        'account type': lambda: random.choice(['Premium', 'Standard', 'Pro', 'Basic']),
        'account category': lambda: random.choice(['Gold', 'Silver', 'Bronze', 'Platinum']),
    }

    # Try the longest (most specific) keys first. 'date' is a substring of
    # 'date range', so checking in insertion order turned every "Date Range"
    # placeholder into a single date. sorted() is stable, so keys of equal
    # length keep their original relative order.
    for key in sorted(mappings, key=len, reverse=True):
        if key in placeholder_lower:
            return mappings[key]()

    # Unknown entity kind: fall back to a generic token.
    return f"VALUE{random.randint(1000, 9999)}"

def extract_entities_from_text(text):
    """Swap every ``{{entity}}`` placeholder in *text* for a generated value.

    Each distinct placeholder name gets exactly one value, generated the first
    time the name is seen, so repeated placeholders are replaced consistently.

    Returns:
        A ``(replaced_text, entity_map)`` tuple, where ``entity_map`` maps each
        placeholder name (without braces) to the value substituted for it.
    """
    entity_map = {}
    for name in re.findall(r'\{\{([^}]+)\}\}', text):
        if name in entity_map:
            # Already assigned a value on an earlier occurrence.
            continue
        entity_map[name] = generate_realistic_value(name)

    replaced_text = text
    for name, value in entity_map.items():
        token = "{{" + name + "}}"
        replaced_text = replaced_text.replace(token, value)

    return replaced_text, entity_map

def get_tool_for_intent(intent, category):
    """Look up the tool definition associated with a dataset intent.

    Args:
        intent: Intent label of the example (e.g. ``"track_order"``).
        category: Category label of the example. Currently unused; kept so
            the signature stays stable for callers.

    Returns:
        A dict with ``name``, ``description`` and ``parameters`` keys, or
        ``None`` when the intent has no associated tool.
    """
    tool_mapping = {
        'track_order': {
            'name': 'track_order',
            'description': 'Track the status of an order',
            'parameters': ['order_number']
        },
        'cancel_order': {
            'name': 'cancel_order',
            'description': 'Cancel an existing order',
            'parameters': ['order_number']
        },
        'get_invoice': {
            'name': 'get_invoice',
            'description': 'Retrieve invoice details',
            'parameters': ['invoice_number', 'order_number']
        },
        'check_invoice': {
            'name': 'check_invoice',
            'description': 'Check invoice information',
            'parameters': ['invoice_number']
        },
        'track_refund': {
            'name': 'track_refund',
            'description': 'Track refund status',
            'parameters': ['order_number', 'refund_id']
        },
        'get_refund': {
            'name': 'get_refund',
            'description': 'Process a refund request',
            'parameters': ['order_number']
        },
        'check_payment_methods': {
            'name': 'get_payment_methods',
            'description': 'Get available payment methods',
            'parameters': []
        },
        'delivery_options': {
            'name': 'get_delivery_options',
            'description': 'Get available delivery options',
            'parameters': ['location']
        },
        'check_refund_policy': {
            'name': 'get_refund_policy',
            'description': 'Get refund policy details',
            'parameters': []
        },
    }

    return tool_mapping.get(intent, None)


def _normalize_entity_key(name):
    """Lower-case *name* and collapse underscores, hyphens and whitespace runs to single spaces."""
    return " ".join(name.lower().replace("_", " ").replace("-", " ").split())


def _collect_tool_arguments(parameter_names, entity_map):
    """Map each tool parameter to the value of the first entity whose name contains it."""
    normalized_entities = [
        (_normalize_entity_key(entity_name), entity_value)
        for entity_name, entity_value in entity_map.items()
    ]
    arguments = {}
    for param in parameter_names:
        wanted = _normalize_entity_key(param)
        for normalized_name, entity_value in normalized_entities:
            if wanted in normalized_name:
                arguments[param] = entity_value
                break
    return arguments


def create_tool_calling_format(instruction, response, intent, category, entity_map):
    """Render one training example as ChatML text, with a tool call when applicable.

    Args:
        instruction: User message (placeholders already replaced).
        response: Assistant answer (placeholders already replaced).
        intent: Intent label used to pick the tool, if any.
        category: Category label; forwarded to ``get_tool_for_intent``.
        entity_map: Mapping of placeholder name (e.g. ``"Order Number"``) to
            the concrete value that was substituted into the instruction.

    Returns:
        The full conversation as a single string. For intents with a tool the
        assistant turn contains a ``<tool_call>`` JSON block; otherwise it is
        a plain system/user/assistant exchange.
    """
    tool_info = get_tool_for_intent(intent, category)

    if tool_info is None:
        # No tool needed, standard response
        return f"""<|im_start|>system
You are a helpful customer support assistant. When responding, use the exact values provided by the user (like order numbers, dates, etc.) in your replies.<|im_end|>
<|im_start|>user
{instruction}<|im_end|>
<|im_start|>assistant
{response}<|im_end|>"""

    # Tool parameters are snake_case ("order_number") while entity names are
    # space-separated ("Order Number"). Both sides are normalised before
    # matching; comparing them raw never matched, leaving every tool call
    # with empty arguments.
    tool_call = {
        "name": tool_info['name'],
        "arguments": _collect_tool_arguments(tool_info['parameters'], entity_map),
    }

    # Format with tool calling
    formatted_text = f"""<|im_start|>system
You are a helpful customer support assistant with access to tools. When users ask about orders, invoices, refunds, or account information, use the appropriate tool to retrieve accurate data. Always extract exact values (like order numbers) from the user's message.<|im_end|>
<|im_start|>user
{instruction}<|im_end|>
<|im_start|>assistant
I'll help you with that. Let me check the information for you.

<tool_call>
{json.dumps(tool_call, indent=2)}
</tool_call>

Based on the information retrieved: {response}<|im_end|>"""

    return formatted_text

def format_dataset_entry(example: Dict) -> Dict:
    """Turn one raw dataset row into a single ChatML training string.

    Steps:
    1. Replace ``{{placeholders}}`` in the instruction with generated values.
    2. Reuse those same values in the response, then fill any placeholders
       that appear only in the response.
    3. Render the pair via ``create_tool_calling_format`` (tool call included
       when the intent has an associated tool).

    Args:
        example: Row with ``instruction``, ``response``, ``intent`` and
            ``category`` keys. Missing or ``None`` values are treated as "".

    Returns:
        ``{"text": formatted_text}``.
    """
    # `or ""` also covers keys that are present but hold None; a plain
    # .get(key, "") would pass None through and crash in re.findall.
    instruction = example.get("instruction") or ""
    response = example.get("response") or ""
    intent = example.get("intent") or ""
    category = example.get("category") or ""

    # Replace placeholders with realistic values
    instruction_replaced, entity_map = extract_entities_from_text(instruction)

    # Replace placeholders in response using same entity values, so the answer
    # refers to the same order number / name / date as the question.
    response_replaced = response
    for placeholder, value in entity_map.items():
        response_replaced = response_replaced.replace(f"{{{{{placeholder}}}}}", value)

    # Also replace any remaining placeholders (those only present in the response).
    # Their values are deliberately not added to entity_map: the user never
    # supplied them, so they must not become tool-call arguments.
    response_replaced, _ = extract_entities_from_text(response_replaced)

    # Create format with tool calling awareness
    formatted_text = create_tool_calling_format(
        instruction_replaced,
        response_replaced,
        intent,
        category,
        entity_map
    )

    return {"text": formatted_text}

print("Applying enhanced formatting with entity extraction and tool-calling...")
print("This teaches the model to:")
print("  1. Extract actual values from user input (not placeholders)")
print("  2. Use tool-calling for data retrieval")
print("  3. Preserve exact values in responses\n")

# Seed the placeholder generator so the synthetic order numbers, names, dates,
# etc. are identical on every run. The split and the trainer are already
# seeded; without this the training text itself differed between runs.
random.seed(3407)

# Apply formatting to dataset (every original column is replaced by "text").
formatted_dataset = dataset.map(
    format_dataset_entry,
    remove_columns=dataset.column_names,
    desc="Formatting dataset with entity extraction and tool-calling"
)

print(f"✓ Dataset formatted: {len(formatted_dataset)} samples\n")

# Show examples of different formats
print("="*70)
print("SAMPLE TRAINING EXAMPLES")
print("="*70)
print("\nExample 1 (with tool calling):")
print(formatted_dataset[0]["text"][:800])  # first 800 characters only
print("\n" + "="*70 + "\n")

# Split dataset for training and validation (90% / 10%, fixed seed).
dataset_split = formatted_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}\n")

In [None]:
# ============================================================================
# SECTION 8: Configure Training Arguments
# ============================================================================

print("Configuring training parameters...")

# Detect number of GPUs available
num_gpus = torch.cuda.device_count()
print(f"Number of GPUs detected: {num_gpus}")

# Total effective batch size = per_device_batch_size * num_gpus * gradient_accumulation_steps.
# max(num_gpus, 1) keeps the figure meaningful when no GPU is detected.
effective_batch_size = (
    config.BATCH_SIZE * max(num_gpus, 1) * config.GRADIENT_ACCUMULATION_STEPS
)
if num_gpus > 1:
    print(f"Multi-GPU training enabled with {num_gpus} GPUs")
    print(f"Effective batch size: {config.BATCH_SIZE} * {num_gpus} * {config.GRADIENT_ACCUMULATION_STEPS} = {effective_batch_size}")

# Query bf16 support once; exactly one of fp16 / bf16 is enabled below.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir=config.OUTPUT_DIR,
    per_device_train_batch_size=config.BATCH_SIZE,
    per_device_eval_batch_size=config.BATCH_SIZE,
    gradient_accumulation_steps=config.GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=config.WARMUP_STEPS,
    max_steps=config.MAX_STEPS,
    num_train_epochs=config.NUM_EPOCHS,
    learning_rate=config.LEARNING_RATE,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=config.LOGGING_STEPS,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=3407,
    # Save and evaluate on the same step interval so that every saved
    # checkpoint has an eval_loss for best-model selection.
    save_strategy="steps",
    save_steps=config.SAVE_STEPS,
    eval_strategy="steps",
    eval_steps=config.SAVE_STEPS,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Checkpoint management (save space): cap on checkpoints kept on disk
    save_total_limit=config.SAVE_TOTAL_LIMIT,
    # Logging configuration
    report_to="wandb" if config.USE_WANDB else "none",
    run_name="qwen_customer_support_finetuning",
    # HuggingFace Hub configuration
    push_to_hub=config.PUSH_TO_HUB,
    hub_model_id=config.HF_REPO_NAME if config.PUSH_TO_HUB else None,
    hub_strategy="checkpoint",  # Push every checkpoint
    hub_private_repo=True,
)

print("Training configuration:")
print(f"  Batch size: {config.BATCH_SIZE}")
print(f"  Gradient accumulation: {config.GRADIENT_ACCUMULATION_STEPS}")
# Uses the GPU-aware figure so this line agrees with the multi-GPU message above.
print(f"  Effective batch size: {effective_batch_size}")
print(f"  Learning rate: {config.LEARNING_RATE}")
print(f"  Epochs: {config.NUM_EPOCHS}")
print(f"  Optimizer: adamw_8bit (memory efficient)")
print(f"  Mixed precision: {'bf16' if use_bf16 else 'fp16'}\n")

In [None]:
# ============================================================================
# SECTION 9: Initialize Trainer
# ============================================================================

print("Initializing SFT Trainer...")

# Build the collator up front, then hand everything to the trainer.
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

trainer = SFTTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=seq2seq_collator,
    # Data: pre-formatted ChatML strings stored in the "text" column.
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    dataset_num_proc=2,
    # Sequence handling
    max_seq_length=config.MAX_SEQ_LENGTH,
    packing=False,
)

print("Trainer initialized successfully!\n")

In [None]:
# ============================================================================
# SECTION 10: Training
# ============================================================================

# Imported here because this cell calls glob.glob() below; the only other import
# of glob sits inside a conditional branch of the checkpoint-discovery cell and
# is skipped on a fresh run, which made this cell fail with NameError after
# training had already finished.
import glob

print("=" * 70)
print("STARTING TRAINING")
print("=" * 70)
print()

# Turn off the generation cache on the model config for training.
model.config.use_cache = False

# Train the model (with resume capability). resume_from_checkpoint is None
# when no earlier checkpoint was found, which starts training from scratch.
trainer_stats = trainer.train(resume_from_checkpoint=resume_from_checkpoint)

print("\n" + "=" * 70)
print("TRAINING COMPLETE")
print("=" * 70)
print(f"\nTraining time: {trainer_stats.metrics['train_runtime']:.2f} seconds")
print(f"Training samples per second: {trainer_stats.metrics['train_samples_per_second']:.2f}")
print(f"Final training loss: {trainer_stats.metrics['train_loss']:.4f}")

# Show checkpoint management info
if config.SAVE_TOTAL_LIMIT:
    checkpoints = glob.glob(os.path.join(config.OUTPUT_DIR, "checkpoint-*"))
    print(f"\nCheckpoint Management:")
    print(f"  Total checkpoints saved: {len(checkpoints)}")
    print(f"  Limit: {config.SAVE_TOTAL_LIMIT} (older checkpoints auto-deleted)")
    if checkpoints:
        print(f"  Latest: {max(checkpoints, key=os.path.getmtime)}")

# Finish Weights & Biases
if config.USE_WANDB:
    try:
        import wandb
        wandb.finish()
        print("\n✓ Weights & Biases session finished")
    except Exception as e:
        # Still best effort, but report the problem instead of hiding it, and
        # let KeyboardInterrupt / SystemExit propagate.
        print(f"\n⚠ Could not finish Weights & Biases session: {e}")

In [None]:
# ============================================================================
# SECTION 11: Save Model
# ============================================================================

section_rule = "=" * 70
print("\n" + section_rule)
print("SAVING MODEL")
print(section_rule)

# Destination folders, both under the training output directory.
lora_adapter_dir = f"{config.OUTPUT_DIR}/lora_adapters"
merged_model_dir = f"{config.OUTPUT_DIR}/merged_16bit"

# Step 1: LoRA adapters plus the tokenizer, side by side in one folder.
print("\n1. Saving LoRA adapters...")
for artifact in (model, tokenizer):
    artifact.save_pretrained(lora_adapter_dir)
print(f"   Saved to: {lora_adapter_dir}")

# Step 2: merged model saved with the "merged_16bit" method.
print("\n2. Saving merged 16-bit model...")
model.save_pretrained_merged(merged_model_dir, tokenizer, save_method="merged_16bit")
print(f"   Saved to: {merged_model_dir}")

In [None]:
# ============================================================================
# SECTION 12: GGUF Export
# ============================================================================

section_rule = "=" * 70
print("\n" + section_rule)
print("EXPORTING TO GGUF FORMAT")
print(section_rule)

os.makedirs(config.GGUF_OUTPUT_DIR, exist_ok=True)

# GGUF quantization variants to produce, smallest first.
quantization_methods = [
    "q4_k_m",  # 4-bit, medium quality (recommended for most use cases)
    "q5_k_m",  # 5-bit, medium quality (better quality)
    "q8_0",    # 8-bit (highest quality)
]

print("\nExporting to multiple GGUF quantization formats...\n")

for quant_method in quantization_methods:
    quant_label = quant_method.upper()
    export_path = f"{config.GGUF_OUTPUT_DIR}/qwen_customer_support_{quant_method}"
    print(f"Exporting to {quant_label}...")
    try:
        model.save_pretrained_gguf(
            export_path,
            tokenizer,
            quantization_method=quant_method,
        )
    except Exception as e:
        # One failed variant must not stop the remaining exports.
        print(f"  ✗ Failed to export {quant_label}: {e!s}")
    else:
        print(f"  ✓ Successfully exported to {quant_label}")
        print(f"    Location: {export_path}")
    print()

In [None]:
from huggingface_hub import HfApi, login
import os

# --------------------------------------------------------------
# Upload the exported GGUF files to their own Hub repository.
# Authentication is assumed to be in place already; if it is not, call
# login() first (interactively, or with a token read from a secret store --
# do not paste a token into the notebook).
# --------------------------------------------------------------
#login(token="YOUR_HF_TOKEN")  # or run login() interactively

# Where the files live locally and where they go on the Hub.
gguf_folder = "/kaggle/working/qwen3_4B_customer_support_gguf_models"
repo_id = "ragib01/Qwen3-4B-customer-support-gguf"  # <- new repo

api = HfApi()

# Create the (public) repo; exist_ok makes re-running this cell harmless.
api.create_repo(repo_id=repo_id, private=False, exist_ok=True)

# Push the whole local folder in a single commit.
api.upload_folder(
    repo_id=repo_id,
    folder_path=gguf_folder,
    commit_message="Upload Qwen3-4B fine-tuned GGUF quantized model files",
)

print("\n".join([
    "\n✅ GGUF model uploaded successfully!",
    f"🔗 View it at: https://huggingface.co/{repo_id}",
    "\nUsers can now download and run with llama.cpp, LM Studio, or Ollama.",
]))


In [None]:
# ============================================================================
# SECTION 13: Inference Test
# ============================================================================

print("=" * 70)
print("TESTING FINE-TUNED MODEL")
print("=" * 70)

# Enable inference mode
FastLanguageModel.for_inference(model)

test_prompts = [
    "How do I track my order?",
    "I want to change my shipping address",
    "What is your return policy?",
    "How can I contact customer support?",
]

print("\nRunning inference tests...\n")

for i, prompt in enumerate(test_prompts, 1):
    print(f"Test {i}/{len(test_prompts)}")
    print(f"Prompt: {prompt}")

    # Format with chat template
    messages = [
        {"role": "system", "content": "You are a helpful customer support assistant."},
        {"role": "user", "content": prompt}
    ]

    input_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Put the inputs on whatever device the model lives on.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        use_cache=True,
    )

    # generate() returns the prompt tokens followed by the new tokens, so
    # slice the prompt off before decoding. The previous approach decoded
    # everything with skip_special_tokens=True and then searched for
    # "<|im_start|>assistant" -- a marker that decoding had already removed --
    # so the printed "response" always included the system and user text.
    generated_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    print(f"Response: {response}")
    print("-" * 70)
    print()

In [None]:
from huggingface_hub import HfApi, login
import os

# Log in (if not already done).
# The token is read from the HF_TOKEN environment variable instead of being
# written into the notebook; without it, fall back to the interactive prompt.
# (The previous login(token="") passed an empty token.)
print("\n[4/5] Logging in to HuggingFace...")
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    login()
print("✓ Logged in")

# Define model folder and repo
output_path = "/kaggle/working/qwen3_4B_customer_support_model/merged_16bit"
repo_id = "ragib01/Qwen3-4B-customer-support"

api = HfApi()

print("\n[5/5] Uploading to HuggingFace Hub...")

# Make sure the target repo exists. It is only created earlier in the notebook
# when PUSH_TO_HUB is enabled. exist_ok leaves an existing repo untouched;
# a new repo is created private.
api.create_repo(repo_id=repo_id, private=True, exist_ok=True)

# (Optional) clear old files. Best effort: a failure here is reported and skipped.
try:
    api.delete_folder(repo_id=repo_id, path_in_repo="")
    print("   Cleared existing files in repo.")
except Exception as e:
    print("   Skipped clearing:", e)

# Upload merged model
api.upload_folder(
    folder_path=output_path,
    repo_id=repo_id,
    commit_message="Upload merged 16-bit model (standard transformers format, no Unsloth needed)"
)

print("\n" + "="*70)
print("✅ UPLOAD COMPLETE!")
print("="*70)
print(f"\nYour model is now available at: https://huggingface.co/{repo_id}")
print("\nLoad it with:")
# f-string so the snippet shows the real repo id rather than a literal "{repo_id}".
print(f'''
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("{repo_id}")
model = AutoModelForCausalLM.from_pretrained("{repo_id}", torch_dtype="auto", device_map="auto")

model.eval()
print(model.generate(**tokenizer("Hello!", return_tensors="pt")))
''')
print("="*70)
