In [None]:
! pip install -q transformers peft datasets bitsandbytes accelerate gradio

In [7]:
# Mount Google Drive inside the Colab VM.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: an access token was previously pasted here in clear text. A token that
# has been stored in a notebook must be treated as leaked: revoke it in the
# Hugging Face account settings and issue a new one.
# The token is read from the HF_TOKEN environment variable; when that is not set,
# it is requested through a prompt that does not echo the typed value.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: "))

In [3]:
import platform

import torch

# Report the runtime: torch build, first CUDA device (if any), Python version.
print("‚úÖ Torch:", torch.__version__)

if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
else:
    gpu_name = "No GPU found"
print("üíª GPU:", gpu_name)

print("üêç Python:", platform.python_version())


‚úÖ Torch: 2.8.0+cu126
üíª GPU: Tesla T4
üêç Python: 3.12.12


In [None]:
# ------------------------------------------------------------
# üß† QLoRA Fine-Tuning Script ‚Äì Llama-3-8B-Instruct
# ------------------------------------------------------------
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import torch

MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
OUTPUT_DIR = "./lora-weights"

# bfloat16 needs an Ampere-class GPU (compute capability >= 8.0). On older cards
# such as the T4 (7.5) fall back to float16, which is also the precision the
# training run in this cell uses (fp16=True).
_bf16_capable = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
COMPUTE_DTYPE = torch.bfloat16 if _bf16_capable else torch.float16

# 4-bit NF4 quantization with double quantization for the frozen base weights.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=COMPUTE_DTYPE
)

print("üîπ Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)

# Reuse EOS as the padding token when the tokenizer does not define one.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Standard PEFT preparation step for training on top of a k-bit quantized model;
# it must run before the adapter is attached.
model = prepare_model_for_kbit_training(model)

# LoRA setup: rank-16 adapters on the attention query/value projections only.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# Small public dataset (you can replace later): first 1% of the train split.
dataset = load_dataset("yahma/alpaca-cleaned", split="train[:1%]")

def format(example):
    """Render one instruction record as a single prompt/response training string.

    Args:
        example: mapping with "instruction" and "output" keys and, optionally,
            an "input" key holding extra context for the instruction.

    Returns:
        A dict with one key, "text". When the record carries a non-empty
        "input", it is placed on its own line after the instruction so the
        model sees the context the output refers to; otherwise the text is
        "<|user|>: {instruction}\\n<|assistant|>: {output}".

    Note: the name shadows the built-in ``format``; it is kept because the
    surrounding cell refers to this function by that name.
    """
    instruction = example["instruction"]
    context = (example.get("input") or "").strip()
    user_turn = f"{instruction}\n{context}" if context else instruction
    return {"text": f"<|user|>: {user_turn}\n<|assistant|>: {example['output']}"}
# Apply `format` to every record and rebind `dataset` to the mapped result.
dataset = dataset.map(format)

def tokenize_fn(batch):
    """Tokenize a batch of rendered examples and attach causal-LM labels.

    Uses the module-level ``tokenizer``. Every sequence is truncated or padded
    to exactly 512 tokens.

    Args:
        batch: mapping whose "text" entry is a list of strings.

    Returns:
        The tokenizer output with an added "labels" entry. Labels equal the
        input ids on real tokens and -100 on padding positions.
    """
    tokens = tokenizer(
        batch["text"],
        truncation=True,
        max_length=512,
        padding="max_length"
    )
    # Padding must not contribute to the loss: -100 is the index the
    # cross-entropy loss ignores. Copying input_ids verbatim would train the
    # model to predict the pad token over every padded position.
    tokens["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens

import inspect

# Tokenize the whole dataset in batches.
tokenized = dataset.map(tokenize_fn, batched=True)

# One epoch, 2 sequences per device step accumulated over 4 steps, fp16 mixed
# precision and a paged 8-bit AdamW optimizer; a checkpoint is written per epoch.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_ratio=0.03,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    optim="paged_adamw_8bit"
)

# Newer transformers releases renamed Trainer's `tokenizer` argument to
# `processing_class` and deprecated the old name. Pick whichever keyword the
# installed version declares so this cell works on both sides of the rename.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    **{_tokenizer_kwarg: tokenizer}
)

print("üöÄ Starting fine-tuning...")
trainer.train()

# Persist the adapter weights and the tokenizer side by side in OUTPUT_DIR.
print("üíæ Saving LoRA adapter...")
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("‚úÖ LoRA adapter saved to:", OUTPUT_DIR)


In [12]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
LORA_PATH = "./lora-weights"

# Same 4-bit NF4 / double-quantization settings the adapter was trained against.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Device placement options shared by the base-model load and the adapter load:
# automatic placement, with "./offload" as the offload directory.
placement = dict(device_map="auto", offload_folder="./offload")

print("üîπ Loading base model...")
base = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    **placement,
)

print("üîπ Loading LoRA adapter...")
model = PeftModel.from_pretrained(base, LORA_PATH, **placement)

# Fold the adapter into the base weights and drop the PEFT wrapper.
print("üîπ Merging LoRA weights...")
merged_model = model.merge_and_unload()

print("üíæ Saving merged model...")
merged_model.save_pretrained("./merged-llama3")
print("‚úÖ Merged model saved to ./merged-llama3")


üîπ Loading base model...


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

üîπ Loading LoRA adapter...
üîπ Merging LoRA weights...




üíæ Saving merged model...
‚úÖ Merged model saved to ./merged-llama3


In [3]:
from transformers import AutoTokenizer

# Copy tokenizer from base model to merged folder
base_model = "meta-llama/Meta-Llama-3-8B-Instruct"
merged_path = "merged-llama3"

tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.save_pretrained(merged_path)

print("‚úÖ Tokenizer files copied successfully into:", merged_path)
!ls {merged_path}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

‚úÖ Tokenizer files copied successfully into: merged-llama3
chat_template.jinja		  model.safetensors.index.json
config.json			  special_tokens_map.json
generation_config.json		  tokenizer_config.json
model-00001-of-00002.safetensors  tokenizer.json
model-00002-of-00002.safetensors


In [None]:
# ------------------------------------------------------------
# ü¶ô Llama-3-8B-Instruct (Merged + Polite & Descriptive Output)
# ------------------------------------------------------------
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import re

MODEL_NAME = "merged-llama3"

# System prompt tuned for polite, natural, short-descriptive answers.
SYSTEM_PROMPT = (
    "You are a friendly, polite, and knowledgeable AI assistant. "
    "Always start with a short greeting like 'Hello!' or 'Good day!'. "
    "Provide clear, concise, and slightly descriptive answers (2‚Äì4 lines). "
    "Avoid showing internal roles, system tags, or code. "
    "Focus only on the user's question, keep the response natural and human-like."
)
# Running token total; updated by chat_fn through a `global` statement.
history_token_total = 0

print("üîπ Loading tokenizer & model ...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# bfloat16 needs an Ampere-class GPU (compute capability >= 8.0). The Tesla T4
# this notebook targets is 7.5, so fall back to float16 there.
_bf16_capable = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
COMPUTE_DTYPE = torch.bfloat16 if _bf16_capable else torch.float16

# --- 4-bit quantization config (for Tesla T4) -------------------------
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=COMPUTE_DTYPE
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype=COMPUTE_DTYPE
)

print("‚úÖ Model loaded successfully with 4-bit quantization!")

# ------------------------------------------------------------
# üß† Clean & Chat Function
# ------------------------------------------------------------
def clean_output(text: str) -> str:
    """Strip chat-markup artifacts from decoded model text.

    Removes, in order:
      * ``<|...|>`` tags (plus a colon written directly after the tag),
      * ``###`` markers,
      * role headers: ``system`` / ``user`` / ``assistant`` standing at the
        start of a line or glued to the end of the preceding text, and
        followed by a blank line,
      * ``System:`` / ``User:`` / ``Assistant:`` labels at the start of a line.
    Runs of blank lines are then collapsed to one newline and the result is
    stripped.

    The role words are only removed in those header/label positions. Ordinary
    uses such as "operating system" or "the user" are left intact, which the
    previous blanket removal corrupted.

    Args:
        text: decoded model output.

    Returns:
        The cleaned text.
    """
    roles = r"(?:system|user|assistant)"
    # Tags such as <|eot_id|> or a literal "<|assistant|>:" prefix.
    text = re.sub(r"<\|.*?\|>:?", "", text)
    text = text.replace("###", "")
    # Header form: a role name immediately followed by a blank line. A role
    # word preceded by a space (normal prose) is deliberately not matched.
    text = re.sub(
        rf"(?:^|(?<=\S)){roles}(?=\n\n)", "", text,
        flags=re.IGNORECASE | re.MULTILINE,
    )
    # Label form: "Assistant: ..." at the beginning of a line.
    text = re.sub(
        rf"^[ \t]*{roles}[ \t]*:[ \t]*", "", text,
        flags=re.IGNORECASE | re.MULTILINE,
    )
    text = re.sub(r"\n{2,}", "\n", text)
    return text.strip()

def chat_fn(message, history):
    """Answer one chat message and report token usage.

    Relies on the module-level ``tokenizer``, ``model``, ``SYSTEM_PROMPT`` and
    ``clean_output``, and updates the module-level ``history_token_total``
    (a single counter for the whole process, not per conversation).

    Args:
        message: text submitted by the user.
        history: list of ``(user_text, assistant_text)`` pairs from earlier
            turns; ``None`` is treated as an empty history.

    Returns:
        A 6-tuple ``(history, "", input_tokens, output_tokens,
        history_token_total, total_tokens)``. ``history`` has the new turn
        appended, the empty string clears the input box, and the counts are
        the prompt tokens, generated tokens, running total and this turn's total.
        Blank messages return the history unchanged; "exit"/"quit" append a
        goodbye turn. Both report zero tokens for the turn.
    """
    global history_token_total

    if history is None:
        history = []

    user_text = (message or "").strip()
    if not user_text:
        # Nothing to answer: do not spend a generation on an empty prompt.
        return history, "", 0, 0, history_token_total, 0

    if user_text.lower() in {"exit", "quit"}:
        history.append((message, "üëã Goodbye! Have a great day!"))
        return history, "", 0, 0, history_token_total, 0

    # --- Build chat context -------------------
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for u, a in history:
        messages.append({"role": "user", "content": u})
        messages.append({"role": "assistant", "content": a})
    messages.append({"role": "user", "content": message})

    # --- Tokenize -----------------------------
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    input_tokens = inputs["input_ids"].shape[1]

    # Stop on the tokenizer's EOS and on Llama-3's end-of-turn token, when the
    # vocabulary has one that is distinct from EOS.
    stop_ids = [tokenizer.eos_token_id]
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    if (
        eot_id is not None
        and eot_id != getattr(tokenizer, "unk_token_id", None)
        and eot_id not in stop_ids
    ):
        stop_ids.append(eot_id)

    # --- Generate polite + descriptive answer ---
    outputs = model.generate(
        **inputs,
        max_new_tokens=180,
        temperature=0.5,       # slightly creative, still controlled
        top_p=0.85,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=stop_ids,
    )

    # --- Decode + clean ------------------------
    # The returned sequence starts with the prompt; keep only the newly
    # generated ids. Searching the decoded text for the user's message is
    # unreliable when the message repeats or cleaning changes it.
    generated = outputs[0][input_tokens:]
    reply = clean_output(tokenizer.decode(generated, skip_special_tokens=True))
    if "Assistant:" in reply:
        reply = reply.split("Assistant:")[-1].strip()

    # --- Token tracking ------------------------
    # Count what the model actually produced, not a re-tokenization of the
    # cleaned text.
    output_tokens = len(generated)
    total_tokens = input_tokens + output_tokens
    history_token_total += total_tokens
    history.append((message, reply))

    return history, "", input_tokens, output_tokens, history_token_total, total_tokens

# ------------------------------------------------------------
# üí¨ Gradio Interface
# ------------------------------------------------------------
def _token_counter(label):
    """Create a read-only numeric field, starting at 0, with the given label."""
    return gr.Number(label=label, value=0, interactive=False)


# Layout: a chat pane and the input box in the first row, four token counters
# in the second. Submitting the textbox calls chat_fn.
with gr.Blocks(title="ü¶ô Llama-3 QLoRA Chatbot") as demo:
    gr.Markdown("## ü¶ô Llama-3-8B QLoRA Chatbot (Polite & Descriptive Answers)")

    with gr.Row():
        chatbot = gr.Chatbot(height=500)
        txt = gr.Textbox(
            placeholder="Ask about AI, tech, or general knowledge...",
            label="üí¨ Type your question and press Enter‚Ä¶",
        )

    with gr.Row():
        in_tok = _token_counter("Input Tokens")
        out_tok = _token_counter("Output Tokens")
        hist_tok = _token_counter("History Tokens")
        total_tok = _token_counter("Total Tokens")

    # chat_fn returns its six values in exactly the order of `outputs`.
    txt.submit(
        fn=chat_fn,
        inputs=[txt, chatbot],
        outputs=[chatbot, txt, in_tok, out_tok, hist_tok, total_tok],
    )

print("üöÄ Launching Gradio App ...")
demo.launch(share=True, debug=True)


üîπ Loading tokenizer & model ...




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ Model loaded successfully with 4-bit quantization!


  chatbot = gr.Chatbot(height=500)


üöÄ Launching Gradio App ...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://9cb619bea95ad61de1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
