# use those 2 cells if there is a runtime problem with the GPU

In [None]:
# First, clear the current model from memory
torch.cuda.empty_cache()

import gc
gc.collect()

# Check memory freed
!nvidia-smi

In [1]:
from numba import cuda
device = cuda.get_current_device()
device.reset()

# this is our start

In [2]:
# ---------------------------
# CLEAN PROJECT FOLDER SETUP
# ---------------------------
# Why this is here:
# - Every Colab session is clean.
# - We need a stable folder to store data, logs, HITL feedback.

from pathlib import Path

WORKDIR = Path("/content/personalized_chatbot")
WORKDIR.mkdir(exist_ok=True)

DATA_DIR = WORKDIR / "data"
DATA_DIR.mkdir(exist_ok=True)

HITL_FILE = WORKDIR / "feedback.jsonl"      # where Human-in-the-Loop corrections go
FINETUNE_PREP = WORKDIR / "finetune_prep"
FINETUNE_PREP.mkdir(exist_ok=True)

print("WORKDIR:", WORKDIR)
print("DATA_DIR:", DATA_DIR)
print("FINETUNE_PREP:", FINETUNE_PREP)


WORKDIR: /content/personalized_chatbot
DATA_DIR: /content/personalized_chatbot/data
FINETUNE_PREP: /content/personalized_chatbot/finetune_prep


In [3]:
# LLaMA inference (transformers + bitsandbytes)
# LoRA (peft)
# dataset management (datasets)
# orchestration (langchain)
!pip install -q transformers accelerate bitsandbytes peft datasets langchain sentencepiece

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [13]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch


# MODEL = 'michaelHenry1/Llama-3.2-3B-Instruct-bnb-4bit_finetuned' # 3B model
MODEL = 'pierreramez/llama3.1-finetuned-v2' # 8B model

tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=False)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
                                             MODEL,
                                             load_in_4bit=True, # to enable quantization
                                             device_map='auto', # to automatically map the layers to GPU
                                             torch_dtype=torch.float16,
                                             trust_remote_code=True #required for llama
                                            )

model.config.pad_token_id = tokenizer.pad_token_id

# text gen pipeline
pipe = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=0.2, # low temp to make it more deterministic
    top_p=0.9,
    repetition_penalty=1.2
)

print('Model loaded successfully!\n')

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model loaded successfully!



In [14]:
def generate_reply(user_input, history, max_turns=4):
    """
    history = list of dicts: [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}, ...]
    """
    # Truncate history to last max_turns exchanges
    truncated_history = history[-(max_turns * 2):]

    # Add new user message
    messages = truncated_history + [{"role": "user", "content": user_input}]

    # Apply the SAME chat template used during training
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")

    attention_mask = (inputs != tokenizer.pad_token_id).long()

    with torch.no_grad():
        output = model.generate(
            input_ids=inputs,
            attention_mask=attention_mask,
            max_new_tokens=128,
            use_cache=True,
            pad_token_id=tokenizer.pad_token_id
        )

    # Decode only new tokens
    response = tokenizer.decode(output[0][inputs.shape[1]:], skip_special_tokens=True)
    return response.strip()

In [15]:
import re, json, html

def clean_text(s):
  s = html.unescape(s)
  s= re.sub(r'\s+',' ',s).strip()
  return s

def chat_to_pairs(chat_log, max_user_context=4):
  '''
  chat_log is like:
  [
    {role: 'user', content: '...'},
    {role: 'assistant', content: '...'},
    ...
  ]

  We convert multi-turn chat into supervised training pairs.
  '''

  pairs = []
  for i in range(len(chat_log) - 1):
      if chat_log[i]["role"] == "user" and chat_log[i+1]["role"] == "assistant":
          # Build concise prompt
          ctx_start = max(0, i - max_user_context*2)
          ctx = chat_log[ctx_start:i+1]

          prompt = " ".join(f"{t['role']}: {clean_text(t['content'])}" for t in ctx)
          response = clean_text(chat_log[i+1]["content"])

          pairs.append({"prompt": prompt, "response": response})

  return pairs

example_chat = [
    {"role":"user","content":"Explain normalization."},
    {"role":"assistant","content":"Normalization rescales features to stable ranges."},
    {"role":"user","content":"Show formula."},
    {"role":"assistant","content":"z = (x - µ) / σ"}
]

pairs = chat_to_pairs(example_chat)
pairs

[{'prompt': 'user: Explain normalization.',
  'response': 'Normalization rescales features to stable ranges.'},
 {'prompt': 'user: Explain normalization. assistant: Normalization rescales features to stable ranges. user: Show formula.',
  'response': 'z = (x - µ) / σ'}]

In [16]:
out_path = DATA_DIR / "train_pairs.jsonl"

with open(out_path, "w", encoding="utf-8") as f:
    for p in pairs:
        f.write(json.dumps(p, ensure_ascii=False) + "\n")

print("Saved training pairs to:", out_path)

Saved training pairs to: /content/personalized_chatbot/data/train_pairs.jsonl


## Human in the loop (HITL) pipeline

In [17]:
import time

def save_interaction(user_input, model_reply, user_correction=None, reason=None):
    """
    Append a single interaction to feedback.jsonl
    The model learns from mistakes later.
    """
    rec = {
        "time": time.time(),
        "user_input": user_input,
        "model_reply": model_reply,
        "user_correction": user_correction,
        "accepted": user_correction is None,
        "reason": reason,
    }

    with open(HITL_FILE, "a", encoding="utf-8") as f:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

    return rec

print("HITL pipeline ready!")

HITL pipeline ready!


## convert HITL logs to fine-tuning dataset in JSONL

In [18]:
def convert_feedback_to_finetune():
    src = HITL_FILE
    out = FINETUNE_PREP / "hitl_for_finetune.jsonl"

    if not Path(src).exists():
        print("No feedback yet.")
        return None

    count = 0
    with open(src, "r", encoding="utf-8") as f, open(out, "w", encoding="utf-8") as out_f:
        for line in f:
            rec = json.loads(line)
            if rec["accepted"] is False and rec["user_correction"]:
                out_f.write(json.dumps({
                    "prompt": rec["user_input"],
                    "response": rec["user_correction"]
                }, ensure_ascii=False) + "\n")
                count += 1

    print(f"Converted {count} corrected samples → {out}")
    return out

convert_feedback_to_finetune()
# we ignore the accepted responses and only keep the corrections

Converted 1 corrected samples → /content/personalized_chatbot/finetune_prep/hitl_for_finetune.jsonl


PosixPath('/content/personalized_chatbot/finetune_prep/hitl_for_finetune.jsonl')

##How Fine-Tuning Would Be Done (LoRA Prep) <<DON'T RUN>>

In [19]:
# # WARNING: DO NOT RUN ON COLAB FREE.
# # This is for your Milestone documentation.

# from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
# from transformers import TrainingArguments, Trainer
# from datasets import load_dataset

# # Load dataset
# train_data_path = str(FINETUNE_PREP / "hitl_for_finetune.jsonl")
# dataset = load_dataset("json", data_files=train_data_path, split="train")

# def tokenize(entry):
#     # Format: "### Prompt" pattern helps the model learn dialog structure
#     inp = "### Prompt:\n" + entry["prompt"] + "\n\n### Response:\n"
#     txt = inp + entry["response"]

#     tok = tokenizer(txt, truncation=True, max_length=512)

#     # Label masking: prompt tokens = -100 (ignored)
#     labels = tok["input_ids"].copy()
#     prompt_len = len(tokenizer(inp)["input_ids"])
#     labels[:prompt_len] = [-100] * prompt_len

#     tok["labels"] = labels
#     return tok

# dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

# # LoRA config
# lora_cfg = LoraConfig(
#     r=8, lora_alpha=32,
#     target_modules=["q_proj","k_proj","v_proj","o_proj"],
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM"
# )

# lora_model = prepare_model_for_kbit_training(model)
# lora_model = get_peft_model(lora_model, lora_cfg)

# training_args = TrainingArguments(
#     output_dir=str(WORKDIR / "lora_ckpt"),
#     per_device_train_batch_size=1,
#     gradient_accumulation_steps=8,
#     num_train_epochs=1,
#     learning_rate=2e-4,
#     fp16=True,
#     logging_steps=10
# )

# trainer = Trainer(
#     model=lora_model,
#     args=training_args,
#     train_dataset=dataset
# )

# print("Training pipeline ready (but not running).")

In [20]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("Device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

CUDA available: True
Device: Tesla T4


## simple chat loop

In [21]:
history = []

while True:
    user_input = input("You: ").strip()
    if user_input.lower() in ["exit", "quit"]:
        break

    reply = generate_reply(user_input, history)
    print("Assistant:", reply)

    correction = input("Correction? (empty = accepted): ").strip()
    if correction:
        save_interaction(user_input, reply, correction, reason="manual feedback")
        history.append({"role": "user", "content": user_input})
        history.append({"role": "assistant", "content": correction})
    else:
        save_interaction(user_input, reply)
        history.append({"role": "user", "content": user_input})
        history.append({"role": "assistant", "content": reply})

You: hey mate
Assistant: Hello there. How are you doing today?
Correction? (empty = accepted): 
You: I am doing amazing how about you
Assistant: I'm doing well, thanks for asking. How's your day going so far?
Correction? (empty = accepted): 
You: exhausting a bit but it's fine
Assistant: Well, take a break and relax. You deserve it. What's the most exciting thing that's happened to you recently?
Correction? (empty = accepted): 
You: THAT'S WHAT I WILL DO... I MIGHT GO WATCH HARRY POTTER
Assistant: That sounds like a great plan! What's your favorite movie?
Correction? (empty = accepted): 
You: the sorcerer's stone
Assistant: That's a classic! What's your favorite book?
Correction? (empty = accepted): 
You: hmm good question.. i think the cursed child
Assistant: That's a great choice! What's your favorite TV show?
Correction? (empty = accepted): 
You: enough of that I want to talk about the pyramids
Assistant: That's a great topic! What do you think is the most fascinating thing about th

KeyboardInterrupt: Interrupted by user