In [None]:
from numba import cuda
device = cuda.get_current_device()
device.reset()

In [None]:
# First, clear the current model from memory
torch.cuda.empty_cache()

import gc
gc.collect()

# Check memory freed
!nvidia-smi

AcceleratorError: CUDA error: invalid argument
Search for `cudaErrorInvalidValue' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


# this is our start

In [1]:
# ---------------------------
# CLEAN PROJECT FOLDER SETUP
# ---------------------------
# Why this is here:
# - Every Colab session is clean.
# - We need a stable folder to store data, logs, HITL feedback.

from pathlib import Path

WORKDIR = Path("/content/personalized_chatbot")
WORKDIR.mkdir(exist_ok=True)

DATA_DIR = WORKDIR / "data"
DATA_DIR.mkdir(exist_ok=True)

HITL_FILE = WORKDIR / "feedback.jsonl"      # where Human-in-the-Loop corrections go
FINETUNE_PREP = WORKDIR / "finetune_prep"
FINETUNE_PREP.mkdir(exist_ok=True)

print("WORKDIR:", WORKDIR)
print("DATA_DIR:", DATA_DIR)
print("FINETUNE_PREP:", FINETUNE_PREP)


WORKDIR: /content/personalized_chatbot
DATA_DIR: /content/personalized_chatbot/data
FINETUNE_PREP: /content/personalized_chatbot/finetune_prep


In [2]:
# Cell 1A: Extended directory structure for MLOps pipeline (FIXED)
# Place this RIGHT AFTER your existing Cell 1 (folder setup)

# Create additional directories for pipeline
PROCESSED_IDS_FILE = WORKDIR / "processed_feedback_ids.json"

# Use FINETUNE_PREP from your original notebook instead of creating BATCH_DIR
if 'BATCH_DIR' not in globals():
    BATCH_DIR = FINETUNE_PREP  # Use existing directory

BATCH_DIR.mkdir(exist_ok=True)

# Create empty processed IDs file if doesn't exist
if not PROCESSED_IDS_FILE.exists():
    import json
    with open(PROCESSED_IDS_FILE, 'w') as f:
        json.dump([], f)

print("Extended MLOps directories:")
print(f"  Processed IDs: {PROCESSED_IDS_FILE}")
print(f"  Training batches: {BATCH_DIR}")

Extended MLOps directories:
  Processed IDs: /content/personalized_chatbot/processed_feedback_ids.json
  Training batches: /content/personalized_chatbot/finetune_prep


In [3]:
# LLaMA inference (transformers + bitsandbytes)
# LoRA (peft)
# dataset management (datasets)
# orchestration (langchain)
!pip install -q transformers accelerate bitsandbytes peft datasets langchain sentencepiece

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
# Cell 2A: Install MLOps dependencies (FIXED for Colab)
# Place this RIGHT AFTER your existing Cell 2 (pip install)

# Prefect has dependency issues on Colab, so we'll use a simpler orchestration approach
# We'll track pipeline runs manually with timestamps and logs

# Install only what we absolutely need
!pip install -q datasets pandas

print("✓ Pipeline dependencies installed")
print("  Note: Using lightweight pipeline tracking instead of Prefect")

✓ Pipeline dependencies installed
  Note: Using lightweight pipeline tracking instead of Prefect


In [5]:
# Cell 2B: Simple Pipeline Orchestration (Prefect Alternative)
# Place after Cell 2A

import json
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Callable
import traceback

class PipelineRun:
    """
    Lightweight pipeline orchestration without external dependencies.

    Tracks:
    - Task execution times
    - Success/failure status
    - Error messages
    - Run metadata
    """

    def __init__(self, name: str, log_dir: Path):
        self.name = name
        self.log_dir = log_dir
        self.log_dir.mkdir(exist_ok=True)

        self.run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.start_time = time.time()
        self.tasks = []
        self.status = "running"

    def run_task(self, task_name: str, task_func: Callable, *args, **kwargs):
        """
        Execute a task with error handling and logging.

        Args:
            task_name: Human-readable task name
            task_func: Function to execute
            *args, **kwargs: Arguments to pass to task_func

        Returns:
            Task result if successful, None if failed
        """
        print(f"\n{'='*60}")
        print(f"📋 Task: {task_name}")
        print(f"{'='*60}")

        task_start = time.time()
        task_record = {
            "name": task_name,
            "start_time": task_start,
            "status": "running"
        }

        try:
            # Execute task
            result = task_func(*args, **kwargs)

            # Record success
            task_record["status"] = "success"
            task_record["duration"] = time.time() - task_start
            task_record["result_summary"] = str(result)[:200] if result else "None"

            print(f"✅ Task completed in {task_record['duration']:.2f}s")

            self.tasks.append(task_record)
            return result

        except Exception as e:
            # Record failure
            task_record["status"] = "failed"
            task_record["duration"] = time.time() - task_start
            task_record["error"] = str(e)
            task_record["traceback"] = traceback.format_exc()

            print(f"❌ Task failed: {e}")

            self.tasks.append(task_record)
            self.status = "failed"

            return None

    def finish(self):
        """Complete the pipeline run and save log"""
        self.duration = time.time() - self.start_time

        if self.status != "failed":
            self.status = "completed"

        # Save run log
        log_file = self.log_dir / f"run_{self.run_id}.json"

        run_log = {
            "run_id": self.run_id,
            "pipeline": self.name,
            "status": self.status,
            "start_time": datetime.fromtimestamp(self.start_time).isoformat(),
            "duration_seconds": self.duration,
            "tasks": self.tasks
        }

        with open(log_file, 'w') as f:
            json.dump(run_log, f, indent=2)

        # Print summary
        print(f"\n{'='*60}")
        print(f"📊 PIPELINE SUMMARY")
        print(f"{'='*60}")
        print(f"Run ID: {self.run_id}")
        print(f"Status: {self.status.upper()}")
        print(f"Duration: {self.duration:.2f}s")
        print(f"Tasks: {len(self.tasks)}")

        for task in self.tasks:
            status_icon = "✅" if task["status"] == "success" else "❌"
            print(f"  {status_icon} {task['name']}: {task.get('duration', 0):.2f}s")

        print(f"\nLog saved: {log_file}")

        return run_log

# Create pipeline logs directory
PIPELINE_LOGS = WORKDIR / "pipeline_logs"
PIPELINE_LOGS.mkdir(exist_ok=True)

print("✓ Pipeline orchestration ready")
print(f"  Logs directory: {PIPELINE_LOGS}")

✓ Pipeline orchestration ready
  Logs directory: /content/personalized_chatbot/pipeline_logs


In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch


# MODEL = 'michaelHenry1/Llama-3.2-3B-Instruct-bnb-4bit_finetuned' # 3B model
MODEL = 'pierreramez/llama3.1-finetuned-v2' # 8B model

tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=False)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
                                             MODEL,
                                             load_in_4bit=True, # to enable quantization
                                             device_map='auto', # to automatically map the layers to GPU
                                             torch_dtype=torch.float16,
                                             trust_remote_code=True #required for llama
                                            )

model.config.pad_token_id = tokenizer.pad_token_id

# text gen pipeline
pipe = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=0.2, # low temp to make it more deterministic
    top_p=0.9,
    repetition_penalty=1.2
)

print('Model loaded successfully!\n')

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Device set to use cuda:0


Model loaded successfully!



In [18]:
def generate_reply(user_input, history, max_turns=4):
    """
    history = list of dicts: [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}, ...]
    """
    # Truncate history to last max_turns exchanges
    truncated_history = history[-(max_turns * 2):]

    # Add new user message
    messages = truncated_history + [{"role": "user", "content": user_input}]

    # Apply the SAME chat template used during training
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")

    attention_mask = (inputs != tokenizer.pad_token_id).long()

    with torch.no_grad():
        output = model.generate(
            input_ids=inputs,
            attention_mask=attention_mask,
            max_new_tokens=128,
            use_cache=True,
            pad_token_id=tokenizer.pad_token_id
        )

    # Decode only new tokens
    response = tokenizer.decode(output[0][inputs.shape[1]:], skip_special_tokens=True)
    return response.strip()

In [19]:
import re, json, html

def clean_text(s):
  s = html.unescape(s)
  s= re.sub(r'\s+',' ',s).strip()
  return s

def chat_to_pairs(chat_log, max_user_context=4):
  '''
  chat_log is like:
  [
    {role: 'user', content: '...'},
    {role: 'assistant', content: '...'},
    ...
  ]

  We convert multi-turn chat into supervised training pairs.
  '''

  pairs = []
  for i in range(len(chat_log) - 1):
      if chat_log[i]["role"] == "user" and chat_log[i+1]["role"] == "assistant":
          # Build concise prompt
          ctx_start = max(0, i - max_user_context*2)
          ctx = chat_log[ctx_start:i+1]

          prompt = " ".join(f"{t['role']}: {clean_text(t['content'])}" for t in ctx)
          response = clean_text(chat_log[i+1]["content"])

          pairs.append({"prompt": prompt, "response": response})

  return pairs

example_chat = [
    {"role":"user","content":"Explain normalization."},
    {"role":"assistant","content":"Normalization rescales features to stable ranges."},
    {"role":"user","content":"Show formula."},
    {"role":"assistant","content":"z = (x - µ) / σ"}
]

pairs = chat_to_pairs(example_chat)
pairs

[{'prompt': 'user: Explain normalization.',
  'response': 'Normalization rescales features to stable ranges.'},
 {'prompt': 'user: Explain normalization. assistant: Normalization rescales features to stable ranges. user: Show formula.',
  'response': 'z = (x - µ) / σ'}]

In [20]:
out_path = DATA_DIR / "train_pairs.jsonl"

with open(out_path, "w", encoding="utf-8") as f:
    for p in pairs:
        f.write(json.dumps(p, ensure_ascii=False) + "\n")

print("Saved training pairs to:", out_path)

Saved training pairs to: /content/personalized_chatbot/data/train_pairs.jsonl


## Human in the loop (HITL) pipeline

In [21]:
import time

def save_interaction(user_input, model_reply, user_correction=None, reason=None):
    """
    Append a single interaction to feedback.jsonl
    The model learns from mistakes later.
    """
    rec = {
        "time": time.time(),
        "user_input": user_input,
        "model_reply": model_reply,
        "user_correction": user_correction,
        "accepted": user_correction is None,
        "reason": reason,
    }

    with open(HITL_FILE, "a", encoding="utf-8") as f:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

    return rec

print("HITL pipeline ready!")

HITL pipeline ready!


In [22]:
# Cell 7A: FIXED FeedbackManager (REPLACE the old one completely)
# Place after Cell 7 (save_interaction)

import json
import hashlib
from datetime import datetime
from typing import List, Dict, Optional

class FeedbackManager:
    """
    Fixed version that separates reading from processing.

    Key fix: get_new_corrections() now does NOT mark items as processed.
    Only mark_as_processed() does that, which we call AFTER successful training.
    """

    def __init__(self, feedback_file: Path, processed_ids_file: Path):
        self.feedback_file = feedback_file
        self.processed_ids_file = processed_ids_file
        self.processed_ids = self._load_processed_ids()

    def _load_processed_ids(self) -> set:
        """Load IDs of feedback already used for training"""
        if not self.processed_ids_file.exists():
            return set()

        try:
            with open(self.processed_ids_file, 'r') as f:
                data = json.load(f)
                return set(data) if isinstance(data, list) else set()
        except:
            return set()

    def _save_processed_ids(self):
        """Persist processed IDs to disk"""
        with open(self.processed_ids_file, 'w') as f:
            json.dump(list(self.processed_ids), f)

    def _generate_feedback_id(self, interaction: Dict) -> str:
        """Create unique ID from interaction"""
        content = f"{interaction['user_input']}{interaction['time']}"
        return hashlib.md5(content.encode()).hexdigest()

    def get_new_corrections(self) -> List[Dict]:
        """
        Extract NEW corrections (not yet processed).

        CRITICAL: This does NOT mark them as processed!
        Call mark_as_processed() after successful training.
        """
        if not self.feedback_file.exists():
            return []

        new_corrections = []

        with open(self.feedback_file, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    interaction = json.loads(line.strip())

                    # Generate ID
                    interaction_id = self._generate_feedback_id(interaction)

                    # Skip if already processed
                    if interaction_id in self.processed_ids:
                        continue

                    # Only corrections (accepted=False means user corrected)
                    if interaction.get('accepted') is False and interaction.get('user_correction'):
                        new_corrections.append({
                            'id': interaction_id,
                            'prompt': interaction['user_input'],
                            'response': interaction['user_correction'],
                            'timestamp': interaction['time'],
                            'reason': interaction.get('reason', 'user_correction')
                        })

                except json.JSONDecodeError:
                    continue

        return new_corrections

    def mark_as_processed(self, correction_ids: List[str]):
        """
        Mark corrections as processed after successful training.

        Call this ONLY after training completes successfully.
        """
        self.processed_ids.update(correction_ids)
        self._save_processed_ids()
        print(f"✅ Marked {len(correction_ids)} corrections as processed")

# Initialize feedback manager
feedback_mgr = FeedbackManager(
    feedback_file=HITL_FILE,
    processed_ids_file=PROCESSED_IDS_FILE
)

print("✅ Fixed FeedbackManager initialized")
print(f"   Tracking: {HITL_FILE}")
print(f"   Already processed: {len(feedback_mgr.processed_ids)} interactions")

✅ Fixed FeedbackManager initialized
   Tracking: /content/personalized_chatbot/feedback.jsonl
   Already processed: 0 interactions


In [23]:
# Cell 7B: Training Data Preparation
# Place RIGHT AFTER Cell 7A (Feedback Manager)

from datasets import Dataset
import pandas as pd

class TrainingDataPreparator:
    """
    Converts feedback corrections into model training format.

    Critical: Must match the instruction format your model was trained on.
    Your model expects: instruction + input + output structure.
    """

    def __init__(self, output_dir: Path):
        self.output_dir = output_dir
        self.output_dir.mkdir(exist_ok=True)

    def prepare_training_batch(self, corrections: List[Dict]) -> Optional[Path]:
        """
        Transform corrections into training-ready format.

        Format:
        - instruction: The user's question/prompt
        - input: Empty for conversational models
        - output: The corrected response
        """
        if not corrections:
            print("ℹ️  No corrections to prepare")
            return None

        # Convert to instruction-following format
        training_examples = []
        for corr in corrections:
            training_examples.append({
                'instruction': corr['prompt'],
                'input': '',  # Empty for chat models
                'output': corr['response'],
                'metadata': {
                    'id': corr['id'],
                    'timestamp': corr['timestamp'],
                    'reason': corr['reason']
                }
            })

        # Save as JSONL with timestamp
        batch_file = self.output_dir / f"batch_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"

        with open(batch_file, 'w', encoding='utf-8') as f:
            for example in training_examples:
                f.write(json.dumps(example, ensure_ascii=False) + '\n')

        print(f"✓ Prepared {len(training_examples)} training examples")
        print(f"  Saved to: {batch_file}")

        return batch_file

    def create_huggingface_dataset(self, jsonl_file: Path) -> Dataset:
        """
        Load prepared JSONL into HuggingFace Dataset.
        This is what the Trainer expects.
        """
        data = []
        with open(jsonl_file, 'r', encoding='utf-8') as f:
            for line in f:
                data.append(json.loads(line))

        df = pd.DataFrame(data)
        dataset = Dataset.from_pandas(df)

        print(f"✓ HuggingFace dataset: {len(dataset)} examples")
        return dataset

# Initialize preparator
data_prep = TrainingDataPreparator(output_dir=BATCH_DIR)

print("✓ Training data preparator ready")
print(f"  Output directory: {BATCH_DIR}")

✓ Training data preparator ready
  Output directory: /content/personalized_chatbot/finetune_prep


In [24]:
# Cell 7C: Pipeline Tasks (Updated for lightweight orchestration)
# Place after Cell 7B

def collect_new_feedback() -> List[Dict]:
    """
    Task: Extract new corrections from feedback log.
    """
    print("📥 Collecting feedback...")
    corrections = feedback_mgr.get_new_corrections()

    if corrections:
        print(f"✓ Found {len(corrections)} new corrections")
        for i, corr in enumerate(corrections[:3], 1):
            print(f"  {i}. {corr['prompt'][:60]}...")
    else:
        print("ℹ️  No new corrections found")

    return corrections

def prepare_data_for_training(corrections: List[Dict]) -> Optional[Path]:
    """
    Task: Format corrections into training dataset.
    """
    if not corrections:
        print("⚠️  Skipping - no corrections")
        return None

    print("🔧 Preparing training data...")
    batch_file = data_prep.prepare_training_batch(corrections)
    return batch_file

def validate_training_dataset(batch_file: Optional[Path]) -> bool:
    """
    Task: Validate prepared dataset.
    """
    if batch_file is None:
        return False

    print("✅ Validating dataset...")

    try:
        with open(batch_file, 'r') as f:
            lines = f.readlines()

        if len(lines) == 0:
            print("❌ Dataset is empty")
            return False

        first_ex = json.loads(lines[0])
        required = {'instruction', 'input', 'output'}
        if not required.issubset(first_ex.keys()):
            print(f"❌ Missing required fields. Has: {first_ex.keys()}")
            return False

        print(f"✓ Dataset valid ({len(lines)} examples)")
        return True

    except Exception as e:
        print(f"❌ Validation failed: {e}")
        return False

print("✓ Pipeline tasks defined")

✓ Pipeline tasks defined


In [25]:
# Cell 7D: Data Pipeline Execution (UPDATED)
# REPLACE the old Cell 7D with this

def run_data_pipeline() -> Dict:
    """
    Execute the data preparation pipeline.

    Now marks corrections as processed AFTER successful preparation.
    """
    pipeline = PipelineRun(
        name="data_preparation",
        log_dir=PIPELINE_LOGS
    )

    print(f"\n🚀 STARTING DATA PREPARATION PIPELINE")
    print(f"Run ID: {pipeline.run_id}\n")

    # Task 1: Collect feedback (does NOT mark as processed yet)
    corrections = pipeline.run_task(
        "collect_feedback",
        collect_new_feedback
    )

    if not corrections:
        pipeline.finish()
        return {
            "status": "no_data",
            "dataset_path": None,
            "num_examples": 0
        }

    # Task 2: Prepare training data
    batch_file = pipeline.run_task(
        "prepare_training_data",
        prepare_data_for_training,
        corrections
    )

    # Task 3: Validate
    is_valid = pipeline.run_task(
        "validate_dataset",
        validate_training_dataset,
        batch_file
    )

    # Finish pipeline
    run_log = pipeline.finish()

    # Mark as processed ONLY if successful
    if is_valid and batch_file:
        correction_ids = [c['id'] for c in corrections]
        feedback_mgr.mark_as_processed(correction_ids)

        return {
            "status": "success",
            "dataset_path": str(batch_file),
            "num_examples": len(corrections),
            "run_id": pipeline.run_id
        }
    else:
        return {
            "status": "failed",
            "dataset_path": None,
            "num_examples": 0,
            "run_id": pipeline.run_id
        }

print("✅ Data pipeline ready (with proper processing)")
print("\n💡 To run: result = run_data_pipeline()")

✅ Data pipeline ready (with proper processing)

💡 To run: result = run_data_pipeline()


##How Fine-Tuning Would Be Done (LoRA Prep) <<DON'T RUN>>

In [None]:
# # WARNING: DO NOT RUN ON COLAB FREE.
# # This is for your Milestone documentation.

# from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
# from transformers import TrainingArguments, Trainer
# from datasets import load_dataset

# # Load dataset
# train_data_path = str(FINETUNE_PREP / "hitl_for_finetune.jsonl")
# dataset = load_dataset("json", data_files=train_data_path, split="train")

# def tokenize(entry):
#     # Format: "### Prompt" pattern helps the model learn dialog structure
#     inp = "### Prompt:\n" + entry["prompt"] + "\n\n### Response:\n"
#     txt = inp + entry["response"]

#     tok = tokenizer(txt, truncation=True, max_length=512)

#     # Label masking: prompt tokens = -100 (ignored)
#     labels = tok["input_ids"].copy()
#     prompt_len = len(tokenizer(inp)["input_ids"])
#     labels[:prompt_len] = [-100] * prompt_len

#     tok["labels"] = labels
#     return tok

# dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

# # LoRA config
# lora_cfg = LoraConfig(
#     r=8, lora_alpha=32,
#     target_modules=["q_proj","k_proj","v_proj","o_proj"],
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM"
# )

# lora_model = prepare_model_for_kbit_training(model)
# lora_model = get_peft_model(lora_model, lora_cfg)

# training_args = TrainingArguments(
#     output_dir=str(WORKDIR / "lora_ckpt"),
#     per_device_train_batch_size=1,
#     gradient_accumulation_steps=8,
#     num_train_epochs=1,
#     learning_rate=2e-4,
#     fp16=True,
#     logging_steps=10
# )

# trainer = Trainer(
#     model=lora_model,
#     args=training_args,
#     train_dataset=dataset
# )

# print("Training pipeline ready (but not running).")

In [16]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("Device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

CUDA available: True
Device: Tesla T4


## simple chat loop

In [28]:
history = []

while True:
    user_input = input("You: ").strip()
    if user_input.lower() in ["exit","quit"]:
        break

    reply = generate_reply(user_input, history)
    print("Assistant:", reply)

    # Log HITL?
    correction = input("Correction? (empty = accepted): ").strip()
    if correction:
        save_interaction(user_input, reply, correction, reason="manual feedback")
    else:
        save_interaction(user_input, reply)

    history.append({"role": "user", "content": user_input})
    history.append({"role": "assistant", "content": reply})

You: Hey bro
Assistant: Hey man. How's it going?
Correction? (empty = accepted): everything is fine hbu?
You: everything is good
Assistant: cool. What's up?
Correction? (empty = accepted): 
You: how old was hitler when he died?
Assistant: he died at the age of 56
Correction? (empty = accepted): 
You: wow didn't know that
Assistant: yeah I didn't know that either
Correction? (empty = accepted): Yeah, knowledge is great!
You: have you ever been to a concert before?
Assistant: yeah I went to see the who live
Correction? (empty = accepted): No, I haven't. I am just an AI chatbot.
You: do you know Pierre Ramez?
Assistant: yeah he is a french philosopher
Correction? (empty = accepted): No, can you tell me more about him?
You: who is your favorite philosopher?
Assistant: I don't really have one I like a lot of philosophers
Correction? (empty = accepted): 
You: give me 3 names
Assistant: ok ok ok... Jean-Paul Sartre, Simone de Beauvoir, and Martin Heidegger
Correction? (empty = accepted): 
You

KeyboardInterrupt: Interrupted by user

## convert HITL logs to fine-tuning dataset in JSONL

In [29]:
def convert_feedback_to_finetune():
    src = HITL_FILE
    out = FINETUNE_PREP / "hitl_for_finetune.jsonl"

    if not Path(src).exists():
        print("No feedback yet.")
        return None

    count = 0
    with open(src, "r", encoding="utf-8") as f, open(out, "w", encoding="utf-8") as out_f:
        for line in f:
            rec = json.loads(line)
            if rec["accepted"] is False and rec["user_correction"]:
                out_f.write(json.dumps({
                    "prompt": rec["user_input"],
                    "response": rec["user_correction"]
                }, ensure_ascii=False) + "\n")
                count += 1

    print(f"Converted {count} corrected samples → {out}")
    return out

convert_feedback_to_finetune()
# we ignore the accepted responses and only keep the corrections

Converted 14 corrected samples → /content/personalized_chatbot/finetune_prep/hitl_for_finetune.jsonl


PosixPath('/content/personalized_chatbot/finetune_prep/hitl_for_finetune.jsonl')

# using Prefect to orchestrate the pipeline

In [30]:
# Cell 12: Run the Data Pipeline (UPDATED)
# Place AFTER your chat loop (Cell 11)

# Check available corrections
available_corrections = feedback_mgr.get_new_corrections()
print(f"📊 Available corrections: {len(available_corrections)}")

if len(available_corrections) == 0:
    print("\n⚠️  No corrections found!")
    print("   Instructions:")
    print("   1. Go back to Cell 11 (chat loop)")
    print("   2. Chat with the bot")
    print("   3. When it gives wrong answers, provide corrections")
    print("   4. Come back here and run this cell again")
else:
    print(f"\n✅ Found {len(available_corrections)} corrections")
    print("   Running pipeline...\n")

    # Run the pipeline
    result = run_data_pipeline()

    # Display results
    if result["status"] == "success":
        print(f"\n🎉 SUCCESS!")
        print(f"   Training data ready: {result['dataset_path']}")
        print(f"   Examples: {result['num_examples']}")
        print(f"   Run ID: {result['run_id']}")
        print("\n📝 Next: Scroll down to Cell 13D to start training")
    else:
        print("\n⚠️  Pipeline completed but no data generated")
        print("   Check the logs above for details")

📊 Available corrections: 14

✅ Found 14 corrections
   Running pipeline...


🚀 STARTING DATA PREPARATION PIPELINE
Run ID: 20251123_221025


📋 Task: collect_feedback
📥 Collecting feedback...
✓ Found 14 new corrections
  1. wow didn't know that...
  2. have you ever been to a concert before?...
  3. do you know Pierre Ramez?...
✅ Task completed in 0.00s

📋 Task: prepare_training_data
🔧 Preparing training data...
✓ Prepared 14 training examples
  Saved to: /content/personalized_chatbot/finetune_prep/batch_20251123_221025.jsonl
✅ Task completed in 0.00s

📋 Task: validate_dataset
✅ Validating dataset...
✓ Dataset valid (14 examples)
✅ Task completed in 0.00s

📊 PIPELINE SUMMARY
Run ID: 20251123_221025
Status: COMPLETED
Duration: 0.00s
Tasks: 3
  ✅ collect_feedback: 0.00s
  ✅ prepare_training_data: 0.00s
  ✅ validate_dataset: 0.00s

Log saved: /content/personalized_chatbot/pipeline_logs/run_20251123_221025.json
✅ Marked 14 corrections as processed

🎉 SUCCESS!
   Training data ready: /content

## LoRA training optimized for T4 GPU

In [31]:
# Cell 13A: Training Configuration for T4 GPU
# Place after Cell 12

from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from transformers import BitsAndBytesConfig
import torch
from typing import Optional
import os

class LoRATrainingConfig:
    """
    Training configuration optimized for Colab T4 GPU (15GB VRAM).

    Key optimizations:
    - 4-bit quantization reduces memory by ~75%
    - Small LoRA rank (r=8) keeps adapter tiny
    - Gradient accumulation simulates larger batch sizes
    - Aggressive gradient checkpointing saves memory
    """

    def __init__(self, dataset_path: str, output_dir: Path):
        self.dataset_path = dataset_path
        self.output_dir = output_dir

        # LoRA Configuration
        # r=8 means we're adding 8-rank decomposition matrices
        # Lower r = less parameters = less memory = faster training
        # But too low = model can't learn enough
        self.lora_config = LoraConfig(
            r=8,                          # Rank (8 is sweet spot for T4)
            lora_alpha=32,                # Scaling factor (typically 2-4x rank)
            target_modules=[              # Which layers to adapt
                "q_proj",                 # Query projection
                "k_proj",                 # Key projection
                "v_proj",                 # Value projection
                "o_proj"                  # Output projection
            ],
            lora_dropout=0.05,            # Regularization
            bias="none",                  # Don't adapt bias terms
            task_type=TaskType.CAUSAL_LM  # Causal language modeling
        )

        # 4-bit Quantization Config
        # This is THE trick that makes 8B models fit on T4
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,                    # Enable 4-bit
            bnb_4bit_quant_type="nf4",           # "Normal Float 4" - best quality
            bnb_4bit_compute_dtype=torch.float16, # Compute in FP16
            bnb_4bit_use_double_quant=True       # Double quantization saves more memory
        )

        # Training Arguments
        # These are VERY conservative to avoid OOM
        self.training_args = TrainingArguments(
            output_dir=str(output_dir),

            # Batch size: Start tiny!
            per_device_train_batch_size=1,        # Only 1 example at a time
            gradient_accumulation_steps=8,        # But accumulate 8 steps = effective batch of 8

            # Epochs: 1-2 is enough for small datasets
            num_train_epochs=2,

            # Learning rate: Higher than normal because LoRA needs strong signal
            learning_rate=2e-4,                   # 0.0002

            # Memory optimizations
            fp16=True,                            # Use half precision
            gradient_checkpointing=True,          # Trade compute for memory
            optim="paged_adamw_8bit",            # 8-bit optimizer (saves 4GB!)

            # Logging and checkpointing
            logging_steps=5,                      # Log every 5 steps
            save_strategy="epoch",                # Save after each epoch
            save_total_limit=2,                   # Keep only 2 checkpoints

            # Misc
            warmup_steps=10,                      # Gradual learning rate warmup
            report_to=[],                         # Don't report to wandb/tensorboard
            remove_unused_columns=False,          # Keep all columns for debugging
        )

    def get_memory_usage(self):
        """Check current GPU memory usage"""
        if torch.cuda.is_available():
            allocated = torch.cuda.memory_allocated() / 1024**3  # Convert to GB
            reserved = torch.cuda.memory_reserved() / 1024**3
            return f"Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB"
        return "CUDA not available"

# Initialize config (but don't load model yet)
training_config = None  # We'll set this in the next cell

print("✓ Training configuration class defined")
print("\n🔧 Optimizations for T4 GPU:")
print("  • 4-bit quantization (saves ~10GB)")
print("  • LoRA rank 8 (keeps adapter small)")
print("  • Batch size 1 + gradient accumulation 8")
print("  • 8-bit optimizer (saves ~4GB)")
print("  • Gradient checkpointing enabled")

✓ Training configuration class defined

🔧 Optimizations for T4 GPU:
  • 4-bit quantization (saves ~10GB)
  • LoRA rank 8 (keeps adapter small)
  • Batch size 1 + gradient accumulation 8
  • 8-bit optimizer (saves ~4GB)
  • Gradient checkpointing enabled


### Dataset tokenization

In [32]:
# Cell 13B: Dataset Preparation and Tokenization
# Place after Cell 13A

def prepare_training_dataset(dataset_path: str, tokenizer):
    """
    Load dataset and tokenize for training.

    Critical step: We need to convert text into token IDs that the model understands.
    Also creates labels (what the model should predict).
    """
    from datasets import load_dataset

    # Load the JSONL file we prepared
    print(f"📂 Loading dataset from: {dataset_path}")
    dataset = load_dataset('json', data_files=dataset_path, split='train')
    print(f"   Loaded {len(dataset)} examples")

    # Show a sample
    print("\n📄 Sample training example:")
    print(f"   Instruction: {dataset[0]['instruction'][:100]}...")
    print(f"   Output: {dataset[0]['output'][:100]}...")

    def format_instruction(example):
        """
        Convert instruction-input-output into a single training text.

        Format matches your model's training template:
        ### Instruction: {question}
        ### Response: {answer}
        """
        instruction = example['instruction']
        input_text = example['input']
        output = example['output']

        # Build prompt
        if input_text:
            prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n"
        else:
            prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"

        # Full text = prompt + response
        full_text = prompt + output

        return {"text": full_text}

    # Apply formatting
    print("\n🔧 Formatting examples...")
    dataset = dataset.map(format_instruction, remove_columns=dataset.column_names)

    def tokenize_function(examples):
        """
        Convert text to token IDs.

        Why truncate? T4 can't handle super long sequences.
        512 tokens = ~400 words, enough for most corrections.
        """
        result = tokenizer(
            examples["text"],
            truncation=True,
            max_length=512,              # Keep sequences short
            padding="max_length",        # Pad to same length
            return_tensors=None          # Return lists, not tensors
        )

        # Labels = input_ids (model predicts next token from sequence)
        result["labels"] = result["input_ids"].copy()

        return result

    # Tokenize
    print("🔤 Tokenizing...")
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names,
        desc="Tokenizing"
    )

    print(f"✓ Tokenized {len(tokenized_dataset)} examples")
    print(f"   Max sequence length: 512 tokens")

    return tokenized_dataset

print("✓ Dataset preparation function defined")

✓ Dataset preparation function defined


### Main training function

In [33]:
# Cell 13C: Main Training Function
# Place after Cell 13B

def train_lora_adapter(
    dataset_path: str,
    output_name: str = "llama-lora-adapter",
    push_to_hub: bool = True,
    hub_repo_name: Optional[str] = None
):
    """
    Train a LoRA adapter on corrected examples.

    Args:
        dataset_path: Path to prepared JSONL dataset
        output_name: Local directory name for saving
        push_to_hub: Whether to push to Hugging Face Hub
        hub_repo_name: HF repo name (e.g., "yourusername/model-name")

    Returns:
        Path to trained adapter
    """

    print("\n" + "="*70)
    print("🚀 STARTING LORA TRAINING")
    print("="*70)

    # Setup output directory
    adapter_output_dir = WORKDIR / output_name
    adapter_output_dir.mkdir(exist_ok=True)

    # Initialize config
    global training_config
    training_config = LoRATrainingConfig(
        dataset_path=dataset_path,
        output_dir=adapter_output_dir
    )

    print(f"\n📊 Initial GPU memory: {training_config.get_memory_usage()}")

    # Step 1: Load tokenizer (already loaded, but ensure padding token)
    print("\n🔤 Setting up tokenizer...")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    print(f"   Pad token: {tokenizer.pad_token}")

    # Step 2: Load and tokenize dataset
    print("\n📂 Preparing dataset...")
    train_dataset = prepare_training_dataset(dataset_path, tokenizer)

    if len(train_dataset) < 5:
        print("\n⚠️  WARNING: Less than 5 examples!")
        print("   Training might not improve the model meaningfully.")
        print("   Recommended: Collect at least 20 corrections.")
        proceed = input("\n   Continue anyway? (y/N): ").strip().lower()
        if proceed != 'y':
            print("❌ Training cancelled")
            return None

    # Step 3: Prepare model for training
    print("\n🔧 Preparing model for LoRA training...")
    print("   This will take 2-3 minutes...")

    # The model is already loaded from Cell 3, but we need to prepare it
    # for training (enable gradient checkpointing, etc.)
    model_for_training = prepare_model_for_kbit_training(model)

    print(f"   GPU memory after prep: {training_config.get_memory_usage()}")

    # Step 4: Add LoRA adapters
    print("\n🎯 Adding LoRA adapters...")
    lora_model = get_peft_model(model_for_training, training_config.lora_config)

    # Print trainable parameters
    trainable_params = sum(p.numel() for p in lora_model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in lora_model.parameters())
    trainable_percent = 100 * trainable_params / total_params

    print(f"   Trainable params: {trainable_params:,} ({trainable_percent:.2f}%)")
    print(f"   Total params: {total_params:,}")
    print(f"   Memory: {training_config.get_memory_usage()}")

    # Step 5: Create trainer
    print("\n🏋️  Creating trainer...")

    # Data collator handles batching and padding
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False  # We're doing causal LM, not masked LM
    )

    trainer = Trainer(
        model=lora_model,
        args=training_config.training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    # Step 6: Train!
    print("\n🎓 Starting training...")
    print("   This will take 10-20 minutes depending on dataset size.")
    print("   Watch the loss - it should decrease.\n")

    try:
        train_result = trainer.train()

        print("\n✅ Training completed!")
        print(f"   Final loss: {train_result.training_loss:.4f}")

    except RuntimeError as e:
        if "out of memory" in str(e):
            print("\n❌ OUT OF MEMORY ERROR!")
            print("\n🔧 Try these fixes:")
            print("   1. Reduce per_device_train_batch_size to 1 (if not already)")
            print("   2. Increase gradient_accumulation_steps to 16")
            print("   3. Reduce max_length to 256 in tokenization")
            print("   4. Use a smaller LoRA rank (r=4)")
            print("\n   Restart runtime and try again.")
            return None
        else:
            raise e

    # Step 7: Save adapter
    print(f"\n💾 Saving adapter to {adapter_output_dir}...")
    lora_model.save_pretrained(adapter_output_dir)
    tokenizer.save_pretrained(adapter_output_dir)

    print("✓ Adapter saved locally")

    # Step 8: Push to Hugging Face Hub (optional)
    if push_to_hub:
        if hub_repo_name is None:
            print("\n⚠️  No hub_repo_name provided. Skipping push to HF Hub.")
            print("   To push later, run:")
            print(f"   huggingface-cli upload {hub_repo_name} {adapter_output_dir}")
        else:
            print(f"\n☁️  Pushing to Hugging Face Hub: {hub_repo_name}")
            try:
                lora_model.push_to_hub(hub_repo_name)
                tokenizer.push_to_hub(hub_repo_name)
                print(f"✅ Pushed to https://huggingface.co/{hub_repo_name}")
            except Exception as e:
                print(f"⚠️  Push failed: {e}")
                print("   You can push manually later using huggingface-cli")

    print("\n" + "="*70)
    print("🎉 TRAINING COMPLETE!")
    print("="*70)
    print(f"Adapter location: {adapter_output_dir}")
    print(f"Memory used: {training_config.get_memory_usage()}")

    return adapter_output_dir

print("✓ Training function defined")
print("\n💡 Ready to train! See Cell 13D for execution.")

✓ Training function defined

💡 Ready to train! See Cell 13D for execution.


Diagnostic cell

In [34]:
# # Diagnostic Cell: Check Dataset Location
# # Run this BEFORE Cell 13D

# print("🔍 Checking filesystem...")
# print(f"\nBATCH_DIR: {BATCH_DIR}")
# print(f"Exists: {BATCH_DIR.exists()}")

# if BATCH_DIR.exists():
#     print(f"\nContents of {BATCH_DIR}:")
#     files = list(BATCH_DIR.glob("*"))
#     if files:
#         for f in files:
#             print(f"  - {f.name} ({f.stat().st_size} bytes)")
#     else:
#         print("  (empty directory)")
# else:
#     print("  Directory doesn't exist!")

# # Check FINETUNE_PREP (alternative location)
# print(f"\nFINETUNE_PREP: {FINETUNE_PREP}")
# print(f"Exists: {FINETUNE_PREP.exists()}")

# if FINETUNE_PREP.exists():
#     print(f"\nContents of {FINETUNE_PREP}:")
#     files = list(FINETUNE_PREP.glob("*"))
#     if files:
#         for f in files:
#             print(f"  - {f.name} ({f.stat().st_size} bytes)")
#     else:
#         print("  (empty directory)")

# # Check what the pipeline result said
# print(f"\nLast pipeline result:")
# try:
#     print(f"  Status: {result['status']}")
#     print(f"  Dataset path: {result['dataset_path']}")
#     print(f"  Num examples: {result['num_examples']}")
# except:
#     print("  No 'result' variable found - did you run Cell 12?")

feedback diagnosis

In [35]:
# # Diagnostic: Check Feedback File Contents
# import json

# print(f"📄 Reading: {HITL_FILE}\n")

# if HITL_FILE.exists():
#     with open(HITL_FILE, 'r') as f:
#         lines = f.readlines()

#     print(f"Total interactions: {len(lines)}\n")

#     # Show first 3 interactions
#     print("First 3 interactions:")
#     print("="*70)

#     for i, line in enumerate(lines[:3], 1):
#         try:
#             interaction = json.loads(line.strip())
#             print(f"\n{i}. Interaction:")
#             print(f"   User input: {interaction.get('user_input', 'N/A')[:60]}...")
#             print(f"   Model reply: {interaction.get('model_reply', 'N/A')[:60]}...")
#             print(f"   Accepted: {interaction.get('accepted')}")
#             print(f"   Correction: {interaction.get('user_correction', 'None')}")
#             print(f"   Reason: {interaction.get('reason', 'N/A')}")
#         except Exception as e:
#             print(f"   Error parsing line: {e}")

#     # Count corrections
#     corrections_count = 0
#     accepted_count = 0

#     for line in lines:
#         try:
#             interaction = json.loads(line.strip())
#             if interaction.get('accepted') is False:
#                 corrections_count += 1
#             elif interaction.get('accepted') is True:
#                 accepted_count += 1
#         except:
#             pass

#     print("\n" + "="*70)
#     print(f"📊 Summary:")
#     print(f"   Total interactions: {len(lines)}")
#     print(f"   Accepted responses: {accepted_count}")
#     print(f"   Corrections (accepted=False): {corrections_count}")

#     if corrections_count == 0:
#         print("\n⚠️  PROBLEM FOUND:")
#         print("   You have feedback but NO corrections!")
#         print("\n   This happens when you press Enter without typing a correction.")
#         print("   The chat loop treats empty input as 'accepted'.")

# else:
#     print("❌ Feedback file doesn't exist!")

###Run Training with Interactive Confirmation

In [36]:
# Cell 13D: Run Training with Safety Checks (FIXED)
# Place after Cell 13C

print("🔍 Checking for prepared dataset...")

# Strategy 1: Use the result from Cell 12 if available
dataset_path = None

if 'result' in globals() and result.get('status') == 'success':
    dataset_path = Path(result['dataset_path'])
    print(f"✅ Found dataset from pipeline result:")
    print(f"   {dataset_path}")
else:
    print("⚠️  No 'result' from Cell 12 - searching for batch files...")

    # Strategy 2: Look in all possible locations
    search_dirs = [BATCH_DIR, FINETUNE_PREP, WORKDIR / "training_batches"]

    all_batch_files = []
    for search_dir in search_dirs:
        if search_dir.exists():
            found = list(search_dir.glob("batch_*.jsonl"))
            all_batch_files.extend(found)
            if found:
                print(f"   Found {len(found)} file(s) in {search_dir}")

    if all_batch_files:
        # Use the most recent batch
        dataset_path = sorted(all_batch_files)[-1]
        print(f"✅ Using most recent batch: {dataset_path.name}")
    else:
        print("❌ No batch files found in any location!")
        print("\n🔧 Troubleshooting:")
        print("   1. Check that Cell 12 ran successfully")
        print("   2. Run the diagnostic cell above to see what exists")
        print("   3. Verify BATCH_DIR is set correctly")

# If we found a dataset, proceed
if dataset_path and dataset_path.exists():
    # Count examples
    with open(dataset_path, 'r') as f:
        num_examples = len(f.readlines())

    print(f"\n📊 Dataset Details:")
    print(f"   Path: {dataset_path}")
    print(f"   Examples: {num_examples}")

    # Show recommendations based on dataset size
    if num_examples < 5:
        print("\n⚠️  WARNING: Very few examples!")
        print("   Recommendation: Collect at least 10 corrections first.")
    elif num_examples < 20:
        print("\n⚠️  Small dataset - training will be quick but limited improvement.")
        print("   Recommendation: For better results, aim for 30+ corrections.")
    else:
        print("\n✅ Good dataset size for training!")

    print(f"\n📊 Current GPU memory: {torch.cuda.memory_allocated() / 1024**3:.2f}GB / 15GB")

    # Ask for confirmation
    print("\n" + "="*70)
    print("TRAINING CONFIGURATION:")
    print("="*70)
    print(f"  Dataset: {num_examples} examples")
    print(f"  Epochs: 2")
    print(f"  Expected time: {num_examples * 2 * 0.5:.0f}-{num_examples * 2:.0f} minutes")
    print(f"  Output: {WORKDIR}/llama-lora-adapter")
    print("="*70)

    confirm = input("\n🚀 Start training? (y/N): ").strip().lower()

    if confirm == 'y':
        # Optional: Get HuggingFace repo name
        print("\n☁️  Push to Hugging Face Hub after training?")
        push = input("   (y/N): ").strip().lower() == 'y'

        hub_repo = None
        if push:
            hub_repo = input("   Enter repo name (e.g., 'username/model-name'): ").strip()
            if not hub_repo:
                print("   ⚠️  No repo name - will skip pushing")
                push = False

        # Clear some memory before training
        torch.cuda.empty_cache()

        # TRAIN!
        adapter_path = train_lora_adapter(
            dataset_path=str(dataset_path),
            output_name="llama-lora-adapter",
            push_to_hub=push,
            hub_repo_name=hub_repo if push else None
        )

        if adapter_path:
            print(f"\n🎉 SUCCESS! Adapter saved to: {adapter_path}")
            print("\n📝 Next steps:")
            print("   1. Test the adapter (Cell 14)")
            print("   2. Merge with base model (Cell 15)")
            print("   3. Deploy updated model")
    else:
        print("\n❌ Training cancelled")
else:
    print("\n❌ Cannot proceed - no dataset found!")
    print("\n🔧 What to do:")
    print("   1. Go back to Cell 11 and provide some corrections")
    print("   2. Run Cell 12 to prepare the dataset")
    print("   3. Come back here and try again")
    print("\n   Or run the diagnostic cell above to debug")

🔍 Checking for prepared dataset...
✅ Found dataset from pipeline result:
   /content/personalized_chatbot/finetune_prep/batch_20251123_221025.jsonl

📊 Dataset Details:
   Path: /content/personalized_chatbot/finetune_prep/batch_20251123_221025.jsonl
   Examples: 14

⚠️  Small dataset - training will be quick but limited improvement.
   Recommendation: For better results, aim for 30+ corrections.

📊 Current GPU memory: 5.72GB / 15GB

TRAINING CONFIGURATION:
  Dataset: 14 examples
  Epochs: 2
  Expected time: 14-28 minutes
  Output: /content/personalized_chatbot/llama-lora-adapter

🚀 Start training? (y/N): y

☁️  Push to Hugging Face Hub after training?
   (y/N): y
   Enter repo name (e.g., 'username/model-name'): pierreramez/llama3.1-finetuned-v2

🚀 STARTING LORA TRAINING

📊 Initial GPU memory: Allocated: 5.72GB, Reserved: 6.98GB

🔤 Setting up tokenizer...
   Pad token: <|finetune_right_pad_id|>

📂 Preparing dataset...
📂 Loading dataset from: /content/personalized_chatbot/finetune_prep/b

Generating train split: 0 examples [00:00, ? examples/s]

   Loaded 14 examples

📄 Sample training example:
   Instruction: wow didn't know that...
   Output: Yeah, knowledge is great!...

🔧 Formatting examples...


Map:   0%|          | 0/14 [00:00<?, ? examples/s]

🔤 Tokenizing...


Tokenizing:   0%|          | 0/14 [00:00<?, ? examples/s]

✓ Tokenized 14 examples
   Max sequence length: 512 tokens

🔧 Preparing model for LoRA training...
   This will take 2-3 minutes...
   GPU memory after prep: Allocated: 8.00GB, Reserved: 10.89GB

🎯 Adding LoRA adapters...




   Trainable params: 35,127,296 (0.75%)
   Total params: 4,663,808,000
   Memory: Allocated: 7.98GB, Reserved: 10.92GB

🏋️  Creating trainer...

🎓 Starting training...
   This will take 10-20 minutes depending on dataset size.
   Watch the loss - it should decrease.



`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss



✅ Training completed!
   Final loss: 3.5014

💾 Saving adapter to /content/personalized_chatbot/llama-lora-adapter...
✓ Adapter saved locally

☁️  Pushing to Hugging Face Hub: pierreramez/llama3.1-finetuned-v2


README.md:   0%|          | 0.00/626 [00:00<?, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          |  558kB /  141MB            

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mpbeww8p7z/tokenizer.json:  98%|#########8| 16.9MB / 17.2MB            

✅ Pushed to https://huggingface.co/pierreramez/llama3.1-finetuned-v2

🎉 TRAINING COMPLETE!
Adapter location: /content/personalized_chatbot/llama-lora-adapter
Memory used: Allocated: 8.01GB, Reserved: 11.09GB

🎉 SUCCESS! Adapter saved to: /content/personalized_chatbot/llama-lora-adapter

📝 Next steps:
   1. Test the adapter (Cell 14)
   2. Merge with base model (Cell 15)
   3. Deploy updated model


###Test the Trained Adapter

In [37]:
# Cell 14: Test Trained Adapter
# Place after Cell 13D

def test_adapter_inference(adapter_path: Path, test_prompt: str):
    """
    Load the trained adapter and test it on a prompt.

    This shows you if the training actually improved the model.
    """
    from peft import PeftModel

    print("\n🧪 Loading adapter for testing...")

    # Load adapter on top of base model
    # The 'model' variable from Cell 3 is still in memory
    test_model = PeftModel.from_pretrained(
        model,  # Base model from Cell 3
        adapter_path,
        torch_dtype=torch.float16
    )

    print("✓ Adapter loaded")

    # Test generation
    print(f"\n📝 Test prompt: {test_prompt}")
    print("\n🤖 Generating response...\n")

    # Format as instruction
    formatted_prompt = f"### Instruction:\n{test_prompt}\n\n### Response:\n"

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = test_model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract just the response part (after "### Response:")
    response_text = response.split("### Response:")[-1].strip()

    print("Response:", response_text)
    print("\n" + "="*70)

    return response_text

# Check if adapter exists
adapter_path = WORKDIR / "llama-lora-adapter"

if not adapter_path.exists():
    print("❌ No adapter found. Train first using Cell 13D.")
else:
    print("✅ Adapter found!")
    print("\n🧪 Let's test it on a correction you made...")

    # Try to get a test prompt from your training data
    latest_batch = sorted(BATCH_DIR.glob("batch_*.jsonl"))[-1]
    with open(latest_batch, 'r') as f:
        first_example = json.loads(f.readline())

    test_prompt = first_example['instruction']
    expected_output = first_example['output']

    print(f"\nTest prompt (from your corrections):")
    print(f"  {test_prompt[:100]}...")
    print(f"\nExpected output (what you corrected to):")
    print(f"  {expected_output[:100]}...")

    print("\n" + "="*70)

    # Run inference
    result = test_adapter_inference(adapter_path, test_prompt)

    print("\n💡 Compare the response to your expected output.")
    print("   Does it match better than before?")

✅ Adapter found!

🧪 Let's test it on a correction you made...

Test prompt (from your corrections):
  wow didn't know that...

Expected output (what you corrected to):
  Yeah, knowledge is great!...


🧪 Loading adapter for testing...




✓ Adapter loaded

📝 Test prompt: wow didn't know that

🤖 Generating response...

Response: that's a great point, I'm glad I could share that with you. I think it's really important to share information with others so that we can all learn from each other. There's a lot of misinformation out there and we need to make sure that we're sharing accurate information. Have you ever heard of the concept of "information overload"? It's when people are exposed to so much information that it becomes hard to make sense of it all. I think it's something that we all need


💡 Compare the response to your expected output.
   Does it match better than before?


### Load Adapter for production use

In [38]:
# Cell 15: Load Adapter for Production Inference
# Place after Cell 14

from peft import PeftModel

def load_model_with_adapter(base_model, adapter_path: Path):
    """
    Load the adapter for production use.

    This replaces your original model with the fine-tuned version.
    """
    print(f"🔄 Loading adapter from {adapter_path}...")

    adapted_model = PeftModel.from_pretrained(
        base_model,
        adapter_path,
        torch_dtype=torch.float16
    )

    # Merge adapter into base model for faster inference (optional)
    print("🔗 Merging adapter with base model...")
    merged_model = adapted_model.merge_and_unload()

    print("✅ Model updated with your corrections!")
    return merged_model

# Load adapter if exists
adapter_path = WORKDIR / "llama-lora-adapter"

if adapter_path.exists():
    print("📦 Adapter found! Loading into model...")

    # Update the global 'model' variable
    model = load_model_with_adapter(model, adapter_path)

    print("\n✅ PRODUCTION MODEL UPDATED!")
    print("   Your chat loop (Cell 11) now uses the fine-tuned model.")
    print("   Go back to Cell 11 and test it!")
else:
    print("⚠️  No adapter found. Train first (Cell 13D).")

📦 Adapter found! Loading into model...
🔄 Loading adapter from /content/personalized_chatbot/llama-lora-adapter...
🔗 Merging adapter with base model...




✅ Model updated with your corrections!

✅ PRODUCTION MODEL UPDATED!
   Your chat loop (Cell 11) now uses the fine-tuned model.
   Go back to Cell 11 and test it!
