In [None]:
from __future__ import annotations

import os
from pathlib import Path

# --- Input files ---
# Both files are looked up relative to the current working directory
# (on Colab that is usually /content after an upload/copy).
DATA_JSON = Path('donnees_synthetiques.json')
PDF_PATH = Path('COLREG-Consolidated-2018.pdf')

# Output directory; created right away so later cells can write into it.
OUT_DIR = Path('finetune_out')
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Where the training context comes from ---
# True  -> rebuild each example's context with the chunking + retrieval code below
#          (recommended for RAG).
# False -> use the `context` field shipped with each dataset row as-is.
USE_RETRIEVED_CONTEXT = True

# --- Chunking / retrieval settings ---
# Keep these aligned with the defaults of the evaluation notebook.
CHUNK_TOKENS = 1024
CHUNK_OVERLAP_TOKENS = 256
RETRIEVE_K = 6
BM25_WEIGHT = 0.55
SEM_WEIGHT = 0.45
EMBEDDER_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

# --- Hugging Face base model ---
# Access to the Llama 3.1 weights (and an HF token) is required.
# Both values can be overridden through environment variables.
BASE_MODEL_ID = os.getenv('BASE_MODEL_ID', 'meta-llama/Meta-Llama-3.1-8B-Instruct')
HF_TOKEN = os.getenv('HF_TOKEN')  # None when the variable is not set

# --- SFT / LoRA hyperparameters (deliberately small to start with) ---
MAX_TRAIN_SAMPLES = None  # e.g. 500 for a quick dry-run; None keeps every sample
TRAIN_SPLIT = 0.95
SEED = 42

MAX_SEQ_LEN = 2048
BATCH_SIZE = 1
GRAD_ACCUM = 8
LR = 2e-4
EPOCHS = 1

# Echo the effective configuration. Input paths are shown resolved only when
# the file actually exists, so a missing file is easy to spot.
for _label, _value in (
    ('CWD', Path('.').resolve()),
    ('DATA_JSON', DATA_JSON.resolve() if DATA_JSON.exists() else DATA_JSON),
    ('PDF_PATH', PDF_PATH.resolve() if PDF_PATH.exists() else PDF_PATH),
    ('OUT_DIR', OUT_DIR.resolve()),
    ('USE_RETRIEVED_CONTEXT', USE_RETRIEVED_CONTEXT),
    ('BASE_MODEL_ID', BASE_MODEL_ID),
):
    print(f'{_label}:', _value)

In [None]:
import json
import random

# Load the synthetic Q/A file. Two layouts are accepted:
#   {"dataset": [ {...}, ... ]}   -> rows live under the 'dataset' key
#   [ {...}, ... ]                -> rows are the root list
with open(DATA_JSON, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Only a dict root has .get(); a root-level list is used directly.
rows = data.get('dataset', data) if isinstance(data, dict) else data
if not isinstance(rows, list):
    raise ValueError('Expected a list at key dataset or at root')


def _field_text(row: dict, key: str) -> str:
    """Return ``row[key]`` stripped, or '' when it is missing or not a string."""
    value = row.get(key)
    return value.strip() if isinstance(value, str) else ''


# Basic validation: keep only dict rows that have a non-empty question AND answer.
# The context may be empty (it is rebuilt later when USE_RETRIEVED_CONTEXT is True).
clean = []
for r in rows:
    if not isinstance(r, dict):
        continue
    q = _field_text(r, 'question')
    a = _field_text(r, 'answer')
    c = _field_text(r, 'context')
    if q and a:
        clean.append({'question': q, 'answer': a, 'context': c})

# Optional sub-sampling for quick dry-runs; seeded so the subset is reproducible.
if MAX_TRAIN_SAMPLES:
    random.seed(SEED)
    random.shuffle(clean)
    clean = clean[: int(MAX_TRAIN_SAMPLES)]

print('Loaded samples:', len(clean))
print('Example keys:', list(clean[0].keys()) if clean else None)
print('Example question:', clean[0]['question'] if clean else None)

## (Optional) Build a retrieval index from the COLREG PDF
This block recreates a lightweight version of the retrieval stack used in your evaluation notebook: PDF → chunking → BM25 (+ optional embeddings).

Colab assumption: `COLREG-Consolidated-2018.pdf` is in the current working directory.

If `USE_RETRIEVED_CONTEXT = False`, you can skip this section.

In [None]:
import io
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import numpy as np

# PDF extraction
try:
    from pypdf import PdfReader  # type: ignore
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'pypdf'])
    from pypdf import PdfReader  # type: ignore

# BM25
try:
    from rank_bm25 import BM25Okapi  # type: ignore
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'rank-bm25'])
    from rank_bm25 import BM25Okapi  # type: ignore

try:
    from sentence_transformers import SentenceTransformer  # type: ignore
except Exception:
    SentenceTransformer = None  # type: ignore

_WORD_RE = re.compile(r'\b\w+\b', re.UNICODE)
_STOPWORDS = {
    # EN
    'the','a','an','and','or','to','of','in','on','for','with','is','are','was','were','be','as','at','by','it','this','that','from','into','over','under','not',
    # FR
    'le','la','les','un','une','des','et','ou','de','du','dans','sur','pour','avec','est','sont','été','être','ce','cet','cette','ces','pas','plus',
}

def _tokenize(text: str) -> List[str]:
    words = [w.lower() for w in _WORD_RE.findall(text or '')]
    return [w for w in words if len(w) >= 2 and w not in _STOPWORDS]

def _normalize_ws(text: str) -> str:
    return re.sub(r'\s+', ' ', (text or '')).strip()

def _pdf_bytes_to_text(data: bytes) -> str:
    """Extract the text of every page of an in-memory PDF.

    Page texts are joined with newlines and the result is stripped. Extraction
    is best effort: a page whose extraction raises, or returns nothing,
    contributes an empty string instead of aborting the whole document.
    """
    def _page_text(page) -> str:
        try:
            return page.extract_text() or ''
        except Exception:
            # Deliberate: one unreadable page must not lose the rest.
            return ''

    document = PdfReader(io.BytesIO(data))
    return '\n'.join(_page_text(page) for page in document.pages).strip()

def load_pdf_from_path(path: str) -> str:
    """Read the PDF file at ``path`` and return its extracted text."""
    with open(path, mode='rb') as handle:
        raw_bytes = handle.read()
    return _pdf_bytes_to_text(raw_bytes)

# Section-heading detector (case-insensitive, multiline). A heading is a whole
# line that starts with "Rule <digits>", "Annex <one or more of i/v/x>" or
# "Appendix"; group 1 captures the entire line, which is used as the section title.
_COLREG_HEAD_RE = re.compile(r'(?im)^(rule\s+\d+\b.*|annex\s+[ivx]+\b.*|appendix\b.*)$')

def _slice_tokens(text: str, *, max_tokens: int, overlap_tokens: int) -> List[str]:
    text = _normalize_ws(text)
    if not text:
        return []
    toks = text.split()
    out: List[str] = []
    start = 0
    n = len(toks)
    max_tokens = max(1, int(max_tokens))
    overlap_tokens = max(0, int(overlap_tokens))
    while start < n:
        end = min(n, start + max_tokens)
        out.append(' '.join(toks[start:end]))
        if end >= n:
            break
        start = max(0, end - overlap_tokens)
    return out

def chunk_colreg(text: str, *, chunk_tokens: int, overlap_tokens: int) -> List[Tuple[str, str]]:
    """Split COLREG text into ``(section title, chunk text)`` pairs.

    Lines matching ``_COLREG_HEAD_RE`` open a new section that runs up to the
    next heading (or the end of the text). Each section is whitespace-normalised
    and cut into token windows with ``_slice_tokens``.

    When no heading is found, the whole text is windowed under the title
    ``'Document'``.

    NOTE: when at least one heading exists, lines located before the first
    heading belong to no section and are therefore not emitted.
    """
    source = (text or '').replace('\r\n', '\n')
    stripped_lines = [line.strip() for line in source.split('\n')]

    # (line index, heading text) for every non-empty line recognised as a heading.
    found: List[Tuple[int, str]] = []
    for line_no, line in enumerate(stripped_lines):
        hit = _COLREG_HEAD_RE.match(line) if line else None
        if hit is not None:
            found.append((line_no, hit.group(1).strip()))

    if not found:
        windows = _slice_tokens(
            _normalize_ws(source), max_tokens=chunk_tokens, overlap_tokens=overlap_tokens
        )
        return [('Document', window) for window in windows]

    # Each section stops where the next heading starts; the last one runs to the end.
    stops = [line_no for line_no, _ in found[1:]] + [len(stripped_lines)]

    result: List[Tuple[str, str]] = []
    for (begin, title), stop in zip(found, stops):
        body = _normalize_ws(' '.join(line for line in stripped_lines[begin:stop] if line))
        if not body:
            continue
        result.extend(
            (title, window)
            for window in _slice_tokens(body, max_tokens=chunk_tokens, overlap_tokens=overlap_tokens)
        )
    return result

def _minmax(scores: np.ndarray) -> np.ndarray:
    if scores.size == 0:
        return scores
    mn = float(scores.min())
    mx = float(scores.max())
    if mx - mn < 1e-9:
        return np.zeros_like(scores, dtype=float)
    return (scores - mn) / (mx - mn)

@dataclass
class Chunk:
    """One retrievable slice of the attached document."""
    idx: int  # 0-based position in the chunk list (assigned with enumerate() in attach_pdf_text)
    section: str  # title of the section the chunk was cut from
    text: str  # chunk text

@dataclass
class AttachedPDF:
    """Everything retrieval needs about the currently attached document."""
    name: str  # display name (attach_pdf_text is called with the PDF file name)
    text: str  # full extracted text
    chunks: List[Chunk]  # chunks in document order
    bm25: BM25Okapi  # BM25 index built from bm25_tokens
    bm25_tokens: List[List[str]]  # _tokenize() output for each chunk, parallel to chunks
    embedder_name: str  # value of EMBEDDER_NAME at attach time
    embeddings: Optional[np.ndarray]  # one float32 embedding per chunk, or None when no embedder is available

# Module-level retrieval state.
# ATTACHED_PDF is set by attach_pdf_text(); None means no document is attached yet.
ATTACHED_PDF: Optional[AttachedPDF] = None
# Embedding model instance, created on first use by _get_embedder().
_EMBEDDER: Optional['SentenceTransformer'] = None  # type: ignore

def _get_embedder() -> Optional['SentenceTransformer']:
    """Return the shared embedding model, creating it on first call.

    Returns ``None`` when ``SentenceTransformer`` is ``None``, i.e. when the
    sentence-transformers import at the top of this cell failed.
    """
    global _EMBEDDER
    embeddings_available = SentenceTransformer is not None
    if embeddings_available and _EMBEDDER is None:
        # First use: load the model named by EMBEDDER_NAME and keep it for later calls.
        _EMBEDDER = SentenceTransformer(EMBEDDER_NAME)
    return _EMBEDDER if embeddings_available else None

def attach_pdf_text(name: str, text: str) -> None:
    """Chunk ``text``, index it for retrieval and publish it as ``ATTACHED_PDF``.

    Steps: chunk with ``chunk_colreg`` (using ``CHUNK_TOKENS`` /
    ``CHUNK_OVERLAP_TOKENS``), build a BM25 index over the tokenised chunks,
    and, when an embedder is available, compute one normalised float32
    embedding per chunk from ``"<section>. <text>"``.

    Args:
        name: Display name stored with the document.
        text: Full extracted document text.

    Raises:
        ValueError: if no chunk can be built from ``text`` (for example a PDF
            without an extractable text layer).
    """
    global ATTACHED_PDF
    pairs = chunk_colreg(text, chunk_tokens=CHUNK_TOKENS, overlap_tokens=CHUNK_OVERLAP_TOKENS)
    if not pairs:
        # Fail early with an actionable message instead of handing an empty
        # corpus to the BM25 index.
        # NOTE(review): rank_bm25 is believed to raise ZeroDivisionError on an
        # empty corpus - confirm against the installed version.
        raise ValueError(
            f'No text chunks could be built from {name!r}: the document contains no extractable text'
        )

    chunks = [Chunk(idx=i, section=sec, text=ch) for i, (sec, ch) in enumerate(pairs)]
    bm25_tokens = [_tokenize(c.text) for c in chunks]
    bm25 = BM25Okapi(bm25_tokens)

    # Semantic side of the hybrid retriever; optional.
    embedder = _get_embedder()
    embeddings: Optional[np.ndarray] = None
    if embedder is not None:
        # Prefix each chunk with its section title before embedding.
        docs = [f'{c.section}. {c.text}' for c in chunks]
        emb = embedder.encode(docs, normalize_embeddings=True, show_progress_bar=False)
        embeddings = np.asarray(emb, dtype=np.float32)

    ATTACHED_PDF = AttachedPDF(
        name=name,
        text=text,
        chunks=chunks,
        bm25=bm25,
        bm25_tokens=bm25_tokens,
        embedder_name=EMBEDDER_NAME,
        embeddings=embeddings,
    )
    print(f'Attached PDF: {name} ({len(text):,} chars, {len(chunks)} chunks)')
    if embeddings is None:
        print('(Embeddings disabled: sentence-transformers not available)')

def _hybrid_scores(question: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Score every chunk of the attached document against ``question``.

    Returns ``(hybrid, bm25_norm, sem_norm)``, three arrays with one entry per
    chunk. ``bm25_norm`` and ``sem_norm`` are min-max normalised; ``sem_norm``
    stays all-zero when no chunk embeddings or no embedder are available.
    ``hybrid`` is ``BM25_WEIGHT * bm25_norm + SEM_WEIGHT * sem_norm``.

    Requires a document to have been attached (``ATTACHED_PDF`` not None).
    """
    assert ATTACHED_PDF is not None
    attached = ATTACHED_PDF

    # Lexical component.
    raw_bm25 = attached.bm25.get_scores(_tokenize(question))
    lexical = _minmax(np.asarray(raw_bm25, dtype=float))

    # Semantic component: dot product of the normalised query and chunk embeddings.
    semantic = np.zeros_like(lexical, dtype=float)
    embedder = _get_embedder() if attached.embeddings is not None else None
    if embedder is not None:
        encoded = embedder.encode([question], normalize_embeddings=True, show_progress_bar=False)
        query_vec = np.asarray(encoded[0], dtype=np.float32)
        semantic = _minmax(np.dot(attached.embeddings, query_vec).astype(float))

    combined = BM25_WEIGHT * lexical + SEM_WEIGHT * semantic
    return combined, lexical, semantic

def retrieve_context(question: str, *, k: int) -> List[int]:
    """Return the indices of up to ``k`` best-scoring chunks for ``question``.

    Chunks are ranked by descending hybrid score; only chunks with a strictly
    positive score among the top ``max(20, k)`` are eligible. Returns an empty
    list when no document is attached.
    """
    if ATTACHED_PDF is None:
        return []
    scores = _hybrid_scores(question)[0]
    pool_size = max(20, k)
    order = np.argsort(-scores)[:pool_size].tolist()
    positives = [i for i in order if scores[i] > 0]
    return positives[:k]

def build_excerpts(indices: List[int]) -> str:
    """Render the chunks at ``indices`` as the EXCERPTS text of a prompt.

    Each chunk becomes one line ``(Chunk <n>) <text>`` where ``<n>`` is the
    chunk index plus one. Lines are joined with newlines, in the order given.

    Args:
        indices: 0-based indices into ``ATTACHED_PDF.chunks``.

    Returns:
        The rendered excerpts, or ``''`` when ``indices`` is empty.
    """
    # Handle "nothing to render" first: retrieve_context() returns [] when no
    # document is attached, and that result must not trip the assertion below.
    if not indices:
        return ''
    assert ATTACHED_PDF is not None
    chunks = ATTACHED_PDF.chunks
    return '\n'.join(f'(Chunk {idx+1}) {chunks[idx].text}' for idx in indices).strip()

# Build the retrieval index only when the training contexts come from retrieval.
# Fail fast if the PDF is missing before any extraction work starts.
if USE_RETRIEVED_CONTEXT and not PDF_PATH.exists():
    raise FileNotFoundError(f'Missing PDF: {PDF_PATH}')

if USE_RETRIEVED_CONTEXT:
    text = load_pdf_from_path(str(PDF_PATH))
    attach_pdf_text(PDF_PATH.name, text)

## Build the fine-tuning dataset (chat format)
We will train the model to answer using **ONLY** the provided excerpts and to cite chunks like `(Chunk 12)`.

This aligns with your evaluation strategy and encourages better grounded behavior during RAG.

In [None]:
import math

# System message shared by every training example.
SYSTEM_PROMPT = ' '.join([
    'You are a COLREGS assistant. Answer the QUESTION using ONLY the provided EXCERPTS.',
    'If the EXCERPTS do not contain the answer, say you do not know.',
    'Cite evidence using parentheses like (Chunk 12).',
    'Do not invent rule text.',
])


def make_example(question: str, answer: str, *, context_text: str) -> dict:
    """Build one chat-format training example.

    The result is ``{'messages': [system, user, assistant]}``. The user turn
    holds the excerpts followed by the question; a blank ``context_text`` is
    replaced by the placeholder ``[No relevant excerpts found]``. Question,
    answer and context are stripped of surrounding whitespace.
    """
    excerpts = context_text.strip() or '[No relevant excerpts found]'
    user_content = f'EXCERPTS:\n{excerpts}\n\nQUESTION:\n{question.strip()}'
    turns = [
        ('system', SYSTEM_PROMPT),
        ('user', user_content),
        ('assistant', answer.strip()),
    ]
    return {'messages': [{'role': role, 'content': content} for role, content in turns]}

# Turn every validated row into a chat example. The context is either rebuilt
# through retrieval or taken from the row, depending on USE_RETRIEVED_CONTEXT.
examples = []
for r in clean:
    q = r['question']
    a = r['answer']
    if USE_RETRIEVED_CONTEXT:
        idxs = retrieve_context(q, k=RETRIEVE_K)
        ctx = build_excerpts(idxs)
    else:
        ctx = r.get('context', '')
    examples.append(make_example(q, a, context_text=ctx))

# Stop here with a clear message rather than failing later on an empty split.
if not examples:
    raise ValueError(f'No training examples could be built from {DATA_JSON}')

# Seeded shuffle, then a TRAIN_SPLIT / (1 - TRAIN_SPLIT) split.
# At least one example always goes to the training split.
random.seed(SEED)
random.shuffle(examples)
n_train = max(1, int(math.floor(len(examples) * TRAIN_SPLIT)))
train_examples = examples[:n_train]
eval_examples = examples[n_train:]

train_path = OUT_DIR / 'train.jsonl'
eval_path = OUT_DIR / 'eval.jsonl'

# One JSON object per line; ensure_ascii=False keeps accented characters readable.
with open(train_path, 'w', encoding='utf-8') as f:
    for ex in train_examples:
        f.write(json.dumps(ex, ensure_ascii=False) + '\n')
with open(eval_path, 'w', encoding='utf-8') as f:
    for ex in eval_examples:
        f.write(json.dumps(ex, ensure_ascii=False) + '\n')

print('Train examples:', len(train_examples))
print('Eval examples:', len(eval_examples))
if not eval_examples:
    print('WARNING: the eval split is empty (too few samples for TRAIN_SPLIT); eval.jsonl has no rows')
print('Wrote:', train_path)
print('Wrote:', eval_path)
print('Sample user message (truncated):')
print(train_examples[0]['messages'][1]['content'][:400])

## Fine-tune (Option A): Transformers + LoRA/QLoRA (recommended if you have an NVIDIA GPU)
This trains a LoRA adapter on the HF base model, then merges it into full weights.

If this step fails on Windows (common causes: CUDA or bitsandbytes installation problems), either run the training in a Linux environment (for example WSL), or train elsewhere and then continue from the **GGUF conversion** section.

In [None]:
# Install training dependencies
# NOTE: On Windows, GPU QLoRA may require a specific CUDA + PyTorch build.
import sys, subprocess

import importlib
import importlib.util

# Minimum versions for the training stack.
pkgs = [
    'torch',
    'transformers>=4.45.0',
    'datasets>=2.19.0',
    'accelerate>=0.34.0',
    'peft>=0.12.0',
    # The training cell passes `processing_class=` to SFTTrainer and
    # `max_length=` to SFTConfig, which older TRL releases do not accept.
    'trl>=0.16.0',
]
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q'] + pkgs)

# Optional (QLoRA). If bitsandbytes cannot be installed or found, the notebook
# falls back to regular LoRA.
try:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'bitsandbytes'])
    # A zero pip exit status does not prove the package is importable from this
    # kernel: refresh the import caches and look the module up explicitly.
    importlib.invalidate_caches()
    HAS_BNB = importlib.util.find_spec('bitsandbytes') is not None
except Exception:
    HAS_BNB = False

print('bitsandbytes available:', HAS_BNB)

In [None]:
import torch
import os
from pathlib import Path
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig 

# --- Configuration Setup ---
# Relies on names defined in earlier cells: OUT_DIR, BASE_MODEL_ID, HF_TOKEN,
# HAS_BNB, train_path, eval_path and the hyperparameter constants.

# 1. Load Dataset
# eval.jsonl is always written, but it can be empty when there are very few
# samples; only register the eval split when it actually contains rows.
has_eval = eval_path.exists() and eval_path.stat().st_size > 0
data_files = {'train': str(train_path)}
if has_eval:
    data_files['eval'] = str(eval_path)
ds = load_dataset('json', data_files=data_files)

# 2. Tokenizer Setup
tok = AutoTokenizer.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)
if tok.pad_token is None:
    # No dedicated pad token: reuse EOS for padding.
    tok.pad_token = tok.eos_token
# Important for some models to ensure padding is on the correct side
tok.padding_side = 'right'

# 3. Model with QLoRA
# 4-bit NF4 quantisation is used only when bitsandbytes is available AND a CUDA
# device is present; otherwise the model is loaded unquantised (plain LoRA).
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16,
    bnb_4bit_use_double_quant=True,
) if (HAS_BNB and torch.cuda.is_available()) else None

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    token=HF_TOKEN,
    device_map='auto',
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    quantization_config=quantization_config,
)

# 4. LoRA Configuration
# Adapters are attached to the attention and MLP projection layers.
lora = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
    task_type='CAUSAL_LM',
    target_modules=['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'],
)

# 5. Dataset Formatting
def format_chat(example):
    """Render one example's ``messages`` into a single training string.

    Uses the tokenizer's chat template; no generation prompt is appended
    because the assistant turn is already part of the messages.
    """
    text = tok.apply_chat_template(example['messages'], tokenize=False, add_generation_prompt=False)
    return {'text': text}

train_ds = ds['train'].map(format_chat, remove_columns=ds['train'].column_names)
eval_ds = ds['eval'].map(format_chat, remove_columns=ds['eval'].column_names) if has_eval else None

# bf16 mixed precision only when a CUDA device exists and reports bf16 support.
# The availability check comes first so the capability query is never made on a
# CPU-only machine.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# 6. SFTConfig (The "Modern" TrainingArguments)
args = SFTConfig(
    output_dir=str(OUT_DIR / 'runs'),
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    eval_strategy='steps' if (eval_ds is not None) else 'no',
    eval_steps=200 if (eval_ds is not None) else None,
    fp16=False,
    bf16=use_bf16,
    report_to=[],
    seed=SEED,
    # SFT Specific parameters moved into Config
    max_length=MAX_SEQ_LEN,
    dataset_text_field="text",
)

# 7. Trainer Initialization
trainer = SFTTrainer(
    model=model,
    processing_class=tok,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    peft_config=lora,
    args=args,
)

# 8. Run Training
trainer.train()

# 9. Save final adapter
adapter_dir = OUT_DIR / 'lora_adapter'
trainer.save_model(str(adapter_dir))
print('Saved LoRA adapter:', adapter_dir)

In [None]:
# Merge LoRA into full weights (required before GGUF conversion)
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Destination of the merged (base + adapter) Hugging Face checkpoint.
merged_dir = OUT_DIR / 'merged_hf_model'
merged_dir.mkdir(parents=True, exist_ok=True)

# Reload the base model on CPU: half precision when a CUDA device is present,
# full precision otherwise.
base_load_kwargs = dict(
    token=HF_TOKEN,
    torch_dtype=torch.float32 if not torch.cuda.is_available() else torch.float16,
    device_map='cpu',
)
base = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **base_load_kwargs)

# Attach the trained adapter, then fold its weights into the base model.
peft = PeftModel.from_pretrained(base, str(adapter_dir))
merged = peft.merge_and_unload()

# Save a tokenizer next to the weights so the directory is self-contained.
tok2 = AutoTokenizer.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)
if tok2.pad_token is None:
    tok2.pad_token = tok2.eos_token

merged.save_pretrained(merged_dir, safe_serialization=True)
tok2.save_pretrained(merged_dir)
print('Merged HF model saved:', merged_dir)

## Convert merged HF model to GGUF + quantize (llama.cpp)
This produces a `.gguf` you can use with llama.cpp and (optionally) Ollama.

This step clones and builds llama.cpp (needs `git`, `cmake`, and a C++ compiler installed).

In [None]:
import os
import subprocess

LLAMA_CPP_DIR = OUT_DIR / 'llama.cpp'
build_dir = LLAMA_CPP_DIR / 'build'

# Shallow-clone llama.cpp once; an existing checkout is reused as-is.
if not LLAMA_CPP_DIR.exists():
    clone_cmd = ['git', 'clone', '--depth', '1', 'https://github.com/ggerganov/llama.cpp', str(LLAMA_CPP_DIR)]
    subprocess.check_call(clone_cmd)

build_dir.mkdir(exist_ok=True)

# CMake build in two steps. On Windows you may need to adjust the
# generator/toolchain.
for cmake_args in (
    ['-S', str(LLAMA_CPP_DIR), '-B', str(build_dir)],      # configure
    ['--build', str(build_dir), '--config', 'Release'],    # compile
):
    subprocess.check_call(['cmake', *cmake_args])

print('llama.cpp built at:', build_dir)

In [None]:
# Convert HF -> GGUF
import subprocess

import sys

# The llama.cpp conversion script path can vary slightly by version:
# try the current name first, then the name used by older layouts.
convert_py = LLAMA_CPP_DIR / 'convert_hf_to_gguf.py'
if not convert_py.exists():
    # fallback for older layouts
    convert_py = LLAMA_CPP_DIR / 'convert.py'

if not convert_py.exists():
    raise FileNotFoundError('Could not find llama.cpp HF->GGUF conversion script')

gguf_f16 = OUT_DIR / 'colregs_rag_finetuned-f16.gguf'
# Run the script with the kernel's own interpreter (sys.executable) rather than
# whatever `python` resolves to on PATH, so it sees the packages installed by
# this notebook.
subprocess.check_call([
    sys.executable, str(convert_py),
    str(merged_dir),
    '--outfile', str(gguf_f16),
])

print('Wrote:', gguf_f16)

In [None]:
# Quantize to Q4_K_M (similar to your existing file)
import subprocess

# The quantize tool's name and location depend on the llama.cpp version and on
# the CMake generator: it is called `llama-quantize` in current releases and
# `quantize` in older ones, with an `.exe` suffix on Windows, and it may sit in
# build/bin, build/bin/Release or build/Release.
llama_build_dir = OUT_DIR / 'llama.cpp' / 'build'
quant_dirs = [
    llama_build_dir / 'bin',
    llama_build_dir / 'bin' / 'Release',
    llama_build_dir / 'Release',
]
quant_names = ['llama-quantize', 'llama-quantize.exe', 'quantize', 'quantize.exe']
quant_candidates = [d / name for d in quant_dirs for name in quant_names]

# First existing candidate wins.
quant_bin = next((p for p in quant_candidates if p.exists()), None)
if quant_bin is None:
    raise FileNotFoundError('Could not locate llama.cpp quantize binary. Check your build output.')

# Quantize the f16 GGUF produced by the conversion step to Q4_K_M.
gguf_q4 = OUT_DIR / 'colregs_rag_finetuned-Q4_K_M.gguf'
subprocess.check_call([str(quant_bin), str(gguf_f16), str(gguf_q4), 'Q4_K_M'])
print('Wrote:', gguf_q4)

## Next: use the fine-tuned GGUF
- For llama.cpp: point your runner to `colregs_rag_finetuned-Q4_K_M.gguf`
- For Ollama: you can create a new model using a Modelfile that references the GGUF (Ollama support depends on your version).

A natural follow-up is to adapt your existing evaluation notebook so that it can switch between the base model and the fine-tuned GGUF, giving a side-by-side RAG evaluation.