# DeepSeek-V3.2 Chatbot

This notebook runs a minimal chatbot against the Hugging Face hosted model `deepseek-ai/DeepSeek-V3.2`.

Prerequisite: store your Hugging Face access token in the `HF_TOKEN` environment variable (recommended), or enter it when the cell below asks for it. Never hardcode a real token in a notebook that is shared or committed.

In [3]:
# Token setup
import os

# Recommended: set HF_TOKEN in your OS environment and restart the kernel.
# Example (PowerShell): setx HF_TOKEN "hf_..."
#
# SECURITY: a token literal used to be written directly in this cell. Treat that
# token as leaked and revoke it in your Hugging Face account settings.
# The token is now read from the environment; if it is missing, it is requested
# through a hidden prompt so the value is never stored in the notebook file.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass

    _entered_token = getpass("Hugging Face token (input hidden): ").strip()
    if not _entered_token:
        raise RuntimeError("HF_TOKEN is required: set it in the environment or enter it at the prompt.")
    os.environ["HF_TOKEN"] = _entered_token
    # Do not keep a second copy of the secret in the kernel namespace.
    del _entered_token

In [16]:
# Quick connectivity check (does NOT print your token)
import os, requests

# Report only whether a token is present and how long it is, never its value.
hf = os.environ.get("HF_TOKEN")
token_length = len(hf) if hf else None
print("HF_TOKEN set?", bool(hf), "len=", token_length)

# Router should exist (401/404/200 depends on auth/model availability)
url = "https://router.huggingface.co/hf-inference/models/deepseek-ai/DeepSeek-V3.2"

# The Authorization header is only sent when a token is available.
headers = {}
if hf:
    headers["Authorization"] = f"Bearer {hf}"

r = requests.get(url, headers=headers, timeout=20)
print("GET", url, "->", r.status_code)

# Show at most the first 200 characters of the response body.
body_preview = r.text or ""
print(body_preview[:200])

HF_TOKEN set? True len= 37
GET https://router.huggingface.co/hf-inference/models/deepseek-ai/DeepSeek-V3.2 -> 404
Not Found


In [17]:
# Test OpenAI-compatible router endpoint (optional)
import os, requests, json

hf = os.getenv("HF_TOKEN")
url = "https://router.huggingface.co/v1/chat/completions"
payload = {
    "model": "deepseek-ai/DeepSeek-V3.2",
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "max_tokens": 32,
    "temperature": 0.2,
}

# Attach credentials only when a token exists: formatting a missing token would
# send the literal header value "Bearer None". This mirrors the connectivity check.
headers = {"Content-Type": "application/json"}
if hf:
    headers["Authorization"] = f"Bearer {hf}"
else:
    print("HF_TOKEN is not set; sending the request without credentials.")

resp = requests.post(url, headers=headers, json=payload, timeout=30)
print("POST", url, "->", resp.status_code)
# Only the first 300 characters of the body are shown to keep the output short.
print((resp.text or "")[:300])

POST https://router.huggingface.co/v1/chat/completions -> 200
{"id":"58a044cf2762af690d388558c739250f","object":"chat.completion","created":1767611248,"model":"deepseek/deepseek-v3.2","choices":[{"index":0,"message":{"role":"assistant","content":"Hello, it's a pleasure to meet you."},"finish_reason":"stop"}],"usage":{"prompt_tokens":10,"completion_tokens":11,"


In [18]:
# Create the shared chatbot client from the local `deepseek_chat` module.
import importlib
import deepseek_chat

# Reload the module before importing the class from it; presumably this is so that
# edits to the module are picked up without restarting the kernel. The from-import
# must stay after the reload so that it binds the class from the reloaded module.
importlib.reload(deepseek_chat)
from deepseek_chat import DeepSeekChatbot

# `bot` is a notebook-level name that later cells call (bot.reply / bot.reset).
bot = DeepSeekChatbot(model="deepseek-ai/DeepSeek-V3.2")
print("Ready. Try: bot.reply('Hello!')")

Ready. Try: bot.reply('Hello!')


In [7]:
# --- PDF attachment (RAG-lite) ---
from __future__ import annotations

import io
import re
from dataclasses import dataclass
from typing import List, Tuple

# Install lightweight deps if missing (safe to re-run)
# NOTE(review): the install below is unpinned and runs on ANY exception raised while
# importing pypdf, not only when the package is absent -- consider pinning a version.
try:
    from pypdf import PdfReader  # type: ignore
except Exception:
    import sys, subprocess
    # Use the kernel's own interpreter so the package lands in the active environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "pypdf"])
    from pypdf import PdfReader  # type: ignore

# ipywidgets is optional: when it cannot be imported, both names are set to None and
# show_pdf_uploader() falls back to printing a hint instead of rendering a widget.
try:
    import ipywidgets as widgets  # type: ignore
    from IPython.display import display  # type: ignore
except Exception:
    widgets = None
    display = None


_WORD_RE = re.compile(r"\b\w+\b", re.UNICODE)
_STOPWORDS = {
    "the","a","an","and","or","to","of","in","on","for","with","is","are","was","were","be","as","at","by","it","this","that",
    "le","la","les","un","une","des","et","ou","de","du","dans","sur","pour","avec","est","sont","été","être","ce","cet","cette","ces",
}


def _tokenize(text: str) -> List[str]:
    words = [w.lower() for w in _WORD_RE.findall(text)]
    return [w for w in words if len(w) >= 2 and w not in _STOPWORDS]


def _pdf_bytes_to_text(data: bytes) -> str:
    reader = PdfReader(io.BytesIO(data))
    parts: List[str] = []
    for page in reader.pages:
        try:
            parts.append(page.extract_text() or "")
        except Exception:
            parts.append("")
    return "\n".join(parts).strip()


def load_pdf_from_path(path: str) -> str:
    """Read the PDF stored at ``path`` and return its extracted text."""
    with open(path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()
    return _pdf_bytes_to_text(raw_bytes)


def chunk_text(text: str, *, max_chars: int = 1800, overlap: int = 250) -> List[str]:
    """Split ``text`` into overlapping fixed-size character windows.

    Whitespace runs are first collapsed to single spaces and the text is
    stripped. Each chunk holds at most ``max_chars`` characters and starts
    ``max_chars - overlap`` characters after the previous one.

    Args:
        text: Text to split.
        max_chars: Maximum chunk length; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``max_chars``.

    Returns:
        The chunks in document order, or an empty list for blank text.

    Raises:
        ValueError: If ``max_chars`` is not positive or ``overlap`` is not
            smaller than ``max_chars``. Such values previously made the window
            stop advancing, so the function never returned.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    if overlap >= max_chars:
        raise ValueError("overlap must be smaller than max_chars")

    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return []

    total = len(text)
    step = max_chars - overlap  # strictly positive thanks to the checks above
    chunks: List[str] = []
    for start in range(0, total, step):
        end = min(total, start + max_chars)
        chunks.append(text[start:end])
        if end >= total:
            # The window reached the end; a further step would only repeat the tail.
            break
    return chunks


def _score(query_tokens: List[str], chunk: str) -> int:
    # Simple lexical overlap score (fast, no embeddings).
    chunk_tokens = _tokenize(chunk)
    if not chunk_tokens or not query_tokens:
        return 0
    chunk_set = set(chunk_tokens)
    score = 0
    for t in query_tokens:
        if t in chunk_set:
            score += 2
    return score


def retrieve_context(question: str, chunks: List[str], *, k: int = 4) -> List[Tuple[int, str]]:
    """Return up to ``k`` chunks that best match ``question``.

    Chunks are ranked by ``_score`` in descending order, with ties keeping
    document order. Among the ``k`` best-ranked chunks, only those with a
    positive score are returned, as ``(chunk_index, chunk_text)`` pairs.
    """
    query_tokens = _tokenize(question)
    scores = [_score(query_tokens, chunk) for chunk in chunks]
    # sorted() is stable, so equally scored chunks stay in their original order.
    ranking = sorted(range(len(chunks)), key=scores.__getitem__, reverse=True)
    return [(idx, chunks[idx]) for idx in ranking[:k] if scores[idx] > 0]


def build_context(excerpts: List[Tuple[int, str]]) -> str:
    """Render retrieved excerpts as one prompt-ready string.

    Each excerpt becomes ``[Chunk N] <text>``, where ``N`` is the 1-based
    chunk index. Excerpts are separated by a blank line, and an empty input
    yields an empty string.
    """
    if not excerpts:
        return ""
    return "\n\n".join(f"[Chunk {index + 1}] {body}" for index, body in excerpts)


@dataclass
class AttachedPDF:
    """In-memory record of the PDF currently attached to the chat session."""

    name: str  # label shown in prompts and status messages
    text: str  # full text handed to attach_pdf_text
    chunks: List[str]  # result of chunk_text(text); searched by retrieve_context


# Currently attached PDF, or None when nothing is attached.
# Set by attach_pdf_text() and reset to None by clear_pdf().
ATTACHED_PDF: AttachedPDF | None = None
# Default number of excerpts retrieved per question; chat_loop's "/k N" command changes it.
RETRIEVE_K = 4


def attach_pdf_text(name: str, text: str) -> None:
    """Chunk ``text`` and register it as the attached PDF.

    Replaces the module-level ``ATTACHED_PDF`` and prints a one-line summary
    with the character and chunk counts.
    """
    global ATTACHED_PDF
    pieces = chunk_text(text)
    ATTACHED_PDF = AttachedPDF(name=name, text=text, chunks=pieces)
    summary = f"Attached PDF: {name} ({len(text):,} chars, {len(pieces)} chunks)"
    print(summary)


def clear_pdf() -> None:
    """Detach the current PDF by resetting ``ATTACHED_PDF`` to None, then print a confirmation."""
    global ATTACHED_PDF
    ATTACHED_PDF = None
    print("PDF cleared")


def ask_on_pdf(question: str, *, k: int | None = None) -> str:
    """Answer ``question`` with the bot, grounded in the attached PDF when present.

    Without an attached PDF the question is forwarded to ``bot.reply``
    unchanged. Otherwise up to ``k`` excerpts (``RETRIEVE_K`` when ``k`` is
    None) are retrieved and embedded in a prompt that restricts the model to
    those excerpts.
    """
    pdf = ATTACHED_PDF
    if not pdf:
        return bot.reply(question)

    top_k = k if k is not None else RETRIEVE_K
    context = build_context(retrieve_context(question, pdf.chunks, k=top_k))

    instructions = (
        "You are evaluating a model. Answer the user's question using ONLY the provided document excerpts. "
        "If the excerpts do not contain the answer, say you don't know and ask what to look for."
    )
    # A placeholder keeps the prompt layout stable when nothing was retrieved.
    excerpt_block = context or "[No relevant excerpts found]"
    sections = [
        instructions,
        f"Document: {pdf.name}",
        f"EXCERPTS:\n{excerpt_block}",
        f"QUESTION:\n{question}",
    ]
    return bot.reply("\n\n".join(sections))


def _extract_first_upload(value) -> tuple[str, bytes] | None:
    """Return (filename, content_bytes) for the first uploaded file.

    ipywidgets has changed FileUpload.value across versions: it can be a dict, tuple, or list.
    """
    if not value:
        return None
    # Case 1: dict keyed by filename -> {"file.pdf": {"content": b"...", ...}}
    if isinstance(value, dict):
        first_key = next(iter(value.keys()))
        first = value[first_key]
        if isinstance(first, dict) and "content" in first:
            name = first.get("name") or first_key
            return name, first["content"]
        # Sometimes it's already a dict with content
        if isinstance(first, (bytes, bytearray)):
            return first_key, bytes(first)
    # Case 2: tuple/list of dicts -> ({"name":..., "content":...}, ...)
    if isinstance(value, (tuple, list)):
        first = value[0]
        if isinstance(first, dict) and "content" in first:
            name = first.get("name") or "uploaded.pdf"
            return name, first["content"]
        # Case 3: UploadedFile-like object
        name = getattr(first, "name", "uploaded.pdf")
        content = getattr(first, "content", None)
        if content is None:
            content = getattr(first, "data", None)
        if content is None and isinstance(first, (bytes, bytearray)):
            content = bytes(first)
        if isinstance(content, (bytes, bytearray)):
            return name, bytes(content)
    # Fallback: try attributes on the value itself
    name = getattr(value, "name", "uploaded.pdf")
    content = getattr(value, "content", None)
    if isinstance(content, (bytes, bytearray)):
        return name, bytes(content)
    return None


def show_pdf_uploader() -> None:
    """Display a small upload widget that attaches the selected PDF.

    When ipywidgets (or IPython's ``display``) is unavailable, a hint
    describing the path-based alternative is printed instead. Clicking
    "Load PDF" extracts the text of the first uploaded file and registers it
    through ``attach_pdf_text``. Messages and errors are shown in the
    widget's own output area.
    """
    if widgets is None or display is None:
        # attach_pdf_text takes (name, text); the previous hint omitted the name argument.
        print(
            "ipywidgets not available in this environment. "
            "Use attach_pdf_text(name, load_pdf_from_path(path))"
        )
        return
    uploader = widgets.FileUpload(accept=".pdf", multiple=False)
    button = widgets.Button(description="Load PDF")
    out = widgets.Output()

    def _on_click(_):
        # Route everything printed by the handler into the Output widget.
        with out:
            out.clear_output()
            item = _extract_first_upload(uploader.value)
            if not item:
                print("Select a PDF first")
                return
            name, data = item
            try:
                text = _pdf_bytes_to_text(data)
            except Exception as exc:
                # A corrupt or non-PDF upload should produce a readable message, not a traceback.
                print(f"Could not read {name}: {exc}")
                return
            attach_pdf_text(name, text)

    button.on_click(_on_click)
    display(widgets.VBox([widgets.HTML("<b>Attach a PDF</b>"), uploader, button, out]))


# Run this to attach a PDF via UI (if supported):
# show_pdf_uploader()

# Or attach from a local path (Windows example):
# attach_pdf_text("my.pdf", load_pdf_from_path(r"C:\\path\\to\\file.pdf"))

In [None]:
# Renders the upload widget. The PDF is only attached once "Load PDF" is clicked, so the
# notebook state depends on manual interaction; use attach_pdf_text(...) with a path for reproducible runs.
show_pdf_uploader()

VBox(children=(HTML(value='<b>Attach a PDF</b>'), FileUpload(value=(), accept='.pdf', description='Upload'), B…

In [9]:
def chat_loop():
    """Interactive loop.

    Reads lines with ``input()`` and prints the bot's answers until the user
    leaves. When a PDF is attached, questions are answered through
    ``ask_on_pdf``; otherwise they go straight to ``bot.reply``.

    Commands:
      /reset       clear chat history
      /exit        quit (also /quit)
      /pdf         show PDF status
      /clearpdf    detach current PDF
      /k N         set retrieval chunks (default 4)

    End-of-input or a keyboard/kernel interrupt while waiting for input also
    ends the loop cleanly instead of raising.
    """
    global RETRIEVE_K
    print("Type '/reset', '/exit'. Use '/pdf' after attaching a PDF.")
    while True:
        try:
            user = input("You> ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel is a normal way to leave the chat.
            print("\n(chat ended)")
            break
        if not user:
            continue
        low = user.lower()
        if low in {"/exit", "/quit"}:
            break
        if low == "/reset":
            bot.reset()
            print("(history cleared)")
            continue
        if low == "/pdf":
            if ATTACHED_PDF:
                print(f"(pdf attached: {ATTACHED_PDF.name}, {len(ATTACHED_PDF.chunks)} chunks, k={RETRIEVE_K})")
            else:
                print("(no pdf attached) -> run show_pdf_uploader() or attach_pdf_text(...)")
            continue
        if low == "/clearpdf":
            clear_pdf()
            continue
        if low.startswith("/k "):
            try:
                # Values below 1 are clamped so at least one chunk is always retrieved.
                RETRIEVE_K = max(1, int(low.split()[1]))
            except ValueError:
                # Only a non-integer argument is expected here; anything else should surface.
                print("Usage: /k 4")
            else:
                print(f"(k set to {RETRIEVE_K})")
            continue

        answer = ask_on_pdf(user) if ATTACHED_PDF else bot.reply(user)
        print(f"Bot> {answer}\n")


# Run interactive chat in the notebook output
# chat_loop()

In [19]:
# Starts the interactive chat; the cell keeps waiting on input() until '/exit' or '/quit' is typed.
chat_loop()

Type '/reset', '/exit'. Use '/pdf' after attaching a PDF.
(pdf attached: COLREG-Consolidated-2018.pdf, 66 chunks, k=4)
Bot> According to the provided document excerpts, "turning to port" is indicated by **two short blasts** on the whistle.

**Source:** Chunk 32 states: "two short blasts to mean 'I am altering my course to port'".

Bot> Based solely on the provided excerpts, the signal for "turning to port" is **two short blasts** on the whistle.

**Source:** Chunk 32 explicitly states: "two short blasts to mean 'I am altering my course to port'".

Bot> Based solely on the provided excerpts, the signal for "overtake on starboard" is **two prolonged blasts followed by one short blast** on the whistle.

**Source:** Chunk 33 states: "two prolonged blasts followed by one short blast to mean 'I intend to overtake you on your starboard side'".



In [20]:
# --- Batch evaluation from CSV (RAG) ---
from pathlib import Path

import pandas as pd


def _find_existing_path(*candidates: str) -> Path:
    for c in candidates:
        p = Path(c)
        if p.exists():
            return p.resolve()
    raise FileNotFoundError(f"None of these paths exist: {candidates}")


def _read_csv_robust(path: Path) -> pd.DataFrame:
    try:
        return pd.read_csv(path)
    except UnicodeDecodeError:
        # Common fallbacks for Windows CSV exports
        for enc in ("utf-8-sig", "cp1252", "latin1"):
            try:
                return pd.read_csv(path, encoding=enc)
            except Exception:
                pass
        raise


# Notebook usually runs with CWD = DeepSeek_test/. These files are at the workspace root.
import os


def _data_file_candidates(filename):
    """Return the locations to probe for ``filename``, in priority order.

    If the RAG_EVAL_DIR environment variable is set, that folder is tried
    first. The current directory, its parent and its grandparent follow. A
    machine-specific absolute folder is kept as the last resort so existing
    setups keep working.
    """
    candidates = []
    override_dir = os.environ.get("RAG_EVAL_DIR")
    if override_dir:
        candidates.append(str(Path(override_dir) / filename))
    candidates += [
        filename,
        str(Path("..") / filename),
        str(Path("..") / ".." / filename),
        # Legacy fallback tied to one workstation; prefer RAG_EVAL_DIR on other machines.
        r"C:\Users\celli\Documents\.PIP2026\PIP-2026-LOTUSim_G5_Fatigue" + "\\" + filename,
    ]
    return candidates


CSV_PATH = _find_existing_path(*_data_file_candidates("Set évaluation RAG.csv"))
PDF_PATH = _find_existing_path(*_data_file_candidates("COLREG-Consolidated-2018.pdf"))

print("CSV:", CSV_PATH)
print("PDF:", PDF_PATH)

# Load CSV and keep only the Question column for now
df_questions = _read_csv_robust(CSV_PATH)
if "Question" not in df_questions.columns:
    raise KeyError(f"CSV must have a 'Question' column. Found: {list(df_questions.columns)}")
questions = df_questions["Question"].dropna().astype(str).tolist()
print(f"Loaded {len(questions)} questions")

# Attach the PDF only when nothing is attached yet or a different file name is attached
if ATTACHED_PDF is None or (ATTACHED_PDF.name != PDF_PATH.name):
    attach_pdf_text(PDF_PATH.name, load_pdf_from_path(str(PDF_PATH)))

CSV: C:\Users\celli\Documents\.PIP2026\PIP-2026-LOTUSim_G5_Fatigue\Set évaluation RAG.csv
PDF: C:\Users\celli\Documents\.PIP2026\PIP-2026-LOTUSim_G5_Fatigue\COLREG-Consolidated-2018.pdf
Loaded 12 questions


In [21]:
# Run the evaluation: ask each question on the attached PDF
MAX_QUESTIONS = None  # set to an int for a quick dry-run, e.g. 5

questions_to_run = questions if not MAX_QUESTIONS else questions[:MAX_QUESTIONS]
results = []
total = len(questions_to_run)
for i, q in enumerate(questions_to_run, start=1):
    bot.reset()  # avoid history leakage across questions
    ans = ask_on_pdf(q)
    results.append({"Question": q, "Answer": ans})
    if i % 10 == 0 or i == total:
        print(f"Done {i}/{total}")

# Explicit columns keep the CSV header present even when no question was run.
df_results = pd.DataFrame(results, columns=["Question", "Answer"])

# Save results next to the notebook
OUT_PATH = Path("rag_eval_answers.csv").resolve()
df_results.to_csv(OUT_PATH, index=False)
print("Saved:", OUT_PATH)

# Preview of the answers: only the last expression of a cell is rendered, so it must come last.
df_results.head()

Done 10/12
Done 12/12
Saved: C:\Users\celli\Documents\.PIP2026\PIP-2026-LOTUSim_G5_Fatigue\DeepSeek_test\rag_eval_answers.csv
