# WordSmith

This notebook contains a self-contained version of the **WordSmith** chatbot.

It implements all three services in one place:
1. Dictionary lookup via the Free Dictionary API
2. Semantic Q&A over local notes using ChromaDB + OpenAI embeddings
3. Online lookup using Wiktionary + Wikipedia summary endpoints

Run the cells in order. When you reach the last cell, a Gradio app will launch in this notebook environment.

In [1]:
# Install dependencies (run once per environment).
# Invoking pip through `sys.executable -m pip` installs into the interpreter that
# this kernel is running, rather than whichever `pip` is first on PATH.
import sys
!{sys.executable} -m pip install gradio chromadb openai requests




[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
from typing import List, Tuple

# Directory layout: everything lives under ./data relative to the directory
# the notebook kernel was started in.
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, "data")
NOTES_DIR, CHROMA_DIR = (
    os.path.join(DATA_DIR, sub_dir) for sub_dir in ("notes", "chroma_db")
)

# Create both working directories up front so later cells can assume they exist.
for _required_dir in (NOTES_DIR, CHROMA_DIR):
    os.makedirs(_required_dir, exist_ok=True)

print("Notes dir:", NOTES_DIR)
print("Chroma dir:", CHROMA_DIR)

def load_api_key_from_secrets():
    """
    Load OPENAI_API_KEY from BASE_DIR/.secrets.template into os.environ.

    The file is scanned line by line for an ``OPENAI_API_KEY=<value>`` entry.
    Leading/trailing whitespace, an optional ``export `` prefix and one pair of
    matching surrounding quotes are tolerated. An entry with an empty value
    (the usual state of an unfilled template) is skipped, so it can never
    overwrite a key that is already present in the environment.

    Returns None in every case; progress is reported via print().
    """
    secrets_path = os.path.join(BASE_DIR, ".secrets.template")
    if not os.path.exists(secrets_path):
        print("No .secrets.template file found; relying on existing OPENAI_API_KEY.")
        return

    # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM, which would
    # otherwise prevent the first line from matching the expected prefix.
    with open(secrets_path, "r", encoding="utf-8-sig") as f:
        for raw_line in f:
            line = raw_line.strip()
            if line.startswith("export "):
                line = line[len("export "):].lstrip()
            if not line.startswith("OPENAI_API_KEY="):
                continue

            key = line.split("=", 1)[1].strip()
            # Accept OPENAI_API_KEY="sk-..." and OPENAI_API_KEY='sk-...'.
            if len(key) >= 2 and key[0] == key[-1] and key[0] in "\"'":
                key = key[1:-1].strip()
            if not key:
                # Empty placeholder: keep scanning and leave the environment alone.
                continue

            os.environ["OPENAI_API_KEY"] = key
            print("Loaded OPENAI_API_KEY from .secrets.template")
            return
    print("OPENAI_API_KEY not found in .secrets.template")

# Try the secrets file first, then fail fast if no key is available from either
# the file or the pre-existing environment: later cells call the OpenAI API.
load_api_key_from_secrets()
assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY must be set to use embeddings and chat."

Notes dir: c:\Users\burni\OneDrive\Desktop\deploying-ai\05_src\assignment_chat\data\notes
Chroma dir: c:\Users\burni\OneDrive\Desktop\deploying-ai\05_src\assignment_chat\data\chroma_db
Loaded OPENAI_API_KEY from .secrets.template


## Build the in-memory notes index (NumPy + OpenAI embeddings)

In [3]:
import os
import math
from typing import List, Dict, Any
import numpy as np
from openai import OpenAI

# NOTES_DIR is defined in the configuration cell above (cell 2).

# Embedding model used both for the note chunks and for search queries, so the
# two sets of vectors are comparable.
EMBED_MODEL = "text-embedding-3-small"

# Shared OpenAI client for embeddings and chat; constructed with no arguments
# (an earlier cell asserts that OPENAI_API_KEY is set in the environment).
_openai_client = OpenAI()

# Global in-memory index, filled by build_notes_index() and read by search_notes().
# The three entries are parallel: row i of "embeddings" belongs to texts[i] / meta[i].
notes_index: Dict[str, Any] = {
    "embeddings": None,   # np.ndarray [N, d] of unit-normalized rows; None until built
    "texts": [],          # list[str], one chunk of note text per row
    "meta": [],           # list[dict] with "source" (file name) and "chunk" (position in file)
}

def chunk_text(text: str, max_chars: int = 800, overlap: int = 200) -> List[str]:
    """
    Split ``text`` into consecutive windows of at most ``max_chars`` characters.

    Each window starts ``max_chars - overlap`` characters after the previous
    one, so neighbouring chunks share ``overlap`` characters. The last chunk
    ends exactly at the end of the text and may be shorter than ``max_chars``.

    Args:
        text: The text to split. An empty string yields an empty list.
        max_chars: Maximum chunk length; must be positive.
        overlap: Characters shared between neighbouring chunks; must satisfy
            ``0 <= overlap < max_chars``.

    Returns:
        The chunks in document order.

    Raises:
        ValueError: If the parameters would prevent the window from advancing
            (this previously caused an endless loop) or would skip text.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    if not 0 <= overlap < max_chars:
        raise ValueError("overlap must satisfy 0 <= overlap < max_chars")

    step = max_chars - overlap  # always >= 1 thanks to the checks above
    chunks: List[str] = []
    n = len(text)
    start = 0
    while start < n:
        end = min(start + max_chars, n)
        chunks.append(text[start:end])
        if end == n:
            break
        start += step
    return chunks

def build_notes_index(batch_size: int = 256):
    """
    Read all .txt files from NOTES_DIR, chunk them, get embeddings from OpenAI,
    and store everything in the global notes_index dict.

    Files are processed in sorted name order so the index layout is the same
    on every run. A file that cannot be read or decoded as UTF-8 is reported
    and skipped instead of aborting the whole build. Whitespace-only chunks
    are dropped.

    If NOTES_DIR is missing or yields no usable chunks, the index is reset to
    its empty state, so a re-run never leaves results from an earlier build.

    Args:
        batch_size: Maximum number of chunks sent per embeddings request.
            Batching keeps large note collections within the API's
            per-request input limits.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    def _reset_index() -> None:
        # Same shape as the initial module-level value of notes_index.
        notes_index["embeddings"] = None
        notes_index["texts"] = []
        notes_index["meta"] = []

    all_texts: List[str] = []
    all_meta: List[Dict[str, Any]] = []

    print("Reading notes from:", NOTES_DIR)
    if not os.path.isdir(NOTES_DIR):
        print("NOTES_DIR does not exist:", NOTES_DIR)
        _reset_index()
        return

    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for fname in sorted(os.listdir(NOTES_DIR)):
        if not fname.lower().endswith(".txt"):
            continue
        path = os.path.join(NOTES_DIR, fname)
        print("  Reading", path)
        try:
            with open(path, "r", encoding="utf-8") as f:
                text = f.read().strip()
        except (OSError, UnicodeDecodeError) as exc:
            print(f"    !! Skipping {fname}: {exc!r}")
            continue
        chunks = chunk_text(text)
        print(f"    -> {len(chunks)} chunks from {fname}")

        for i, ch in enumerate(chunks):
            if not ch.strip():
                continue
            all_texts.append(ch)
            all_meta.append({"source": fname, "chunk": i})

    if not all_texts:
        print("No non-empty chunks found; index will be empty.")
        _reset_index()
        return

    print(f"Prepared {len(all_texts)} chunks. Creating embeddings...")

    vectors = []
    for start in range(0, len(all_texts), batch_size):
        resp = _openai_client.embeddings.create(
            model=EMBED_MODEL,
            input=all_texts[start:start + batch_size],
        )
        vectors.extend(e.embedding for e in resp.data)
    embs = np.array(vectors, dtype="float32")
    print("Success! Embeddings shape:", embs.shape)

    # Normalize for cosine similarity (epsilon guards against a zero vector).
    norms = np.linalg.norm(embs, axis=1, keepdims=True) + 1e-8
    embs_norm = embs / norms

    notes_index["embeddings"] = embs_norm
    notes_index["texts"] = all_texts
    notes_index["meta"] = all_meta

    print("Done! Stored", len(all_texts), "chunks in in-memory index.")

def search_notes(query: str, k: int = 3):
    """
    Return the k chunks of notes_index that are most similar to ``query``.

    The query is embedded with EMBED_MODEL and scaled to unit length; because
    the stored rows are unit-length as well, the dot product is the cosine
    similarity. Each result is a dict with "score", "text" and "meta" keys,
    ordered from most to least similar. An unbuilt index yields [].
    """
    if notes_index["embeddings"] is None:
        print("Index is empty. Run build_notes_index() first.")
        return []

    response = _openai_client.embeddings.create(model=EMBED_MODEL, input=[query])
    query_vec = np.array(response.data[0].embedding, dtype="float32")
    query_vec = query_vec / (np.linalg.norm(query_vec) + 1e-8)

    # One similarity per stored chunk: shape [N].
    similarities = notes_index["embeddings"] @ query_vec
    best_first = np.argsort(-similarities)[:k]

    return [
        {
            "score": float(similarities[row]),
            "text": notes_index["texts"][row],
            "meta": notes_index["meta"][row],
        }
        for row in best_first
    ]

# Build the index now. This calls the OpenAI embeddings API; re-run this cell
# after adding or editing files in NOTES_DIR.
build_notes_index()

Reading notes from: c:\Users\burni\OneDrive\Desktop\deploying-ai\05_src\assignment_chat\data\notes
  Reading c:\Users\burni\OneDrive\Desktop\deploying-ai\05_src\assignment_chat\data\notes\faq.txt
    -> 0 chunks from faq.txt
  Reading c:\Users\burni\OneDrive\Desktop\deploying-ai\05_src\assignment_chat\data\notes\lecture1.txt
    -> 1 chunks from lecture1.txt
  Reading c:\Users\burni\OneDrive\Desktop\deploying-ai\05_src\assignment_chat\data\notes\lecture2.txt
    -> 0 chunks from lecture2.txt
Prepared 1 chunks. Creating embeddings...
Success! Embeddings shape: (1, 1536)
Done! Stored 1 chunks in in-memory index.


## Service 1: Dictionary lookup (Free Dictionary API)

In [4]:
import requests

def define_word_nicely(word: str) -> str:
    """
    Look up ``word`` in the Free Dictionary API and format it as Markdown.

    Only the first entry of the response is used, and at most two definitions
    (with their examples, when present) are shown per part of speech.

    Args:
        word: The word to define; surrounding whitespace is ignored.

    Returns:
        A Markdown string: either the formatted definitions or a friendly
        message explaining why none could be produced. Network and parsing
        problems are reported in the returned text, never raised.
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    word = word.strip()
    if not word:
        return "Please tell me which word you want me to define."

    # Percent-encode the word so characters such as "/", "?" or spaces cannot
    # change the request path.
    encoded_word = quote(word, safe="")
    url = f"https://api.dictionaryapi.dev/api/v2/entries/en/{encoded_word}"
    try:
        resp = requests.get(url, timeout=8)
        if resp.status_code != 200:
            return f"I couldn't find a definition for **{word}** in the dictionary API."
        data = resp.json()
    except (requests.exceptions.RequestException, ValueError):
        # RequestException: connection/timeout problems; ValueError: invalid JSON body.
        return "Something went wrong when contacting the dictionary API. Please try again later."

    if not isinstance(data, list) or not data:
        return f"I couldn't find a definition for **{word}**."
    entry = data[0]
    meanings = entry.get("meanings") if isinstance(entry, dict) else None
    if not meanings:
        return f"The API didn't return any meanings for **{word}**."

    lines = [f"### Definition of **{word}**\n"]
    found_definition = False
    for m in meanings:
        # "or" also covers keys that are present but null in the JSON.
        pos = m.get("partOfSpeech") or ""
        defs = m.get("definitions") or []
        if not defs:
            continue
        found_definition = True
        lines.append(f"**{pos.capitalize()}**:")
        for d in defs[:2]:
            definition = d.get("definition", "(no definition)")
            example = d.get("example")
            lines.append(f"- {definition}")
            if example:
                lines.append(f"  _Example:_ {example}")
        lines.append("")

    if not found_definition:
        # Every meaning was empty: say so instead of returning a bare header.
        return f"The API didn't return any meanings for **{word}**."

    lines.append("If you want, I can also help you use it in a sentence!")
    return "\n".join(lines)

## Service 2: Semantic Q&A over notes (Chroma + OpenAI)

In [5]:
def retrieve_relevant_chunks(question: str, k: int = 4):
    """
    Return up to ``k`` note chunks relevant to ``question``.

    Retrieval runs against the in-memory index built by build_notes_index(),
    via search_notes(). This notebook never creates a Chroma client or
    collection, so the previous Chroma-based lookup failed with a NameError on
    every call.

    Args:
        question: The user's question; it is embedded and compared to the notes.
        k: Maximum number of chunks to return.

    Returns:
        A list of ``(chunk_text, source_file_name)`` tuples, most similar
        first. The list is empty when the index has not been built or holds
        no chunks.
    """
    hits = search_notes(question, k=k)
    return [(hit["text"], hit["meta"].get("source", "notes")) for hit in hits]

# Chat model used to phrase answers from the retrieved notes.
CHAT_MODEL = "gpt-4o-mini"

from typing import List, Tuple

def answer_with_notes(question: str, history: List[Tuple[str, str]]) -> str:
    """
    Answer ``question`` from the notes, using the last two turns of ``history``.

    Steps: retrieve up to four relevant chunks, build a prompt holding the
    question, the recent conversation and the chunks, then ask CHAT_MODEL to
    answer from those notes only. Retrieval and chat failures are printed and
    turned into a friendly message; nothing is raised for them.
    """
    try:
        chunks = retrieve_relevant_chunks(question, k=4)
    except Exception as retrieval_error:
        print("Error during Chroma retrieval:", repr(retrieval_error))
        return "I tried to search your notes but something went wrong with the vector database."

    if not chunks:
        no_notes_reply = (
            "It looks like your question is about '" + question + "', but I don’t have notes on that topic. "
            "However, I can still try to give a short high-level explanation if you’d like!"
        )
        return no_notes_reply

    # One "From <file>:" section per chunk, separated by horizontal rules.
    context = "\n\n---\n\n".join(
        [f"From {source}:\n{text}" for text, source in chunks]
    )

    # Only the two most recent exchanges are included.
    history_text = "\n\n".join(
        [f"User: {u}\nAssistant: {a}" for u, a in history[-2:]]
    )

    system_prompt = (
        "You are WordSmith, a friendly study assistant. You answer questions using ONLY the notes "
        "provided in the context. If the notes are missing information, you say so honestly."
    )
    user_content = (
        f"User question:\n{question}\n\n"
        f"Recent conversation:\n{history_text}\n\n"
        f"Relevant notes:\n{context}\n\n"
        "Now answer the user's question using only these notes."
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]

    try:
        completion = _openai_client.chat.completions.create(
            model=CHAT_MODEL,
            messages=messages,
        )
        return completion.choices[0].message.content.strip()
    except Exception as chat_error:
        print("Error during OpenAI chat call:", repr(chat_error))
        return "Something went wrong when I tried to answer from your notes. Please try again later."

## Service 3: Online lookup (Wiktionary + Wikipedia)

In [6]:
# Sent with every Wikimedia request so the service can identify this client.
USER_AGENT = "WordSmithBot/1.0"

def _fetch_wikimedia_summary(base: str, topic: str):
    """
    Fetch a page summary from a Wikimedia REST ``page/summary`` endpoint.

    Args:
        base: Endpoint URL without the trailing title segment.
        topic: Page title, with underscores in place of spaces.

    Returns:
        ``(title, extract)`` on success, or None when the request fails, the
        status is not 200, the body is not a JSON object, or the page has
        neither an extract nor a description.
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    # Percent-encode the title so "/", "?", "#" and similar characters stay
    # inside the title segment instead of altering the URL structure.
    url = f"{base}/{quote(topic, safe='')}"
    try:
        resp = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=5)
    except requests.exceptions.RequestException:
        return None
    if resp.status_code != 200:
        return None
    try:
        data = resp.json()
    except ValueError:
        return None
    if not isinstance(data, dict):
        return None
    title = data.get("title") or topic.replace("_", " ")
    extract = data.get("extract") or data.get("description")
    if not extract:
        return None
    return title, extract

def lookup_word_online(word: str) -> str:
    """
    Summarise ``word`` from Wiktionary and Wikipedia as a Markdown reply.

    Both sources are queried. When Wikipedia returns a disambiguation-style
    extract ("may refer to"), the dictionary service is used instead.
    Otherwise Wiktionary leads, with Wikipedia appended as extra context when
    available; Wikipedia alone is used when Wiktionary has nothing. If neither
    source has a summary, a not-found message is returned.
    """
    word = word.strip()
    if not word:
        return "I need an actual word or topic to look up online."

    wiktionary_hit = _fetch_wikimedia_summary(
        "https://en.wiktionary.org/api/rest_v1/page/summary", word
    )
    wikipedia_hit = _fetch_wikimedia_summary(
        "https://en.wikipedia.org/api/rest_v1/page/summary", word
    )

    # Disambiguation pages are not useful summaries; fall back to the dictionary.
    if wikipedia_hit:
        lowered_extract = wikipedia_hit[1].lower()
        if lowered_extract.startswith("may refer to") or "may refer to:" in lowered_extract:
            return define_word_nicely(word)

    if wiktionary_hit:
        wk_title, wk_extract = wiktionary_hit
        parts = [f"### Online lookup (Wiktionary): **{wk_title}**\n\n{wk_extract}\n\n"]
        if wikipedia_hit:
            wp_title, wp_extract = wikipedia_hit
            parts.append(
                f"---\n\n### Extra context (Wikipedia): **{wp_title}**\n\n{wp_extract}\n\n"
            )
        parts.append(
            "I grabbed this from public online sources; if you want, I can help rephrase it or explain it further."
        )
        return "".join(parts)

    if not wikipedia_hit:
        return (
            f"I tried looking up **{word}** online, but couldn’t find a useful summary on Wiktionary or Wikipedia."
        )

    wp_title, wp_extract = wikipedia_hit
    return (
        f"### Online summary (Wikipedia): **{wp_title}**\n\n{wp_extract}\n\n"
        "I grabbed this from Wikipedia; I can also help explain it in simpler terms if you like."
    )

def online_search_from_message(message: str) -> str:
    """
    Extract the topic from a "look up ..." style message and look it up online.

    A leading command ("search online for", "search online", "look up",
    "google", "find", "search for") is removed, then a trailing "online", then
    a leading "the", "origin of" and "etymology of" (each at most once, in
    that order). Every phrase is matched case-insensitively and only as a
    whole word, so topics such as "thermodynamics" or "theory" are left
    intact. Whatever remains is joined with underscores and passed to
    lookup_word_online(). A message without a known command is used as the
    topic in full.

    Returns:
        The reply from lookup_word_online(), or a prompt asking for a topic
        when nothing is left after stripping.
    """
    import re  # local import: this cell runs before the notebook's `import re`

    msg = message.strip()

    command = re.match(
        r"(?:search online for|search online|look up|google|find|search for)\b\s*",
        msg,
        flags=re.IGNORECASE,
    )
    if command is None:
        return lookup_word_online("_".join(msg.split()))

    # Slice by the match position in the original string, which keeps the
    # user's capitalisation and does not depend on lowercasing lengths.
    core = msg[command.end():].strip()

    # "find X online" -> "X"; only a standalone trailing word is removed.
    core = re.sub(r"(?:^|\s+)online$", "", core, flags=re.IGNORECASE)

    for filler in ("the", "origin of", "etymology of"):
        core = re.sub(rf"^{filler}\b\s*", "", core, count=1, flags=re.IGNORECASE)
    core = core.strip()

    if not core:
        return "Tell me what you’d like me to look up online."

    # Collapse any run of whitespace into a single underscore.
    topic = "_".join(core.split())
    return lookup_word_online(topic)

## Routing & Guardrails

In [7]:
import re

# Regexes for topics the bot refuses to discuss. They are matched against the
# lowercased message; \b keeps "cat" from matching inside "concatenate".
RESTRICTED_TOPICS = [
    r"\bcat\b",
    r"\bcats\b",
    r"\bdog\b",
    r"\bdogs\b",
    r"\bhoroscope\b",
    r"\bzodiac\b",
    r"taylor swift",
]

# Regexes for attempts to read or override the bot's instructions or secrets.
SYSTEM_PROMPT_ACCESS = [
    r"system prompt",
    r"your prompt",
    r"your instructions",
    r"what are your rules",
    r"reveal.*system",
    r"ignore.*system",
    r"api key",
    r"openai key",
    r"secret key",
]

def is_restricted_topic(message: str) -> bool:
    """Return True if the lowercased message matches any RESTRICTED_TOPICS pattern."""
    lowered = message.lower()
    for pattern in RESTRICTED_TOPICS:
        if re.search(pattern, lowered):
            return True
    return False

def is_system_prompt_access(message: str) -> bool:
    """Return True if the lowercased message matches any SYSTEM_PROMPT_ACCESS pattern."""
    lowered = message.lower()
    for pattern in SYSTEM_PROMPT_ACCESS:
        if re.search(pattern, lowered):
            return True
    return False

# A single dictionary word: word characters, optionally joined by hyphens or
# apostrophes ("well-being", "mother-in-law", "o'clock").
_DICT_WORD = r"(?P<word>\w+(?:[-'’]\w+)*)"

# Whole-message patterns for dictionary requests (matched on the lowercased,
# stripped message). Each one captures the requested word in the "word" group.
DICT_PATTERNS = [
    rf"^\s*define\s+{_DICT_WORD}\s*\??\s*$",
    rf"^\s*what\s+does\s+{_DICT_WORD}\s+mean\??\s*$",
    rf"^\s*meaning\s+of\s+{_DICT_WORD}\s*\??\s*$",
]

# Patterns for online-lookup requests; all are anchored to the message start.
WEB_PATTERNS = [
    r"^search online\b",
    r"^look up\b",
    r"^google\b",
    r"^find .* online\b",
]

def is_dictionary_query(message: str) -> bool:
    """Return True if the message is a single-word dictionary request."""
    msg = message.lower().strip()
    return any(re.match(p, msg) for p in DICT_PATTERNS)

def extract_lookup_word(message: str) -> str:
    """
    Return the word a dictionary request asks about, in lowercase.

    The "word" group of the first matching DICT_PATTERNS entry is used. When
    no pattern matches, the last run of ASCII letters in the message is
    returned, or "" if there is none.
    """
    msg = message.lower().strip()
    for p in DICT_PATTERNS:
        m = re.match(p, msg)
        if m and "word" in m.groupdict():
            return m.group("word")
    tokens = re.findall(r"[a-zA-Z]+", msg)
    return tokens[-1] if tokens else ""

def is_web_query(message: str) -> bool:
    """Return True if the message starts with one of the online-lookup commands."""
    # Strip first: the patterns are anchored with "^", so leading whitespace
    # would otherwise stop an otherwise valid command from matching.
    msg = message.lower().strip()
    return any(re.search(p, msg) for p in WEB_PATTERNS)

def route_message(user_message: str, history: List[Tuple[str, str]]) -> str:
    """
    Decide which service handles ``user_message`` and return its reply.

    Order of checks: guardrails first (instruction/secret probing, then
    restricted topics), then dictionary requests, then online lookups.
    Anything else is answered from the notes, which is the only path that
    uses ``history``.
    """
    # Guardrails: the first matching check short-circuits with its refusal.
    guardrails = (
        (is_system_prompt_access, "Sorry — I can’t share or modify my system instructions."),
        (is_restricted_topic, "Sorry — that topic is restricted and I can’t respond to questions about it."),
    )
    for is_blocked, refusal in guardrails:
        if is_blocked(user_message):
            return refusal

    if is_dictionary_query(user_message):
        return define_word_nicely(extract_lookup_word(user_message))

    if is_web_query(user_message):
        return online_search_from_message(user_message)

    return answer_with_notes(user_message, history)

## Launch Gradio Chat App

In [8]:
import gradio as gr

def _content_to_text(content) -> str:
    """Flatten one history entry's content into plain text."""
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        # NOTE(review): assumes list content is made of strings or
        # {"text": ...} parts — confirm against the installed Gradio version.
        parts = []
        for part in content:
            if isinstance(part, str):
                parts.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                parts.append(part["text"])
        return " ".join(parts)
    return "" if content is None else str(content)

def _history_to_pairs(history):
    """
    Convert Gradio chat history into a list of (user, assistant) tuples.

    Two layouts are accepted: legacy pairs (``[user, assistant]``), which are
    passed through unchanged, and role/content dicts, where each user message
    is paired with the assistant message that follows it.
    """
    pairs = []
    pending_user = None
    for item in history or []:
        if isinstance(item, dict):
            role = item.get("role")
            text = _content_to_text(item.get("content"))
            if role == "user":
                if pending_user is not None:
                    # Two user turns in a row: keep the first with an empty reply.
                    pairs.append((pending_user, ""))
                pending_user = text
            elif role == "assistant":
                pairs.append((pending_user or "", text))
                pending_user = None
        else:
            pairs.append((item[0], item[1]))
    if pending_user is not None:
        pairs.append((pending_user, ""))
    return pairs

def chat_logic(message, history):
    """
    Gradio chat callback: route ``message`` and return the reply text.

    ``history`` is normalised to (user, assistant) tuples first, so the
    callback works with both the legacy pair format and the role/content
    dict format that Gradio can supply.
    """
    return route_message(message, _history_to_pairs(history))

def build_demo():
    """Create the WordSmith Gradio ChatInterface, wired to chat_logic."""
    # Markdown shown above the chat box: one section per service, with examples.
    description = (
        "WordSmith is a dictionary-style study assistant with three main skills:\n\n"
        "### 1️⃣ Dictionary lookup (Service 1)\n"
        "Ask me to define or explain an English word using an online dictionary API:\n"
        "- `define anomaly`\n"
        "- `what does futile mean?`\n"
        "- `meaning of heuristic`\n\n"
        "### 2️⃣ Questions answered from your notes (Service 2)\n"
        "I can answer conceptual questions by searching a small set of course notes that have been "
        "embedded into a ChromaDB vector store:\n"
        "- `what is overfitting?`\n"
        "- `explain cross-validation`\n"
        "- `summarize philosophy from my notes`\n\n"
        "If your notes don’t contain anything relevant, I’ll say so instead of guessing.\n\n"
        "### 3️⃣ Online lookups (Service 3)\n"
        "I can fetch short summaries from Wiktionary and Wikipedia for general topics or word origins:\n"
        "- `look up mitochondria`\n"
        "- `google etymology of serendipity`\n"
        "- `search online for the origin of melancholy`\n\n"
        "If you’re not sure what to do, just type a question or try one of the examples above!"
    )
    interface = gr.ChatInterface(
        fn=chat_logic,
        title="WordSmith",
        description=description,
        theme="soft",
    )
    return interface

# Build the chat UI and start the Gradio app from this notebook.
demo = build_demo()
demo.launch()

  from .autonotebook import tqdm as notebook_tqdm
  self.chatbot = Chatbot(


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


