# Planner → Search → Synthesis (Groq + Tavily)

This notebook makes Groq act as a **planner** first (understand intent, resolve dates like "yesterday", generate precise search queries),
then calls **Tavily** to search, optionally fetches **article bodies**, and finally uses Groq again to **write** a summarized answer with citations.

**Requirements (.env or env vars):**
- `TAVIL_API_KEY` (checked first) or `TAVILY_API_KEY` (used as a fallback when the first is unset or empty)
- `GROQ_API_KEY`
- (Optional) `USER_TZ` (defaults to `Asia/Colombo`)

## 1) (Optional) Install dependencies

In [None]:
# If running locally, uncomment:
# %pip install requests python-dotenv beautifulsoup4

## 2) Load environment, time helpers, HTTP utils

In [2]:
import os, json, requests, re
from typing import List, Dict, Tuple
from datetime import datetime, timedelta
try:
    from zoneinfo import ZoneInfo
except Exception:
    ZoneInfo = None

from dotenv import load_dotenv
load_dotenv()

# Time zone name passed to ZoneInfo when resolving relative dates; overridable via USER_TZ.
USER_TZ = os.getenv("USER_TZ", "Asia/Colombo")

# The Tavily key is read from TAVIL_API_KEY first; TAVILY_API_KEY is used when that is unset/empty.
TAVIL_API_KEY = os.getenv("TAVIL_API_KEY", "") or os.getenv("TAVILY_API_KEY", "")
GROQ_API_KEY  = os.getenv("GROQ_API_KEY", "")
# Planner and writer models can be configured independently; both default to the same model.
GROQ_MODEL_PLAN  = os.getenv("GROQ_MODEL_PLAN",  "llama-3.3-70b-versatile")
GROQ_MODEL_WRITE = os.getenv("GROQ_MODEL_WRITE", "llama-3.3-70b-versatile")

# Report only whether each key is present (bool), never the key value itself.
print("TAVIL_API_KEY present? ", bool(TAVIL_API_KEY))
print("GROQ_API_KEY present?  ", bool(GROQ_API_KEY))
print("Time zone:             ", USER_TZ)
print("Groq models (plan/write):", GROQ_MODEL_PLAN, "/", GROQ_MODEL_WRITE)

def pjson(obj):
    """Print ``obj`` as JSON indented by two spaces, leaving non-ASCII characters unescaped."""
    rendered = json.dumps(obj, ensure_ascii=False, indent=2)
    print(rendered)

def safe_post(url, **kwargs):
    """POST to ``url`` and return the response, or ``None`` if anything goes wrong.

    Args:
        url: Target URL.
        **kwargs: Forwarded to ``requests.post`` (e.g. ``headers``, ``json``).
            ``timeout`` defaults to 45 seconds when the caller does not supply one.

    Returns:
        The response object when the request completed and ``raise_for_status()``
        did not raise; otherwise ``None`` after printing the error (and the
        response body, when the exception carries one).
    """
    # Only fill in the default. Passing ``timeout=45`` alongside ``**kwargs`` made
    # every call with a caller-supplied timeout fail with a duplicate-keyword
    # TypeError, which the handler below then swallowed.
    kwargs.setdefault("timeout", 45)
    try:
        r = requests.post(url, **kwargs)
        r.raise_for_status()
        return r
    except Exception as e:
        # Deliberately best-effort: callers treat ``None`` as "no data".
        print(f"[POST ERROR] {url} -> {e}")
        resp = getattr(e, "response", None)
        if resp is not None:
            print(resp.text)
        return None

def safe_get(url, **kwargs):
    """GET ``url`` and return the response, or ``None`` if anything goes wrong.

    Args:
        url: Target URL.
        **kwargs: Forwarded to ``requests.get`` (e.g. ``headers``, ``params``).
            ``timeout`` defaults to 45 seconds when the caller does not supply one.

    Returns:
        The response object when the request completed and ``raise_for_status()``
        did not raise; otherwise ``None`` after printing the error (and the
        response body, when the exception carries one).
    """
    # Only fill in the default. Passing ``timeout=45`` alongside ``**kwargs`` made
    # every call with a caller-supplied timeout fail with a duplicate-keyword
    # TypeError, which the handler below then swallowed.
    kwargs.setdefault("timeout", 45)
    try:
        r = requests.get(url, **kwargs)
        r.raise_for_status()
        return r
    except Exception as e:
        # Deliberately best-effort: callers treat ``None`` as "no data".
        print(f"[GET ERROR] {url} -> {e}")
        resp = getattr(e, "response", None)
        if resp is not None:
            print(resp.text)
        return None

def resolve_relative_date(text: str) -> str:
    """Turn a relative-day word found in ``text`` into an absolute ``YYYY-MM-DD`` date.

    Recognizes the substrings "yesterday", "today" and "tomorrow"
    (case-insensitive, checked in that order). Any other text, including
    ``None`` or an empty string, resolves to the current date.

    "Now" is taken in the ``USER_TZ`` zone when ``ZoneInfo`` is available and
    the zone can be loaded; otherwise it falls back to UTC.

    Args:
        text: Free-form user text; may be ``None``.

    Returns:
        The resolved date formatted as ``YYYY-MM-DD``.
    """
    # Local import: the file-level import only brings in ``datetime`` and ``timedelta``.
    from datetime import timezone

    lower = (text or "").lower()

    # Timezone-aware UTC fallback; ``datetime.utcnow()`` is deprecated and returns a naive value.
    now = datetime.now(timezone.utc)
    if ZoneInfo:
        try:
            now = datetime.now(ZoneInfo(USER_TZ))
        except Exception as exc:
            # Stay best-effort, but say why the configured zone was ignored.
            print(f"[TZ WARNING] could not use time zone {USER_TZ!r} ({exc}); falling back to UTC")

    # First matching keyword wins, preserving the original precedence.
    offset_days = 0
    for keyword, days in (("yesterday", -1), ("today", 0), ("tomorrow", 1)):
        if keyword in lower:
            offset_days = days
            break

    return (now + timedelta(days=offset_days)).strftime("%Y-%m-%d")

TAVIL_API_KEY present?  True
GROQ_API_KEY present?   True
Time zone:              Asia/Colombo
Groq models (plan/write): llama-3.3-70b-versatile / llama-3.3-70b-versatile


In [13]:
# ===== Logging helpers =====
from textwrap import shorten
import json, re

def section(title: str, char="="):
    """Print a blank line, then ``title`` framed by twelve ``char`` characters on each side."""
    rule = char * 12
    banner = "".join(("\n", rule, f" {title} ", rule))
    print(banner)

def t(text: str, n=140):
    """Flatten ``text`` onto one line and shorten it to at most ``n`` characters for logs."""
    single_line = (text or "").replace("\n", " ")
    return shorten(single_line, width=n, placeholder="…")

def pjson(obj):
    """Print ``obj`` as JSON indented by two spaces, leaving non-ASCII characters unescaped."""
    rendered = json.dumps(obj, ensure_ascii=False, indent=2)
    print(rendered)

def preview_block(text: str, n=800):
    """Print the first ``n`` characters of ``text``, appending " …" when something was cut off."""
    head = (text or "")[:n]
    # The marker is added only for non-empty text that is longer than the preview window.
    marker = " …" if text and len(text) > n else ""
    print(head + marker)

## 3) Tavily search

In [3]:
def tavily_search(query: str, max_results: int = 5) -> Dict:
    """Run one Tavily search and return the decoded JSON response.

    Args:
        query: Search query text.
        max_results: Value sent as ``max_results`` in the request payload.

    Returns:
        The response decoded into a dict, or ``{}`` when the request failed,
        the body was not valid JSON, or the decoded JSON was not an object.

    Raises:
        RuntimeError: If no Tavily API key is configured.
    """
    if not TAVIL_API_KEY:
        raise RuntimeError("Missing TAVIL_API_KEY (or TAVILY_API_KEY). Put it in your environment/.env.")
    url = "https://api.tavily.com/search"
    headers = {"Authorization": f"Bearer {TAVIL_API_KEY}"}
    payload = {"query": query, "max_results": max_results}
    r = safe_post(url, headers=headers, json=payload)
    if not r:
        return {}
    try:
        data = r.json()
    except ValueError as e:
        # A non-JSON body should degrade to "no results", like every other failure here.
        print(f"[TAVILY ERROR] non-JSON response for {query!r} -> {e}")
        return {}
    # Callers use ``.get`` on the result, so never hand back a list or scalar.
    return data if isinstance(data, dict) else {}

## 4) (Optional) Fetch article bodies for top links

In [4]:
from bs4 import BeautifulSoup

def fetch_article_text(url: str, max_chars: int = 2000) -> str:
    """Download ``url`` and return the text of its ``<p>`` elements, whitespace-collapsed.

    Args:
        url: Page to fetch (requested with a browser-like User-Agent header).
        max_chars: Maximum number of characters to return.

    Returns:
        Up to ``max_chars`` characters of paragraph text, or ``""`` when the
        download fails or the page cannot be processed.
    """
    response = safe_get(url, headers={"User-Agent": "Mozilla/5.0"})
    if not response:
        return ""
    try:
        document = BeautifulSoup(response.text, "html.parser")
        joined = "\n".join(
            [node.get_text(" ", strip=True) for node in document.find_all("p")]
        )
        # Newlines and runs of blanks all become single spaces.
        flattened = re.sub(r"\s+", " ", joined).strip()
        return flattened[:max_chars]
    except Exception:
        return ""

## 5) Planner step (Groq): understand intent → generate search queries

In [None]:
def groq_plan(user_question: str) -> Dict:
    """Ask the Groq planner model for a search plan and return it as a validated dict.

    The model is asked for strict JSON with the keys ``intent``, ``date_local``,
    ``search_queries``, ``top_k_results`` and ``fetch_article_bodies``. Model output
    is untrusted, so it is normalized before being returned:

    * ``search_queries`` is always a non-empty list of non-blank strings; a bare
      string is wrapped, and ``[user_question]`` is used when nothing usable remains.
    * ``top_k_results`` is an int clamped to 3..8 (default 5) and
      ``fetch_article_bodies`` an int clamped to 0..3 (default 1), the ranges
      stated in the system prompt.
    * ``date_local`` falls back to the locally resolved date when missing or empty.

    Args:
        user_question: The user's question, forwarded verbatim to the planner.

    Returns:
        The plan dict. When the request fails or the reply cannot be parsed into
        a JSON object, a fallback plan with ``intent == "error"`` is returned.

    Raises:
        RuntimeError: If ``GROQ_API_KEY`` is not configured.
    """
    if not GROQ_API_KEY:
        raise RuntimeError("Missing GROQ_API_KEY.")
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {"Authorization": f"Bearer {GROQ_API_KEY}", "Content-Type": "application/json"}
    abs_date = resolve_relative_date(user_question)

    def _fallback() -> Dict:
        # Single definition of the plan used whenever the planner cannot be trusted.
        return {"intent": "error", "date_local": abs_date, "search_queries": [user_question],
                "top_k_results": 5, "fetch_article_bodies": 1}

    def _bounded_int(value, default: int, low: int, high: int) -> int:
        # Coerce to int, using ``default`` for null/non-numeric values, then clamp.
        try:
            number = int(value)
        except (TypeError, ValueError):
            return default
        return min(max(number, low), high)

    system = (
        "You are a planning agent for sports Q&A. "
        "Output STRICT JSON describing how to search before answering. "
        "Infer league/team if specified; otherwise keep generic. "
        "When the user says 'yesterday', use the absolute date provided. "
        "JSON keys: intent (str), date_local (YYYY-MM-DD), search_queries (string[]), top_k_results (int, 3-8), fetch_article_bodies (int, 0-3). "
        "Return JSON only."
    )
    user = f"USER_QUESTION: {user_question}\nABSOLUTE_DATE_LOCAL: {abs_date}"
    payload = {
        "model": GROQ_MODEL_PLAN,
        "messages": [{"role": "system", "content": system}, {"role": "user", "content": user}],
        "temperature": 0.1,
        "response_format": {"type": "json_object"},
        "stream": False
    }
    r = safe_post(url, headers=headers, json=payload)
    if not r:
        return _fallback()
    try:
        data = r.json()
        plan = json.loads(data["choices"][0]["message"]["content"])
    except Exception as e:
        print(f"[PLAN ERROR] could not parse planner output -> {e}")
        return _fallback()
    if not isinstance(plan, dict):
        return _fallback()

    if not plan.get("date_local"):
        plan["date_local"] = abs_date

    # A bare string would otherwise be iterated character by character downstream.
    raw_queries = plan.get("search_queries")
    if isinstance(raw_queries, str):
        raw_queries = [raw_queries]
    elif not isinstance(raw_queries, (list, tuple)):
        raw_queries = []
    queries = [q.strip() for q in raw_queries if isinstance(q, str) and q.strip()]
    plan["search_queries"] = queries or [user_question]

    plan["top_k_results"] = _bounded_int(plan.get("top_k_results"), 5, 3, 8)
    plan["fetch_article_bodies"] = _bounded_int(plan.get("fetch_article_bodies"), 1, 0, 3)
    return plan

## 6) Aggregate results and build context

In [None]:
def normalize_tavily(tav: Dict) -> List[Dict]:
    """Flatten a Tavily response into unique ``{title, url, snippet, score}`` records.

    Args:
        tav: Decoded Tavily response. Anything that is not a dict, or whose
            ``"results"`` value is missing, null or not a list, yields ``[]``.

    Returns:
        One record per distinct non-empty URL, in first-seen order. Entries
        that are not dicts or have no URL are skipped.
    """
    results = tav.get("results") if isinstance(tav, dict) else None
    if not isinstance(results, (list, tuple)):
        # Covers a missing key as well as an explicit ``"results": null``.
        results = []

    uniq, seen = [], set()
    for r in results:
        if not isinstance(r, dict):
            continue
        u = r.get("url")
        if not u or u in seen:
            continue
        seen.add(u)
        uniq.append({
            "title": r.get("title"),
            "url": u,
            "snippet": r.get("content") or "",
            "score": r.get("score")
        })
    return uniq

def build_context(items: list[dict], top_k: int = 5, bodies: dict[str,str] | None = None) -> str:
    bodies = bodies or {}
    blocks = []
    for r in items[:max(1, top_k)]:  # ensure at least 1 block if items exist
        title = r.get("title") or "(no title)"
        url   = r.get("url") or ""
        snip  = (r.get("snippet") or "").strip()
        body  = (bodies.get(url) or "").strip()
        text_part = body or snip or title
        part = f"{title}\n{text_part}\nSource: {url}" if url else f"{title}\n{text_part}"
        blocks.append(part)
    return "\n\n---\n\n".join(blocks)

## 7) Writer step (Groq): synthesize final answer with citations

In [7]:
def groq_write(user_question: str, context: str, temperature: float = 0.2) -> str:
    """Have the Groq writer model answer ``user_question`` using only ``context``.

    Args:
        user_question: The question to answer.
        context: Text placed under ``CONTEXT:`` in the user message.
        temperature: Sampling temperature sent with the request.

    Returns:
        The model's message content. If the request fails, the string
        ``"[ERR] Groq write failed."``; if the reply cannot be read, a string
        starting with ``"[ERR] "`` followed by the exception text.

    Raises:
        RuntimeError: If ``GROQ_API_KEY`` is not configured.
    """
    if not GROQ_API_KEY:
        raise RuntimeError("Missing GROQ_API_KEY.")

    system_prompt = (
        "You are a concise sports assistant. Answer ONLY with information from the provided context. "
        "Summarize clearly. Include a 'Sources' section listing URLs used. "
        "If context is insufficient, say so. Do NOT answer non-sports queries."
    )
    user_prompt = f"CONTEXT:\n{context}\n\nQUESTION:\n{user_question}"
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    response = safe_post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers={"Authorization": f"Bearer {GROQ_API_KEY}", "Content-Type": "application/json"},
        json={
            "model": GROQ_MODEL_WRITE,
            "messages": chat_messages,
            "temperature": temperature,
            "stream": False,
        },
    )
    if not response:
        return "[ERR] Groq write failed."
    try:
        return response.json()["choices"][0]["message"]["content"]
    except Exception as err:
        return f"[ERR] {err}"

## 8) End-to-end: plan → search → (optional bodies) → write

In [14]:
def groq_plan_verbose(user_question: str) -> dict:
    """Run ``groq_plan`` and echo its result under a "PLANNER OUTPUT" banner.

    Returns the plan unchanged so callers can use this in place of ``groq_plan``.
    """
    planner_result = groq_plan(user_question)
    section("PLANNER OUTPUT")
    pjson(planner_result)
    return planner_result

In [15]:
def ask_planned(user_question: str) -> dict:
    """Run the full pipeline: plan -> search -> (optional) fetch bodies -> write.

    The plan comes from an LLM, so its fields are re-validated here before use:
    ``search_queries`` is coerced to a list of non-blank strings (falling back to
    the question itself), and the two counters are coerced to ints with defaults
    of 5 results and 1 fetched body.

    Args:
        user_question: The user's question.

    Returns:
        A dict with ``plan`` (the planner output), ``citations`` (the results
        that were put into the context), ``answer`` (writer output) and
        ``context_preview`` (first 800 characters of the context).
    """
    def _as_int(value, default: int) -> int:
        # Null or non-numeric plan values must not crash the pipeline.
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    plan = groq_plan_verbose(user_question)

    # A bare string would be iterated character by character (one search per letter).
    raw_queries = plan.get("search_queries")
    if isinstance(raw_queries, str):
        raw_queries = [raw_queries]
    elif not isinstance(raw_queries, (list, tuple)):
        raw_queries = []
    queries = [q for q in raw_queries if isinstance(q, str) and q.strip()] or [user_question]

    # At least one result, so citations and context always agree
    # (build_context treats values below 1 as 1).
    top_k = max(1, _as_int(plan.get("top_k_results", 5), 5))
    n_bodies = max(0, _as_int(plan.get("fetch_article_bodies", 1), 1))

    results = []
    for q in queries:
        raw = tavily_search(q, max_results=max(5, top_k))
        results.extend(normalize_tavily(raw))

    # De-duplicate across queries by URL, then rank by score (missing score sorts as 0).
    seen, uniq = set(), []
    for r in results:
        u = r.get("url")
        if u and u not in seen:
            uniq.append(r)
            seen.add(u)
    uniq.sort(key=lambda x: (x.get("score") or 0), reverse=True)

    bodies = {}
    for r in uniq[:n_bodies]:
        u = r.get("url")
        bodies[u] = fetch_article_text(u)

    ctx = build_context(uniq, top_k=top_k, bodies=bodies)
    answer = groq_write(user_question, context=ctx)

    return {"plan": plan, "citations": uniq[:top_k], "answer": answer, "context_preview": ctx[:800]}

## 9) Try it

In [16]:
# Demo run of the full pipeline.
# NOTE: resolve_relative_date() only recognizes "yesterday"/"today"/"tomorrow", so
# "last week" is left for the planner model to interpret.
user_q = "Summarize last week's La Liga matches."  # fixed typo: was "last weeks's"
out = ask_planned(user_q)

print("PLAN:")
pjson(out["plan"])
print("\nANSWER:\n", out["answer"])
print("\nCITATIONS:")
for c in out["citations"]:
    print(f"- {c['title']} -> {c['url']}")
print("\n[Context preview]\n", out["context_preview"])

  now = datetime.utcnow()



{
  "intent": "summarize_matches",
  "date_local": "2025-10-07",
  "search_queries": [
    "La Liga matches last week",
    "La Liga week summary",
    "Spanish league matches October 2025"
  ],
  "top_k_results": 5,
  "fetch_article_bodies": 2
}
PLAN:
{
  "intent": "summarize_matches",
  "date_local": "2025-10-07",
  "search_queries": [
    "La Liga matches last week",
    "La Liga week summary",
    "Spanish league matches October 2025"
  ],
  "top_k_results": 5,
  "fetch_article_bodies": 2
}

ANSWER:
 Insufficient context to provide a summary of last week's La Liga matches. The provided context includes schedules and information for upcoming matches, but does not provide results or summaries of previous matches.

Sources:
- https://seatpick.com/spanish-la-liga-tickets
- https://www.fancode.com/football/tour/spanish-la-liga-season-2025-2026-18801700/matches
- https://www.facebook.com/groups/816119582575353/posts/1980570249463608/
- https://www.espn.com/soccer/story/_/id/46436132/oct

In [17]:
# ---- setup_tavily.py (or a notebook cell) ----
import os, json, requests
from textwrap import shorten
from dotenv import load_dotenv

# Load variables from a .env file (if any) before reading the key below.
load_dotenv()

# Read TAVIL_API_KEY first; fall back to TAVILY_API_KEY when it is unset or empty.
# NOTE(review): this re-assigns the TAVIL_API_KEY already defined in the first setup cell.
TAVIL_API_KEY = os.getenv("TAVIL_API_KEY", "") or os.getenv("TAVILY_API_KEY", "")

def pjson(obj):
    """Print ``obj`` as JSON indented by two spaces, leaving non-ASCII characters unescaped."""
    rendered = json.dumps(obj, ensure_ascii=False, indent=2)
    print(rendered)

def safe_post(url, **kwargs):
    """POST to ``url`` and return the response, or ``None`` if anything goes wrong.

    Args:
        url: Target URL.
        **kwargs: Forwarded to ``requests.post`` (e.g. ``headers``, ``json``).
            ``timeout`` defaults to 45 seconds when the caller does not supply one.

    Returns:
        The response object when the request completed and ``raise_for_status()``
        did not raise; otherwise ``None`` after printing the error (and the
        response body, when the exception carries one).
    """
    # Only fill in the default. Passing ``timeout=45`` alongside ``**kwargs`` made
    # every call with a caller-supplied timeout fail with a duplicate-keyword
    # TypeError, which the handler below then swallowed.
    kwargs.setdefault("timeout", 45)
    try:
        r = requests.post(url, **kwargs)
        r.raise_for_status()
        return r
    except Exception as e:
        # Deliberately best-effort: callers treat ``None`` as "no data".
        print(f"[POST ERROR] {url} -> {e}")
        resp = getattr(e, "response", None)
        if resp is not None:
            print(resp.text)
        return None

def t(s, n=140):
    """Flatten ``s`` onto one line and shorten it to at most ``n`` characters for logs."""
    flattened = (s or "").replace("\n", " ")
    return shorten(flattened, width=n, placeholder="…")

# Report only whether the key is set (bool), never the key value itself.
print("TAVIL_API_KEY present? ", bool(TAVIL_API_KEY))

TAVIL_API_KEY present?  True


In [18]:
# ---- raw_tavily_search.py ----
def tavily_search_raw(query: str, max_results: int = 5) -> dict:
    """Run one Tavily search and return the decoded JSON response as-is.

    Args:
        query: Search query text.
        max_results: Value sent as ``max_results`` in the request payload.

    Returns:
        The response decoded into a dict, or ``{}`` when the request failed,
        the body was not valid JSON, or the decoded JSON was not an object.

    Raises:
        RuntimeError: If no Tavily API key is configured.
    """
    if not TAVIL_API_KEY:
        raise RuntimeError("Missing TAVIL_API_KEY (or TAVILY_API_KEY).")
    url = "https://api.tavily.com/search"
    headers = {"Authorization": f"Bearer {TAVIL_API_KEY}"}
    payload = {"query": query, "max_results": max_results}
    r = safe_post(url, headers=headers, json=payload)
    if not r:
        return {}
    try:
        data = r.json()
    except ValueError as e:
        # A non-JSON body should degrade to "no results", like every other failure here.
        print(f"[TAVILY ERROR] non-JSON response for {query!r} -> {e}")
        return {}
    # Callers use ``.get`` on the result, so never hand back a list or scalar.
    return data if isinstance(data, dict) else {}

# Example:
# raw = tavily_search_raw("today's La Liga matches", max_results=6)
# pjson(raw)

In [19]:
# ---- normalize_tavily.py ----
from typing import List, Dict

def normalize_tavily(raw: dict) -> List[Dict]:
    """Flatten a Tavily response into ``{title, url, snippet, score}`` records.

    Differences from a plain result list:

    * When the response has a non-empty ``"answer"``, it is emitted first as a
      pseudo-item titled "Tavily Answer" with an empty URL and score 1.0.
    * A result's snippet is its ``content``, else ``raw_content``, else its title.
    * Records are de-duplicated by URL; records without a URL are always kept.

    Args:
        raw: Decoded Tavily response. Anything that is not a dict yields ``[]``;
            a missing, null or non-list ``"results"`` is treated as empty.

    Returns:
        The normalized records in first-seen order.
    """
    items = []
    if isinstance(raw, dict):
        # Keep Tavily's direct 'answer' if present (as a pseudo-item)
        if raw.get("answer"):
            items.append({
                "title": "Tavily Answer",
                "url": "",
                "snippet": raw["answer"],
                "score": 1.0
            })
        results = raw.get("results")
        if not isinstance(results, (list, tuple)):
            # Covers a missing key as well as an explicit ``"results": null``.
            results = []
        for r in results:
            if not isinstance(r, dict):
                continue
            snippet = r.get("content") or r.get("raw_content") or (r.get("title") or "")
            items.append({
                "title": r.get("title"),
                "url": r.get("url"),
                "snippet": snippet,
                "score": r.get("score")
            })

    # De-dup by URL; URL-less records (such as the answer pseudo-item) are never merged.
    seen, uniq = set(), []
    for it in items:
        u = it.get("url")
        if not u:
            uniq.append(it)
        elif u not in seen:
            uniq.append(it)
            seen.add(u)
    return uniq

In [20]:
# ---- probe_query.py ----
def probe_query(query: str, max_results: int = 6, show: int = 5):
    """Run one Tavily query and print a short diagnostic report of what came back.

    Prints the query, the number of raw results, Tavily's direct answer (when
    present, truncated to 200 characters), the number of normalized items, and
    a preview of the first ``show`` normalized items.

    Args:
        query: Search query text.
        max_results: Forwarded to ``tavily_search_raw``.
        show: How many normalized items to preview (default 5, the previously
            hard-coded value); values below 0 are treated as 0.

    Returns:
        The full list of normalized items, regardless of ``show``.
    """
    print("\n============ QUERY ============")
    print(query)
    raw = tavily_search_raw(query, max_results=max_results)
    # Normalize once so every access below is safe; previously only the
    # ``results`` lookup was guarded and a ``None`` response crashed on ``answer``.
    if not isinstance(raw, dict):
        raw = {}

    # Raw overview
    results = raw.get("results") or []
    print(f"\nraw.results: {len(results)}")
    if raw.get("answer"):
        print("tavily.answer:", t(raw["answer"], 200))

    # Normalized view
    items = normalize_tavily(raw)
    print(f"normalized items: {len(items)}")

    # Show the first ``show`` items
    for i, it in enumerate(items[:max(0, show)], 1):
        print(f"\n[{i}] {t(it.get('title') or '(no title)')}")
        print("    ", t(it.get("snippet") or "", 200))
        print("    ", it.get("url"))
    return items

# Example:
# _ = probe_query("yesterday's La Liga results", max_results=8)

In [21]:
# ---- probe_many.py ----
# Sample queries for a side-by-side look at what Tavily returns.
# NOTE(review): the third query embeds a fixed date (2025-10-13); update it when re-running.
queries = [
    "yesterday's La Liga matches",
    "Premier League results yesterday BBC Sport",
    "Serie A results 2025-10-13 Sky Sports"
]

# Each call prints its own report; the returned item lists are discarded here.
for q in queries:
    probe_query(q, max_results=8)


yesterday's La Liga matches

raw.results: 8
normalized items: 8

[1] LaLiga Results Yesterday (Football Spain Matches) - SportyTrader
     There were no LaLiga Football matches yesterday. Here were the matches of the past days. LaLiga. Spain - LaLiga. End · Osasuna. Getafe. 2 1. End · Real Oviedo.
     https://www.sportytrader.com/en/results/football/spain/laliga-108/yesterday/

[2] Football Matches Yesterday | OneFootball
     Yesterday's Matches. Monday, 13 October 2025. FA Cup Logo: FA Cup. FA Cup ... La Liga 2. La Liga 2. Matchday 9. Icon: Córdoba. Córdoba1. Icon: Cultural Leonesa.
     https://onefootball.com/en/matches/yesterday

[3] Spanish La Liga - Scores & Fixtures - Football - BBC Sport
     Sevilla versus Mallorca kick off 13:00. SevillaSevilla ; Barcelona versus Girona kick off 15:15. BarcelonaBarcelona ; Villarreal versus Real Betis kick off 17:30.
     https://www.bbc.com/sport/football/spanish-la-liga/scores-fixtures

[4] Spanish La Liga Scores & Fixtures - Sky Sports
