In [12]:
from langgraph.graph import StateGraph, END
from typing import TypedDict, Dict, Any, List, Optional
from IPython.display import display_markdown
from pathlib import Path
import os, re, json, math, time
from dotenv import load_dotenv
from tavily import TavilyClient

# ---------- setup ----------
load_dotenv()
TAVILY = TavilyClient(os.getenv("TAVILY_API_KEY"))
_USE_TAVILY_ANSWER_FALLBACK = False
_MIN_SUMMARY_CHARS = 300

# ---------- text utils ----------
_STOP = {"the","a","an","and","or","if","with","to","of","in","on","for","by","from","as",
         "is","are","was","were","be","been","being","it","its","this","that","at","about",
         "we","you","they","he","she","their","our","us"}
_SENT_SPLIT = re.compile(r'(?<=[\.\?\!])\s+')

def _normalize(t:str)->str: return re.sub(r"\s+"," ",t or "").strip()
def _tok(t:str)->List[str]: return re.findall(r"[A-Za-z0-9']+",t.lower())

def _score_sents(sents:List[str])->Dict[int,float]:
    freq={}
    for s in sents:
        for w in _tok(s):
            if w in _STOP: continue
            freq[w]=freq.get(w,0)+1
    if not freq: return {i:0 for i in range(len(sents))}
    mx=max(freq.values())
    tf={w:(0.5+0.5*freq[w]/mx) for w in freq}
    out={}
    for i,s in enumerate(sents):
        sc=sum(tf.get(w,0) for w in _tok(s))
        L=len(s)
        out[i]=sc*(1.0 if 60<=L<=240 else 0.8)
    return out

def _mmr(sents: List[str], scores: Dict[int, float], k: int = 6, div: float = 0.7) -> List[int]:
    selected: List[int] = []
    candidates = set(range(len(sents)))

    def sim(a: str, b: str) -> float:
        A, B = set(_tok(a)), set(_tok(b))
        if not A or not B:
            return 0.0
        return len(A & B) / math.sqrt(len(A) * len(B))

    while candidates and len(selected) < k:
        best_i = None
        best_val = float("-inf")
        for i in candidates:
            relevance = scores.get(i, 0.0)
            redundancy = max((sim(sents[i], sents[j]) for j in selected), default=0.0)
            val = div * relevance - (1.0 - div) * redundancy
            if val > best_val:
                best_val = val
                best_i = i

        if best_i is None:          # safety: shouldn't happen, but avoid KeyError
            break
        selected.append(best_i)
        candidates.remove(best_i)

    return sorted(selected)


In [13]:
class AgentState(TypedDict, total=False):
    question: str
    raw_results: Dict[str,Any]
    articles: List[Dict[str,Any]]
    answer: str
    error: str


In [14]:
def search_web(state:AgentState)->AgentState:
    q=(state.get("question") or "").strip()
    if not q: return {"error":"Empty question."}
    try:
        res=TAVILY.search(
            query=q,max_results=8,
            include_answer=False,
            include_raw_content=True,
            search_depth="advanced"
        ) or {}
    except Exception as e:
        return {"error":f"Tavily search failed: {e}"}

    results=res.get("results") or []
    urls=[r["url"] for r in results if r.get("url") and not r.get("content")]
    extra={}
    if urls:
        try:
            ext=TAVILY.extract(urls=urls) or []
            extra={e.get("url"):(e.get("content") or "") for e in ext}
        except Exception: pass

    arts=[]
    for i,r in enumerate(results,1):
        t=r.get("title") or "Untitled"
        u=r.get("url") or ""
        c=_normalize(r.get("content") or extra.get(u) or r.get("snippet") or "")
        if not c: continue
        if len(c)>12000: c=c[:12000]+" ..."
        arts.append({"source_id":i,"title":t,"url":u,"content":c})

    if not arts: return {"error":"No extractable content from references."}
    return {"raw_results":res,"articles":arts}


In [15]:
import pandas as pd
def persist_articles(arts, base="data", stem="tavily_docs"):
    Path(base).mkdir(parents=True,exist_ok=True)
    jp=Path(base)/f"{stem}.json"
    xp=Path(base)/f"{stem}.xlsx"
    with open(jp,"w",encoding="utf-8") as f: json.dump(arts,f,indent=2,ensure_ascii=False)
    pd.DataFrame(arts)[["source_id","title","url","content"]].to_excel(xp,index=False)
    return str(jp),str(xp)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def _chunks(t,size=600,ov=120):
    t=re.sub(r"\s+"," ",t).strip();out=[];i=0
    while i<len(t):
        out.append(t[i:i+size]);i+=max(1,size-ov)
    return out

def build_index(arts):
    passages=[]
    for a in arts:
        for ch in _chunks(a["content"]):
            if len(ch)<200: continue
            passages.append({"text":ch,"source_id":a["source_id"],
                             "title":a["title"],"url":a["url"]})
    vec=TfidfVectorizer(ngram_range=(1,2),max_df=0.9,min_df=2,stop_words="english")
    mat=vec.fit_transform([p["text"] for p in passages])
    return vec,mat,passages

def retrieve_passages(q,vec,mat,passages,k=8):
    qv=vec.transform([q]);sims=cosine_similarity(qv,mat)[0]
    idx=sims.argsort()[::-1][:k]
    return [(passages[i],float(sims[i])) for i in idx]


In [17]:
def generate_answer(state:AgentState)->AgentState:
    if state.get("error"): return {"answer":f"**Error:** {state['error']}"}
    arts=state.get("articles") or []
    if not arts:
        res=state.get("raw_results") or {}
        rs=res.get("results") or []
        for i,r in enumerate(rs,1):
            if r.get("snippet"): arts.append(
                {"source_id":i,"title":r.get("title"),"url":r.get("url"),
                 "content":r.get("snippet")})
        if not arts: return {"answer":"No sources to process."}

    j,x=persist_articles(arts)
    try: vec,mat,passages=build_index(arts)
    except Exception as e: return {"answer":f"Index error: {e}"}

    hits=retrieve_passages(state.get("question",""),vec,mat,passages,k=10)

    pool=[]
    for p,_ in hits:
        for s in _SENT_SPLIT.split(_normalize(p["text"])):
            if 40<=len(s)<=400:
                pool.append((p["source_id"],s,p["title"],p["url"]))
    if not pool: return {"answer":"No usable sentences in retrieved chunks."}

    sents=[x[1] for x in pool]
    sc=_score_sents(sents); pick=_mmr(sents,sc,6,0.7)
    used={};lines=[]
    for i in pick:
        sid,s,ttl,url=pool[i];used[sid]=(ttl,url)
        lines.append(f"{s} [{sid}]")
    summ=" ".join(lines)

    srcs={a["source_id"]:(a["title"],a["url"]) for a in arts}
    src_txt="\n".join(f"{sid}. [{t}]({u})" for sid,(t,u) in srcs.items())
    ans=f"{summ}\n\n**Sources**\n{src_txt}\n\n_JSON_: `{j}` , _Excel_: `{x}`"
    return {"answer":ans}

In [20]:
def create_agent():
    g=StateGraph(AgentState)
    g.add_node("search",search_web)
    g.add_node("result",generate_answer)
    g.set_entry_point("search")
    g.add_edge("search","result")
    g.add_edge("result",END)
    return g.compile()

agent=create_agent()

# example
out=agent.invoke({"question":"tell me about the doctor arrested with explosive in faridabad"})
display_markdown(out["answer"],raw=True)

Shaheen is reportedly part of Al-Falah University and closely associated with Kashmiri doctor Muzammil Ganaie, alias Musaib, who was arrested after 2,900 kg of explosives and inflammable material were recovered from his two rented rooms in Faridabad. [3] [...] A Lucknow-based woman doctor, arrested in connection with a massive explosives haul in Faridabad near Delhi, was tasked with establishing the women's wing of the Pakistan-based terror group Jaish-e-Mohammed (JeM) in India, according to Delhi Police sources. [3] A Lucknow-based woman doctor, arrested in connection with a massive explosives haul in Faridabad near Delhi, was tasked with establishing the women's wing of the Pakistan-based terror group Jaish-e-Mohammed (JeM) in India, according to Delhi Police sources. [5] ### Description 21913 views Posted: 10 Nov 2025 A joint operation by the Jammu and Kashmir Police, the Intelligence Bureau, and the Faridabad Police has foiled what could have been a major terror attack near Delhi. [6] [...] Lucknow doctor Shaheena Shahid led Jaish-e-Mohammed's women's wing in India, according to sources She was arrested after the Faridabad explosives haul, and an assault rifle was found in her car Shaheena is linked to Al-Falah University and is an associate of the arrested Kashmiri doctor Muzammil Ganaie Did our AI summary help? [3] Shakeel and another Kashmiri doctor, Adeel Ahmad Rather, were among eight people who were arrested for allegedly being part of a "white-collar terror" module involving the Jaish-e-Mohammed and Ansar Ghazwat-ul-Hind and spanning Jammu and Kashmir, Haryana, and Uttar Pradesh. [1]

**Sources**
1. [2 Arrested, 50 Kg More Explosives Recovered In Faridabad ... - NDTV](https://www.ndtv.com/video/2-arrested-50-kg-more-explosives-recovered-in-faridabad-day-after-delhi-blast-1020620)
2. [Terror plot: Med prof's arrest leads to 2900kg explosives haul in ...](https://timesofindia.indiatimes.com/india/terror-plot-med-profs-arrest-leads-to-2900kg-explosives-haul-in-fbd/articleshow/125237164.cms)
3. [Arrested UP Doctor Was Tasked With Setting Up Jaish's ... - NDTV](https://www.ndtv.com/india-news/dr-shaheena-shahid-jamaat-ul-mominat-india-head-lucknow-doctor-arrested-in-faridabad-terror-module-was-head-of-jaish-e-mohammed-jem-women-wing-in-indi-9613556)
4. [J&K terror module busted: Two Kashmiri doctors linked to Jaish,](https://indianexpress.com/article/india/explosives-in-faridabad-doctors-house-transnational-terror-module-busted-j-k-police-probe-10356894/)
5. [Arrested UP Doctor Was Tasked With Setting Up Jaish's Women ...](https://www.ndtv.com/video/arrested-up-doctor-was-tasked-with-setting-up-jaish-s-women-wing-in-india-1020649)
6. [Faridabad terror plot foiled 350 kg of explosives and arms recovered ...](https://www.youtube.com/watch?v=owNkemS_sEY)
7. [J&K Doctor With Ammonium Nitrate Arrested Near Delhi](https://www.ndtv.com/india-news/350kg-ammonium-nitrate-explosive-found-with-j-k-doctor-in-faridabad-near-delhi-what-is-ammonium-nitrate-is-ammonium-nitrate-dangerous-9606804)
8. [Doctors of Doom Under Scanner: Investigating Links to Delhi Blast](https://www.instagram.com/p/DQ6YqoYExmy/)

_JSON_: `data\tavily_docs.json` , _Excel_: `data\tavily_docs.xlsx`