In [4]:
import os
import json
import re
import pandas as pd
from typing import List, Dict, Any, Optional

from dotenv import load_dotenv
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, PointStruct
from rank_bm25 import BM25Okapi

# ────────────────────────────────────────────────────────────────
# Configure Gemini (Google Generative AI)
# ────────────────────────────────────────────────────────────────
load_dotenv()
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))


# ────────────────────────────────────────────────────────────────
# 1) Structured Vector Search Tool (normal experts)
# ────────────────────────────────────────────────────────────────
class StructuredVectorSearchTool:
    def __init__(
        self,
        collection_name: str = "norm_experts",
        qdrant_url: str = "http://localhost:6333",
        embedding_model: str = "all-MiniLM-L6-v2"
    ):
        self.model = SentenceTransformer(embedding_model)
        self.client = QdrantClient(url=qdrant_url)
        self.collection_name = collection_name

        if self.client.collection_exists(collection_name):
            self.client.delete_collection(collection_name)
        self.client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(
                size=self.model.get_sentence_embedding_dimension(),
                distance=Distance.COSINE
            )
        )

    def _aggregate_text(self, doc: dict) -> str:
        parts: List[str] = []
        # bio, headline
        for fld in ("bio", "headline"):
            v = doc.get(fld, "")
            if isinstance(v, str) and v.strip():
                parts.append(v.strip())

        # geography details
        geo = doc.get("geography_details", [])
        if isinstance(geo, str):
            try:
                geo = json.loads(geo)
            except json.JSONDecodeError:
                geo = []
        if isinstance(geo, list):
            names = [g.get("name","") for g in geo if isinstance(g, dict)]
            if names:
                parts.append(", ".join(names))

        # expertise codes
        exp = doc.get("expertise_in_these_geographies", "")
        if isinstance(exp, str) and exp.strip():
            parts.append(exp.strip())

        # work experiences
        raw = doc.get("work_experiences", [])
        if isinstance(raw, str):
            try:
                raw = json.loads(raw)
            except json.JSONDecodeError:
                raw = []
        if isinstance(raw, list):
            for we in raw:
                if not isinstance(we, dict):
                    continue
                t = (we.get("designation") or "").strip()
                d = (we.get("job_description") or "").strip()
                if t or d:
                    parts.append(f"{t}: {d}")

        return "\n".join(parts)

    def add_documents(self, docs: pd.DataFrame | List[dict]):
        if isinstance(docs, pd.DataFrame):
            docs = docs.to_dict(orient="records")
        texts = [self._aggregate_text(d) for d in docs]
        embs = self.model.encode(texts, show_progress_bar=True)

        points: List[PointStruct] = []
        for d, v in zip(docs, embs):
            rid = int(d.get("id", 0))
            points.append(PointStruct(
                id=rid,
                vector=v.tolist(),
                payload=d
            ))
        self.client.upsert(
            collection_name=self.collection_name,
            points=points,
            wait=True
        )

    def search(self, query: str, top_k: int = 5) -> List[Dict[str,Any]]:
        qv = self.model.encode([query])[0].tolist()
        hits = self.client.search(
            collection_name=self.collection_name,
            query_vector=qv,
            limit=top_k
        )
        results: List[Dict[str,Any]] = []
        for h in hits:
            p = h.payload
            results.append({
                "expert_id":    int(p.get("id", 0)),
                "expert_name":  p.get("expert_name", "") or p.get("name",""),
                "bio":          p.get("bio",""),
                "headline":     p.get("headline",""),
                "work_summary": "",  # no work_summary here
                "_score":       h.score
            })
        return results


# ────────────────────────────────────────────────────────────────
# 2) Structured Keyword Search Tool (normal experts)
# ────────────────────────────────────────────────────────────────
class StructuredKeywordSearchTool:
    def __init__(self, k1: float = 1.5, b: float = 0.75):
        self.k1 = k1
        self.b  = b
        self.docs: List[dict] = []
        self.bm25: Optional[BM25Okapi] = None

    def _tokenize(self, text: str) -> List[str]:
        return re.findall(r"\w+", text.lower())

    def _aggregate_text(self, doc: dict) -> str:
        return StructuredVectorSearchTool()._aggregate_text(doc)

    def add_documents(self, docs: pd.DataFrame | List[dict]):
        if isinstance(docs, pd.DataFrame):
            docs = docs.to_dict(orient="records")
        self.docs = docs
        corpus = [self._aggregate_text(d) for d in docs]
        toks = [self._tokenize(c) for c in corpus]
        self.bm25 = BM25Okapi(toks, k1=self.k1, b=self.b)

    def search(self, query: str, top_k: int = 5) -> List[Dict[str,Any]]:
        if self.bm25 is None:
            raise RuntimeError("Index not built")
        qt = self._tokenize(query)
        scores = self.bm25.get_scores(qt)
        idxs = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
        results: List[Dict[str,Any]] = []
        for i in idxs:
            d = self.docs[i]
            results.append({
                "expert_id":    int(d.get("id", 0)),
                "expert_name":  d.get("expert_name","") or d.get("name",""),
                "bio":          d.get("bio",""),
                "headline":     d.get("headline",""),
                "work_summary": "",
                "_score":       float(scores[i])
            })
        return results


# ────────────────────────────────────────────────────────────────
# 3) Extract project‐mapped Q&A docs
# ────────────────────────────────────────────────────────────────
def extract_agenda_docs(df: pd.DataFrame) -> List[dict]:
    out: List[dict] = []
    for _, row in df.iterrows():
        try:
            eid = int(row["expert_id"])
        except (KeyError, ValueError):
            continue
        bio     = row.get("expert_bio","") or ""
        head    = row.get("expert_headline","") or ""
        summary = row.get("expert_work_summary","") or ""
        raw     = row.get("project_agenda_responses","[]")
        try:
            arr = json.loads(raw)
        except json.JSONDecodeError:
            continue
        for idx, qa in enumerate(arr):
            q = (qa.get("question") or "").strip()
            a = (qa.get("answer")   or "").strip()
            txt = f"{q} {a}".strip()
            if not txt:
                continue
            doc_id = eid*1000 + idx
            out.append({
                "_id":                 doc_id,
                "expert_id":           eid,
                "expert_name":         row.get("expert_name","") or "",
                "expert_bio":          bio,
                "expert_headline":     head,
                "expert_work_summary": summary,
                "text":                txt
            })
    return out


# ────────────────────────────────────────────────────────────────
# 4) Agenda Vector Search Tool (project‐mapped experts)
# ────────────────────────────────────────────────────────────────
class AgendaVectorSearchTool:
    def __init__(
        self,
        collection_name: str = "agenda_responses",
        qdrant_url: str = "http://localhost:6333",
        embedding_model: str = "all-MiniLM-L6-v2"
    ):
        self.model = SentenceTransformer(embedding_model)
        self.client = QdrantClient(url=qdrant_url)
        self.collection_name = collection_name

        if self.client.collection_exists(collection_name):
            self.client.delete_collection(collection_name)
        self.client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(
                size=self.model.get_sentence_embedding_dimension(),
                distance=Distance.COSINE
            )
        )

    def add_documents(self, docs: List[dict]):
        texts = [d["text"] for d in docs]
        embs  = self.model.encode(texts, show_progress_bar=True)
        points: List[PointStruct] = []
        for d, emb in zip(docs, embs):
            points.append(PointStruct(
                id=d["_id"],
                vector=emb.tolist(),
                payload={
                    "expert_id":    d["expert_id"],
                    "expert_name":  d["expert_name"],
                    "bio":          d["expert_bio"],
                    "headline":     d["expert_headline"],
                    "work_summary": d["expert_work_summary"]
                }
            ))
        self.client.upsert(
            collection_name=self.collection_name,
            points=points,
            wait=True
        )

    def search(self, query: str, top_k: int = 5) -> List[Dict[str,Any]]:
        qv = self.model.encode([query])[0].tolist()
        hits = self.client.search(
            collection_name=self.collection_name,
            query_vector=qv,
            limit=top_k
        )
        return [
            {
                "expert_id":    h.payload["expert_id"],
                "expert_name":  h.payload["expert_name"],
                "bio":          h.payload["bio"],
                "headline":     h.payload["headline"],
                "work_summary": h.payload["work_summary"],
                "_score":       h.score
            }
            for h in hits
        ]


# ────────────────────────────────────────────────────────────────
# 5) Agenda Keyword Search Tool (project‐mapped experts)
# ────────────────────────────────────────────────────────────────
class AgendaKeywordSearchTool:
    def __init__(self, k1: float = 1.5, b: float = 0.75):
        self.k1 = k1
        self.b  = b
        self.docs: List[dict] = []
        self.bm25: Optional[BM25Okapi] = None

    def _tokenize(self, text: str) -> List[str]:
        return re.findall(r"\w+", text.lower())

    def add_documents(self, docs: List[dict]):
        self.docs = docs
        corpus = [d["text"] for d in docs]
        toks = [self._tokenize(c) for c in corpus]
        self.bm25 = BM25Okapi(toks, k1=self.k1, b=self.b)

    def search(self, query: str, top_k: int = 5) -> List[Dict[str,Any]]:
        if not self.bm25:
            raise RuntimeError("Index not built")
        qt = self._tokenize(query)
        scores = self.bm25.get_scores(qt)
        idxs = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
        return [
            {
                "expert_id":    self.docs[i]["expert_id"],
                "expert_name":  self.docs[i]["expert_name"],
                "bio":          self.docs[i]["expert_bio"],
                "headline":     self.docs[i]["expert_headline"],
                "work_summary": self.docs[i]["expert_work_summary"],
                "_score":       float(scores[i])
            }
            for i in idxs
        ]


# ────────────────────────────────────────────────────────────────
# 6) Fusion & Reranking (handles mixed id/expert_id)
# ────────────────────────────────────────────────────────────────
class AgendaResultsReranker:
    def __init__(self, alpha: float = 0.5):
        self.alpha = alpha

    def _get_eid(self, hit: Dict[str,Any]) -> int:
        if "expert_id" in hit:
            return hit["expert_id"]
        if "id" in hit:
            return hit["id"]
        raise KeyError(f"No id/expert_id in {hit}")

    def rerank_simple(
        self,
        vec_hits: List[Dict[str,Any]],
        kw_hits:  List[Dict[str,Any]],
        top_k:    int = 5
    ) -> List[Dict[str,Any]]:
        merged: Dict[int,Dict[str,Any]] = {}
        for h in vec_hits:
            eid = self._get_eid(h)
            rec = merged.setdefault(eid, {**h, "vec_score": h.get("_score",0), "kw_score":0})
            rec["vec_score"] = max(rec["vec_score"], h.get("_score",0))
        for h in kw_hits:
            eid = self._get_eid(h)
            rec = merged.setdefault(eid, {**h, "vec_score":0, "kw_score":h.get("_score",0)})
            rec["kw_score"] = max(rec["kw_score"], h.get("_score",0))
        records = list(merged.values())
        if not records:
            return []
        max_v = max(r["vec_score"] for r in records) or 1.0
        max_k = max(r["kw_score"]  for r in records) or 1.0
        for r in records:
            r["vec_norm"]    = r["vec_score"] / max_v
            r["kw_norm"]     = r["kw_score"]  / max_k
            r["fused_score"] = self.alpha * r["vec_norm"] + (1-self.alpha)*r["kw_norm"]
        records.sort(key=lambda r: r["fused_score"], reverse=True)
        return records[:top_k]


# ────────────────────────────────────────────────────────────────
# 7) Gemini Query Refiner
# ────────────────────────────────────────────────────────────────
class GeminiQueryRefiner:
    def __init__(self, model_name: str="gemini-1.5-flash", n_variants: int=3):
        self.model_name = model_name
        self.n_variants = n_variants

    def generate_variants(self, query: str, context: Optional[str]=None) -> List[str]:
        prompt = (
            (context + "\n\n") if context else ""
        ) + (
            f"Rewrite the user’s search query into {self.n_variants} concise paraphrases.\n"
            "Return ONLY a JSON array of strings.\n\n"
            f"User query: \"{query}\""
        )
        model = genai.GenerativeModel(self.model_name)
        resp = model.generate_content(prompt)
        text = resp.text.strip()
        try:
            arr = json.loads(text)
            return [v for v in arr if isinstance(v, str)][:self.n_variants]
        except json.JSONDecodeError:
            lines = [l.strip() for l in text.splitlines() if l.strip()]
            return lines[:self.n_variants]
    


# ────────────────────────────────────────────────────────────────
# 8) Human‐Like Search Agent
# ────────────────────────────────────────────────────────────────
class HumanLikeSearchAgent:
    def __init__(
        self,
        normal_vec: StructuredVectorSearchTool,
        normal_kw:  StructuredKeywordSearchTool,
        proj_vec:   AgendaVectorSearchTool,
        proj_kw:    AgendaKeywordSearchTool,
        reranker:   AgendaResultsReranker,
        refiner:    GeminiQueryRefiner,
        initial_k:  int = 10,
        final_n:    int = 5,
        quality_threshold: float = 0.5
    ):
        self.normal_vec        = normal_vec
        self.normal_kw         = normal_kw
        self.proj_vec          = proj_vec
        self.proj_kw           = proj_kw
        self.reranker          = reranker
        self.refiner           = refiner
        self.initial_k         = initial_k
        self.final_n           = final_n
        self.quality_threshold = quality_threshold
        self.history: List[Dict[str,Any]] = []

    def _normalize_ids(self, hits: List[Dict[str,Any]]) -> List[Dict[str,Any]]:
        for h in hits:
            if "id" in h and "expert_id" not in h:
                h["expert_id"] = h.pop("id")
        return hits

    def _plan(self, q: str) -> Dict[str,Any]:
        prompt = f"""
You are an expert‐search assistant. The user says: "{q}".
Reply ONLY with a JSON object:
  • tools: array from ["normal_vec","normal_kw","proj_vec","proj_kw"]
  • refine: true/false
  • filters: e.g. {{"experience":5}} or {{}}
  • clarify: null or a question
Example:
{{"tools":["normal_vec","proj_vec"],"refine":false,"filters":{{}},"clarify":null}}
"""
        model = genai.GenerativeModel(self.refiner.model_name)
        resp = model.generate_content(prompt)
        txt = resp.text.strip()
        try:
            return json.loads(txt)
        except json.JSONDecodeError:
            return {
                "tools": ["normal_vec","normal_kw","proj_vec","proj_kw"],
                "refine": False,
                "filters": {},
                "clarify": None
            }


    def search(self, query: str) -> List[Dict[str,Any]]:
        plan = self._plan(query)
        # clarify
        if plan.get("clarify"):
            print("Clarify:", plan["clarify"])
            extra = input("Your answer: ").strip()
            query += " " + extra
            plan = self._plan(query)
        # refine
        queries = [query]
        if plan.get("refine"):
            context = "\n".join(f"Q: {h['query']}" for h in self.history[-3:])
            queries = self.refiner.generate_variants(query, context=context)
        # retrieve
        vec_hits, kw_hits = [], []
        for q in queries:
            if "normal_vec" in plan["tools"]:
                vec_hits += self.normal_vec.search(q, self.initial_k)
            if "normal_kw" in plan["tools"]:
                kw_hits  += self.normal_kw.search(q, self.initial_k)
            if "proj_vec" in plan["tools"]:
                vec_hits += self.proj_vec.search(q, self.initial_k)
            if "proj_kw" in plan["tools"]:
                kw_hits  += self.proj_kw.search(q, self.initial_k)
        # normalize ids for normal hits
        vec_hits = self._normalize_ids(vec_hits)
        kw_hits  = self._normalize_ids(kw_hits)
        # apply filters
        filt = plan.get("filters", {})
        if "experience" in filt:
            yrs = int(filt["experience"])
            def has(h):
                m = re.search(r"(\d+)\+? years", h["headline"].lower())
                return m and int(m.group(1)) >= yrs
            vec_hits = [h for h in vec_hits if has(h)]
            kw_hits  = [h for h in kw_hits  if has(h)]
        # fuse & rerank
        merged = self.reranker.rerank_simple(vec_hits, kw_hits, top_k=self.initial_k)
        top_n = merged[: self.final_n]
        self.history.append({"query": query, "plan": plan, "results": top_n})
        return top_n



if __name__ == "__main__":
    # build and index tools
    norm_vec = StructuredVectorSearchTool()
    norm_kw  = StructuredKeywordSearchTool()
    df_norm = pd.read_csv("experts_202505291522.csv", encoding="utf8")
    norm_vec.add_documents(df_norm)
    norm_kw.add_documents(df_norm)

    df_proj = pd.read_csv("project_expert_data.csv", encoding="latin1")
    docs = extract_agenda_docs(df_proj)
    proj_vec = AgendaVectorSearchTool()
    proj_kw  = AgendaKeywordSearchTool()
    proj_vec.add_documents(docs)
    proj_kw.add_documents(docs)

    reranker = AgendaResultsReranker(alpha=0.6)
    refiner  = GeminiQueryRefiner(n_variants=3)

    agent = HumanLikeSearchAgent(
        norm_vec, norm_kw, proj_vec, proj_kw,
        reranker, refiner,
        initial_k=10, final_n=5, quality_threshold=0.5
    )

    # REPL
    while True:
        q = input("\nYour query (or 'exit'): ").strip()
        if q.lower() in ("exit","quit"):
            break
        res = agent.search(q)
        if not res:
            print("→ No matches found.")
            continue
        print("\nTop Experts:")
        for e in res:
            print(f"• [{e['expert_id']}] {e['expert_name']}  score={e['fused_score']:.3f}")
            print("   Headline:", e["headline"])
            print("   Bio snippet:", e["bio"][:80], "…")
            print("   Work summary:", e["work_summary"][:80], "…")


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]


🔍 Your query (or 'exit'):  Expert in business and pharmacy


  hits = self.client.search(
  hits = self.client.search(



Top Experts:
• [32504] Riku Heikki Rautsola  score=0.672
   Headline: CXO expert with 30 years of experience in the pharmaceutical CDMO domain in global markets
   Bio snippet: Dr. Riku has 30 years of experience in the pharma CDMO domain. He has rich exper …
   Work summary: Virtualis Ag CordenPharma International Corden Pharma Group LE&RN (Lymphatic Edu …
• [1376] Vipul Gupta  score=0.655
   Headline: Marketing expert with 28+ years of experience in Public Sector industry with exposure across India.
   Bio snippet: Mr. Vipul is a Marketing expert with 28+ years of experience in Public Sector in …
   Work summary: LG Life Sciences Promed Exports Shri Banarsidas Chandiwala S.S. Trust Society …
• [32069] Hardik Shah  score=0.600
   Headline: Procurement expert with 12 years of experience in the pharmaceutical industry in India and gloabl markets.
   Bio snippet: Mr. Hardik has 12 years of experience in the pharmaceutical industry. Looking af …
   Work summary: EMS Teva Pharmaceuticals 


🔍 Your query (or 'exit'):  OK addtionally i also want them to be specialized in medicnes too



Top Experts:
• [67995] Neesha K. Patel  score=0.658
   Headline: Strategic expert with 21+ years of experience in Healthcare Technology industry with exposure across USA.
   Bio snippet: Ms. Neesha is a Strategic expert with 21+ years of experience in Healthcare Tech …
   Work summary: Vitality Group Inc. athenahealth Siemens Healthineers Optum WellPoint Advisory B …
• [1376] Vipul Gupta  score=0.612
   Headline: Marketing expert with 28+ years of experience in Public Sector industry with exposure across India.
   Bio snippet: Mr. Vipul is a Marketing expert with 28+ years of experience in Public Sector in …
   Work summary: LG Life Sciences Promed Exports Shri Banarsidas Chandiwala S.S. Trust Society …
• [32504] Riku Heikki Rautsola  score=0.600
   Headline: CXO expert with 30 years of experience in the pharmaceutical CDMO domain in global markets
   Bio snippet: Dr. Riku has 30 years of experience in the pharma CDMO domain. He has rich exper …
   Work summary: Virtualis Ag CordenPha


 Your query (or 'exit'):  I want a expert for this project_agenda question - What are the key business and technology priorities for the enterprise



Top Experts:
• [67995] Neesha K. Patel  score=1.000
   Headline: Strategic expert with 21+ years of experience in Healthcare Technology industry with exposure across USA.
   Bio snippet: Ms. Neesha is a Strategic expert with 21+ years of experience in Healthcare Tech …
   Work summary: Vitality Group Inc. athenahealth Siemens Healthineers Optum WellPoint Advisory B …
• [55] Rahul Rao test test test test test test test test test test test test  score=0.503
   Headline: A CXO Level expert with 29+ years of experience in Food & Beverages and Financial Services industries with exposure across United Kingdom.
   Bio snippet: Mr. Rahul is a CXO Level expert with 29+ years of experience in Food & Beverages …
   Work summary: Lloyds Banking Group HSBC ABN AMRO Bank N.V. National Australia Bank ICICI Bank  …
• [635] Dhananjay Shinde  score=0.501
   Headline: Business Development expert with 30+ years of experience in Infrastructure industry with exposure across Indian region.
   Bio snippet: M


Your query (or 'exit'):  agenda_question - What are the critical business and tech goals that enterprises should prioritize?



Top Experts:
• [67995] Neesha K. Patel  score=0.856
   Headline: Strategic expert with 21+ years of experience in Healthcare Technology industry with exposure across USA.
   Bio snippet: Ms. Neesha is a Strategic expert with 21+ years of experience in Healthcare Tech …
   Work summary: Vitality Group Inc. athenahealth Siemens Healthineers Optum WellPoint Advisory B …
• [42306] Guilherme Oliveira   score=0.701
   Headline: 11+ years of experience in Procurement with exposure in Brazil
   Bio snippet: more than 11 years of experience in procurement and strategy with exposure in Br …
   Work summary: Log-In LogÃ­stica Intermodal S/A Louis Dreyfus Company Pvt Ltd …
• [55] Rahul Rao test test test test test test test test test test test test  score=0.663
   Headline: A CXO Level expert with 29+ years of experience in Food & Beverages and Financial Services industries with exposure across United Kingdom.
   Bio snippet: Mr. Rahul is a CXO Level expert with 29+ years of experience in Food & 

KeyboardInterrupt: Interrupted by user