In [None]:
!pip install chromadb sentence-transformers fastapi uvicorn pydantic PyPDF2

Collecting chromadb
  Downloading chromadb-1.2.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.38.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31

2

In [None]:
!pip -q install chromadb==0.5.5 sentence-transformers==3.0.1 fastapi==0.115.0 uvicorn[standard]==0.30.6 pyngrok==7.2.0 pydantic==2.9.2 scikit-learn==1.5.2
!pip -q uninstall -y fitz
!pip -q install pymupdf==1.24.9
from google.colab import drive
# Mount Google Drive so the handbook PDF stored there is readable from this runtime.
drive.mount('/content/drive')
# Source PDF that a later cell parses into policy records.
PDF_PATH="/content/drive/MyDrive/JetBlue_Fare_Policy_Handbook_FULL.pdf"
# Directory handed to the persistent Chroma client; a later cell deletes it before re-indexing.
PERSIST_DIR="/content/chroma_db"


[0mDrive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import fitz, re, uuid

# Fare-family names that guess_fare_family searches for (whole word, case-insensitive).
FARE_FAMILIES=["Blue Basic","Blue","Blue Plus","Blue Extra","Blue Refundable","Mint"]
# Customer-type labels that guess_user_type tries in list order; the first hit wins,
# and both "TrueBlue" and "Guest" are reported as "TrueBlue/Guest".
USER_TYPES=["TrueBlue","Guest","TrueBlue/Guest","Mosaic member","Mosaic","JetBlue Plus/Business Cardmember","JetBlue Business Cardmember","Cardmember"]
# Category key -> keywords. guess_category counts how many keywords occur as
# substrings of the lower-cased sentence and ranks categories by that count.
# Ties go to the key that appears first here, so the key order is significant.
CATEGORY_HINTS={
    "boarding":["boarding","pre-board","group","queue","sequence","priority"],
    "cancellation":["cancel","cancellation","refund","nonrefundable","non-refundable","fee","waive","waiver","credit"],
    "changes":["change","changes","same-day","same day","standby","rebook","rebooking"],
    "baggage":["bag","baggage","checked","carry-on","personal item","oversize","overweight","dimension","linear"],
    "pet_travel":["pet","pets","in-cabin","kennel","carrier","service animal","emotional support"],
    "seating":["seat","seating","assignment","selection","even more space","ems","preferred"],
    "fare_rules":["fare","fare family","family benefits","inclusions","exclusions","rules","policy"],
    "fees":["fee","charge","penalty","cost","waive","waiver"],
    "airport":["airport","check-in","gate","security","tsa","boarding pass"]
}

def extract_text_blocks(path):
    """Split a PDF's text into paragraph-like blocks.

    Each page's plain text is read line by line. Consecutive non-blank lines
    are joined with single spaces into one block; a blank line, or the end of
    the page, closes the current block. Blocks never span pages.

    Args:
        path: Location of the PDF, passed straight to ``fitz.open``.

    Returns:
        list[str]: The blocks in document order.
    """
    blocks=[]
    doc=fitz.open(path)
    try:
        for page in doc:
            buf=[]
            for ln in page.get_text("text").splitlines():
                ln=ln.strip()
                if ln:
                    buf.append(ln)
                elif buf:
                    # Blank line: the paragraph collected so far is complete.
                    blocks.append(" ".join(buf))
                    buf=[]
            if buf:
                blocks.append(" ".join(buf))
    finally:
        # Release the document even when a page fails to parse.
        doc.close()
    return blocks

def sent_split(t):
    """Break text into sentences.

    Whitespace runs are first collapsed to single spaces. The text is then cut
    after every '.', '!' or '?' that is followed by whitespace and an
    upper-case A-Z letter. Empty pieces are dropped.

    Returns:
        list[str]: The trimmed sentences in their original order.
    """
    flat=re.sub(r'\s+',' ',t).strip()
    sentences=[]
    for piece in re.split(r'(?<=[\.\!\?])\s+(?=[A-Z])',flat):
        piece=piece.strip()
        if piece:
            sentences.append(piece)
    return sentences

def guess_fare_family(s):
    """Return the fare family named in ``s``, or "All" when none is named.

    Names from FARE_FAMILIES are tried longest first, so "Blue Basic" is
    preferred over the shorter "Blue". Matching is whole-word and
    case-insensitive.
    """
    longest_first=sorted(FARE_FAMILIES,key=len,reverse=True)
    hit=next(
        (name for name in longest_first if re.search(rf'\b{re.escape(name)}\b',s,re.I)),
        None,
    )
    return "All" if hit is None else hit

def guess_user_type(s):
    """Return the first USER_TYPES label found in ``s``, or "All" if none is.

    Labels are tried in list order with whole-word, case-insensitive matching.
    A hit on "TrueBlue" or "Guest" is reported as the combined label
    "TrueBlue/Guest".
    """
    for label in USER_TYPES:
        if not re.search(rf'\b{re.escape(label)}\b',s,re.I):
            continue
        if label=="TrueBlue" or label=="Guest":
            return "TrueBlue/Guest"
        return label
    return "All"

def guess_category(s):
    """Pick the category whose keywords occur most often in ``s``.

    Every keyword in CATEGORY_HINTS is tested as a substring of the
    lower-cased sentence, so "fee" also counts inside "coffee". The category
    with the most hits wins, and ties go to the key listed first.

    Returns:
        str: The winning key in title case with underscores turned into
        spaces (e.g. "Pet Travel"), or "General" when no keyword matches.
        Previously such a sentence was labelled with the first key,
        i.e. "Boarding".
    """
    low=s.lower()
    scores={k:sum(w in low for w in v) for k,v in CATEGORY_HINTS.items()}
    best=max(scores,key=scores.get,default=None)
    if best is None or scores[best]==0:
        # With all scores at zero, max() would return an arbitrary first key.
        return "General"
    return best.replace("_"," ").title()

def make_policy_id(cat,fare,user):
    """Build an id of the form ``jetblue.<cat>.<fare>.<user>.<6 hex chars>``.

    Each part is lower-cased, runs of characters outside a-z and 0-9 become a
    single '-', and leading or trailing '-' are trimmed. A falsy part becomes
    "na". The suffix comes from a fresh ``uuid4``, so repeated calls with the
    same arguments give different ids.
    """
    parts=[]
    for value in (cat,fare,user):
        if value:
            parts.append(re.sub(r'[^a-z0-9]+','-',value.lower()).strip('-'))
        else:
            parts.append("na")
    suffix=uuid.uuid4().hex[:6]
    return f"jetblue.{parts[0]}.{parts[1]}.{parts[2]}.{suffix}"

def extract_records_from_pdf(path,min_chars=60,source_url="https://www.jetblue.com/flying-with-us/our-fares"):
    """Turn a PDF into one policy record per sufficiently long sentence.

    The PDF is split into blocks and then sentences. Sentences shorter than
    ``min_chars`` are skipped. Each remaining sentence is tagged with a guessed
    fare family, user type and category, and given a freshly generated id.

    Args:
        path: PDF location, forwarded to ``extract_text_blocks``.
        min_chars: Minimum sentence length, in characters, to keep.
            Defaults to the previously hard-coded 60.
        source_url: Value stored in every record's ``source_url`` field.
            Defaults to the previously hard-coded fares page.

    Returns:
        list[dict]: Records with the keys ``policy_id``, ``category``,
        ``fare_family``, ``user_type``, ``effective_from`` (always
        "Current"), ``source_url`` and ``rule`` (the sentence text).
    """
    recs=[]
    for blk in extract_text_blocks(path):
        for s in sent_split(blk):
            if len(s)<min_chars:
                continue
            fare=guess_fare_family(s)
            user=guess_user_type(s)
            cat=guess_category(s)
            recs.append({
                "policy_id": make_policy_id(cat,fare,user),
                "category": cat,
                "fare_family": fare,
                "user_type": user,
                "effective_from": "Current",
                "source_url": source_url,
                "rule": s
            })
    return recs

# Parse the handbook into policy records; the bare expression below displays how many were produced.
recs=extract_records_from_pdf(PDF_PATH)
len(recs)


37

In [None]:
def norm(x):
    """Coerce a field value to display text.

    Strings are returned stripped, ``None`` becomes "NA", and any other value
    is passed through ``str``.
    """
    if isinstance(x,str):
        return x.strip()
    if x is None:
        return "NA"
    return str(x)

def make_singleline_chunk(rec):
    """Serialise a record as one line of ``[Label]: value`` segments.

    The seven core fields come first in a fixed order: Policy ID, Category,
    Fare Family, User Type, Effective From, Source URL, Rule. Any other keys
    follow in the record's own order, with the key title-cased and underscores
    turned into spaces. Values go through ``norm``. A missing
    ``effective_from`` defaults to "Current". All whitespace runs in the
    result are collapsed to single spaces.
    """
    core=(
        ("Policy ID",norm(rec.get("policy_id"))),
        ("Category",norm(rec.get("category"))),
        ("Fare Family",norm(rec.get("fare_family"))),
        ("User Type",norm(rec.get("user_type"))),
        ("Effective From",norm(rec.get("effective_from","Current"))),
        ("Source URL",norm(rec.get("source_url"))),
        ("Rule",norm(rec.get("rule"))),
    )
    segments=[f"[{label}]: {value}" for label,value in core]
    reserved=("policy_id","category","fare_family","user_type","effective_from","source_url","rule")
    for key,value in rec.items():
        # Core fields were already emitted above; only extras are added here.
        if key.lower().strip() not in reserved:
            segments.append(f"[{key.replace('_',' ').title()}]: {norm(value)}")
    return " ".join(" ".join(segments).split())

# One single-line text chunk per record, in the same order as recs (the indexing cell relies on that alignment).
chunks=[make_singleline_chunk(r) for r in recs]
len(chunks)


37

In [None]:
import os, shutil
# Start from an empty store: delete whatever a previous run persisted.
if os.path.exists(PERSIST_DIR): shutil.rmtree(PERSIST_DIR)

from sentence_transformers import SentenceTransformer
# Sentence-embedding model shared by indexing and querying (see embed_texts); pinned to CPU.
emb_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

import chromadb
from chromadb.config import Settings
# Chroma client backed by PERSIST_DIR, and the single collection holding every policy chunk.
client = chromadb.PersistentClient(path=PERSIST_DIR, settings=Settings(allow_reset=True))
collection = client.get_or_create_collection(name="jetblue_policies")

def embed_texts(texts):
    """Encode ``texts`` with the shared ``emb_model``.

    Returns whatever ``emb_model.encode`` produces when asked for NumPy output
    with normalised embeddings, using a batch size of 64.
    """
    encode_options={
        "batch_size":64,
        "convert_to_numpy":True,
        "normalize_embeddings":True,
    }
    return emb_model.encode(texts,**encode_options)

# Chroma ids reuse the generated policy ids, one per record.
ids=[rec["policy_id"] for rec in recs]
# Metadata stored next to each chunk.
metas=[
    {
        "source":"JetBlue_Fare_Policy_Handbook_FULL.pdf",
        "policy_id":rec["policy_id"],
        "fare_family":rec["fare_family"],
        "user_type":rec["user_type"],
        "category":rec["category"],
    }
    for rec in recs
]
# Embeddings are computed here and passed to Chroma as plain lists.
embs=[vector.tolist() for vector in embed_texts(chunks)]
collection.add(documents=chunks, metadatas=metas, ids=ids, embeddings=embs)
collection.count()


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


37

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Intent name -> extra keywords that build_query appends to the search text.
# Lookup is by the exact (stripped) intent string; unknown intents add nothing.
INTENT_HINTS={
    "book_flight":"fare family baggage fees boarding order change cancellation",
    "check_status":"status delay irregular operations rebooking standby disruptions",
    "pet_travel":"pet pets in cabin checked kennel carrier restrictions fees",
    "lost_baggage":"lost baggage mishandled delayed claims compensation timelines do not promise recovery timeline",
    "cancellation":"cancellation refund refundable nonrefundable fee waive waiver timeline policy blue basic",
    "seat_availability":"seat selection map assignment even more space preferred",
    "policy":"rules inclusions exclusions benefits fees",
    "boarding":"boarding group pre-board sequence priority",
    "baggage":"checked carry-on personal item size weight fees"
}

def build_query(user_intent: dict):
    """Compose the free-text search query for an intent dict.

    The query is the intent name, then every other value in dict order, then
    the INTENT_HINTS keywords for that intent, joined by single spaces.

    ``None`` is treated as "not provided". A ``None`` intent no longer raises
    on ``.strip()``, and ``None`` facet values are skipped instead of adding
    the literal word "None" to the query. A non-string intent is converted
    with ``str``.

    Returns:
        str: The query, or "policy" when nothing usable was supplied.
    """
    raw_intent=user_intent.get("intent")
    base="" if raw_intent is None else str(raw_intent).strip()
    parts=[base]
    for k,v in user_intent.items():
        if k=="intent" or v is None:
            continue
        parts.append(str(v))
    parts.append(INTENT_HINTS.get(base,""))
    q=" ".join([p for p in parts if p]).strip()
    return q or "policy"

def retrieve_raw(user_intent, n=30):
    """Run the vector search for an intent.

    Builds the query text, embeds it, and asks the collection for the ``n``
    nearest entries.

    Returns:
        tuple: ``(documents, metadatas, ids, query_text)``. The first three
        are taken from the first (only) result batch, and each is an empty
        list when the response lacks that field.
    """
    query_text=build_query(user_intent)
    query_vector=embed_texts([query_text])[0].tolist()
    hits=collection.query(query_embeddings=[query_vector], n_results=n)
    documents, metadatas, identifiers = (
        hits.get(field,[[]])[0] for field in ("documents","metadatas","ids")
    )
    return documents, metadatas, identifiers, query_text

def tfidf_rerank(query, docs, top_k=5):
    """Score ``docs`` against ``query`` with a TF-IDF model fitted on both.

    The vectoriser uses unigrams and bigrams and is fitted on the query plus
    the documents. Each document's score is the dot product of its row with
    the query row.

    Returns:
        tuple: ``(order, scores)``. ``order`` holds the indices into ``docs``
        of the ``top_k`` highest-scoring documents, best first. ``scores``
        holds the score of every document in its original position.
    """
    vectorizer=TfidfVectorizer(ngram_range=(1,2), min_df=1)
    matrix=vectorizer.fit_transform([query]+docs)
    # Row 0 is the query; the remaining rows line up with docs.
    query_row, doc_rows = matrix[0], matrix[1:]
    scores=(doc_rows @ query_row.T).toarray().ravel()
    best_first=np.argsort(-scores)
    return best_first[:top_k], scores

def post_filter(user_intent, docs, metas):
    """Order document indices so that facet matches come first.

    A document is "primary" when its lower-cased text contains the requested
    fare family, user type or intent name as a substring. All other documents
    are "secondary". Relative order inside each group is preserved.

    Missing or ``None`` facets are ignored. Previously ``None`` was
    stringified, so every document containing "none" was promoted.

    Args:
        user_intent: Dict that may carry "fare_family", "user_type", "intent".
        docs: Candidate document texts.
        metas: Metadata aligned with ``docs``. It is not inspected, but
            iteration still stops at the shorter of the two sequences.

    Returns:
        list[int]: Primary indices followed by secondary indices.
    """
    def facet(key):
        value=user_intent.get(key)
        return "" if value is None else str(value).lower()

    needles=[n for n in (facet("fare_family"),facet("user_type"),facet("intent")) if n]
    pri=[]
    sec=[]
    for i,(d,_meta) in enumerate(zip(docs,metas)):
        low=d.lower()  # lower-case once per document rather than once per facet
        if any(n in low for n in needles):
            pri.append(i)
        else:
            sec.append(i)
    return pri+sec

def retrieve_snippets(user_intent: dict, top_k=5):
    """Return up to ``top_k`` distinct policy chunks for an intent.

    Pipeline: vector search for 40 candidates, reorder with ``post_filter``,
    re-rank with ``tfidf_rerank``, then drop repeats. Chunks are deduplicated
    on the ``[Policy ID]`` value, or on the first 60 characters when a chunk
    has no policy id.

    NOTE(review): ``tfidf_rerank`` sorts by score, so the ordering produced by
    ``post_filter`` can only influence ties. Confirm that is intended.
    """
    docs, metas, _ids, query_text = retrieve_raw(user_intent, n=40)
    if not docs:
        return []
    preferred_first=[docs[i] for i in post_filter(user_intent, docs, metas)]
    limit=min(top_k,len(preferred_first))
    ranked,_scores=tfidf_rerank(query_text, preferred_first, top_k=limit)
    seen_keys=set()
    unique=[]
    for idx in ranked:
        snippet=preferred_first[idx]
        found=re.search(r'\[Policy ID\]:\s*([^\s\[]+)', snippet)
        key=found.group(1) if found else snippet[:60]
        if key not in seen_keys:
            seen_keys.add(key)
            unique.append(snippet)
            if len(unique)>=top_k:
                break
    return unique


In [None]:
import re, datetime, json
from typing import List, Dict

# Labels of the core fields in a serialised chunk. Not referenced by any code in the visible cells.
BASE_KEYS=["Policy ID","Category","Fare Family","Rule"]

def explode_snippet(s):
    """Split a chunk into one string per ``[Policy ID]:`` marker.

    The text is cut immediately before every ``[Policy ID]:`` occurrence, and
    whitespace ahead of the marker is consumed. Empty pieces are dropped and
    the rest are returned stripped, in order.
    """
    records=[]
    for piece in re.split(r'\s*(?=\[Policy ID\]:)', s.strip()):
        piece=piece.strip()
        if piece:
            records.append(piece)
    return records

def parse_line_to_dict(line):
    """Parse ``[Label]: value`` segments into a dict.

    A value runs up to the next '[' and is stripped. When a label occurs more
    than once, the last value wins. Text without any segment yields ``{}``.
    """
    pairs=re.findall(r'\[([^\]]+)\]:\s*([^\[]*)', line)
    return {label.strip(): value.strip() for label,value in pairs}

def cat_norm(x):
    """Lower-case and strip ``x``; any falsy value (``None``, "") becomes ""."""
    text=x if x else ""
    return text.lower().strip()

# Intent name -> substrings that matches_intent looks for in a record's
# Category or Rule text. Intents missing from this table fall back to ["policy"].
INTENT_CATS={
    "boarding":["boarding","airport > boarding"],
    "cancellation":["cancellation","flexibility > cancellations","refund"],
    "changes":["changes","flexibility > changes","same-day","same day","standby"],
    "baggage":["baggage","checked bags","carry-on","fare rules > carry-on","personal item"],
    "pet_travel":["pet","pets","kennel","carrier","in-cabin"],
    "seat_availability":["seats","selection","even more space"],
    "policy":["fare rules","rules","policy"],
    "check_status":["irregular operations","status","disruption","delay"]
}

def matches_intent(d, intent):
    """Tell whether a parsed record is relevant to ``intent``.

    The record matches when any INTENT_CATS keyword for the intent occurs as a
    substring of its normalised "Category" or "Rule" text. Unknown intents use
    the keyword list ``["policy"]``.
    """
    haystacks=(cat_norm(d.get("Category","")), cat_norm(d.get("Rule","")))
    needles=[k.lower() for k in INTENT_CATS.get(intent,["policy"])]
    return any(needle in hay for hay in haystacks for needle in needles)

def matches_facets(d, user_intent, enforce):
    """Check a parsed record against the requested fare family and user type.

    With ``enforce`` false every record passes. Otherwise each facet present
    (and truthy) in ``user_intent`` must occur, case-insensitively, as a
    substring of the record's "Fare Family" / "User Type" value.

    NOTE(review): a record tagged "All" does not match a specific facet such
    as "Blue Basic". Confirm that rules applying to everyone should be
    excluded in that case.
    """
    if not enforce:
        return True
    wanted_fare=user_intent.get("fare_family")
    wanted_user=user_intent.get("user_type")
    if wanted_fare and wanted_fare.lower() not in d.get("Fare Family","").lower():
        return False
    if wanted_user and wanted_user.lower() not in d.get("User Type","").lower():
        return False
    return True

def to_min_json(d):
    """Reduce a parsed record to the minimal response shape.

    Always returns ``policy_id``, ``category`` and ``rule``, each stripped and
    "" when absent. ``fare_family`` is added only when the record has a
    non-empty "Fare Family".
    """
    def field(label):
        return d.get(label,"").strip()

    slim={
        "policy_id": field("Policy ID"),
        "category": field("Category"),
        "rule": field("Rule"),
    }
    fare=field("Fare Family")
    if fare:
        slim["fare_family"]=fare
    return slim

def retrieve_snippets_intent_only(user_intent: dict, top_k=5):
    """Retrieve policy snippets as minimal dicts, filtered by intent and facets.

    When ``user_intent`` has no key other than "intent", the facet filter is
    off, up to 200 chunks are requested, and every distinct match is returned
    (``top_k`` is ignored). Otherwise up to 40 chunks are requested, facets
    are enforced, and the result is cut at ``top_k``.

    Records are deduplicated on their policy id.

    NOTE(review): ``retrieve_snippets`` itself queries 40 candidates, so the
    200 figure cannot yield more than that. Confirm the intended limit.
    """
    only_intent = set(user_intent.keys()) == {"intent"}
    fetch_size = 200 if only_intent else 40

    lines=[]
    for snippet in retrieve_snippets(user_intent, top_k=fetch_size):
        lines.extend(explode_snippet(snippet))
    parsed=[parse_line_to_dict(line) for line in lines if line.strip()]

    intent=user_intent.get("intent","policy")
    relevant=[
        d for d in parsed
        if matches_intent(d,intent) and matches_facets(d,user_intent,enforce=not only_intent)
    ]

    emitted_ids=set()
    results=[]
    for d in relevant:
        pid=d.get("Policy ID","") or d.get("policy_id","")
        if pid in emitted_ids:
            continue
        emitted_ids.add(pid)
        results.append(to_min_json(d))
        if len(results)>=top_k and not only_intent:
            break
    return results

def retrieve_and_save_intent_only(user_intent: dict, top_k=5, out_path="/content/policy_snippets_intent.json"):
    """Retrieve snippets for an intent and write them to a JSON file.

    The payload holds the current local timestamp (ISO format), the intent as
    given, and the retrieved snippets. It is written to ``out_path`` as UTF-8
    JSON with two-space indentation and non-ASCII characters kept as-is.

    Returns:
        dict: The same payload that was written.
    """
    payload={
        "timestamp": datetime.datetime.now().isoformat(),
        "user_intent": user_intent,
        "policy_snippets": retrieve_snippets_intent_only(user_intent, top_k=top_k),
    }
    with open(out_path,"w",encoding="utf-8") as handle:
        handle.write(json.dumps(payload,ensure_ascii=False,indent=2))
    return payload


In [None]:
# Smoke test: facets are supplied, so the facet filter is enforced and at most top_k
# snippets are returned; the payload is also written to out_path.
retrieve_and_save_intent_only(
     {"intent":"cancellation","fare_family":"Blue Basic","user_type":"TrueBlue/Guest"},
    top_k=5,
    out_path="/content/cancellation_all_min.json"
)




{'timestamp': '2025-10-22T15:06:33.117393',
 'user_intent': {'intent': 'cancellation',
  'fare_family': 'Blue Basic',
  'user_type': 'TrueBlue/Guest'},
 'policy_snippets': [{'policy_id': 'jetblue.trueblue.cancellations.blue-basic',
   'category': 'Flexibility > Cancellations',
   'rule': 'Cancellations permitted; a fee applies in addition to any fare difference.',
   'fare_family': 'Blue Basic'},
  {'policy_id': 'jetblue.cancellation.blue-basic.trueblue-guest.e505c0',
   'category': 'Cancellation',
   'rule': 'Refund usually as travel credit for nonIrefundable fares.',
   'fare_family': 'Blue Basic'},
  {'policy_id': 'jetblue.cancellation.blue-basic.trueblue-guest.f43cf4',
   'category': 'Cancellation',
   'rule': '',
   'fare_family': 'Blue Basic'},
  {'policy_id': 'jetblue.cancellation.blue-basic.trueblue-guest.19a732',
   'category': 'Cancellation',
   'rule': '',
   'fare_family': 'Blue Basic'}]}

In [None]:
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel, Field
from typing import Dict, List, Optional

# One policy snippet in the /retrieve response; mirrors the dict built by to_min_json.
class PolicyItem(BaseModel):
    policy_id: str
    category: str
    rule: str
    # Optional because to_min_json only emits fare_family when the parsed value is non-empty.
    fare_family: Optional[str] = None

# Request body for /retrieve. user_intent is a free-form dict; the retrieval
# code reads the keys "intent", "fare_family" and "user_type" from it.
class IntentIn(BaseModel):
    user_intent: Dict

# Response body for /retrieve: a list of policy snippets that defaults to empty.
class SnippetsOut(BaseModel):
    policy_snippets: List[PolicyItem] = Field(default_factory=list)

# FastAPI application object that the /retrieve route below is registered on.
app = FastAPI(title="JetBlue Policy RAG (Minimal JSON)")

# POST /retrieve: look up policy snippets for the submitted intent.
# Documented with comments rather than a docstring, because FastAPI would
# publish a docstring as the operation description in the generated API docs.
@app.post("/retrieve", response_model=SnippetsOut)
def retrieve_endpoint(payload: IntentIn):
    # Delegate to the retrieval pipeline (top_k fixed at five) and wrap the
    # result in the envelope that SnippetsOut validates.
    return {"policy_snippets": retrieve_snippets_intent_only(payload.user_intent, top_k=5)}

import nest_asyncio
import uvicorn

# The notebook kernel already runs an asyncio event loop; this patch lets uvicorn start inside it.
nest_asyncio.apply()

# Serve the app on the loopback interface, port 8000. This call keeps the cell busy
# while the server runs (the log shows "Press CTRL+C to quit"), so later cells cannot
# execute until it is interrupted.
uvicorn.run(app, host="127.0.0.1", port=8000)



INFO:     Started server process [57757]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)
