# Shoplite RAG: Colab Deployment
Self-contained notebook: installs deps, loads an open-source LLM (Llama 3.1 8B or fallback), builds FAISS index, serves Flask API, and exposes via ngrok.


In [1]:

# Cell 1: Install dependencies
!pip -q install --upgrade pip
!pip -q install transformers accelerate bitsandbytes sentence-transformers faiss-cpu flask pyngrok pyyaml


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m0.9/1.8 MB[0m [31m27.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [26]:
# --- Cell 2: Imports, Configuration, and Device Setup ---
import os, json, time, threading
from typing import List, Dict, Any
import numpy as np
import torch
from flask import Flask, request, jsonify
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, GenerationConfig
from pyngrok import ngrok

# --- Configuration Dictionary (Centralized and Reusable) ---
RAG_CONFIG = {
    "CHUNK_SIZE": 400,
    "CHUNK_OVERLAP": 0,#we don't need here an overlap
    "EMBEDDING_MODEL": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "PREFERRED_LLM": "unsloth/Meta-Llama-3.1-8B-Instruct",
    "FALLBACK_LLM": "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    "RETRIEVAL_K": 4,
    "MIN_SIMILARITY_THRESHOLD": 0.60, # Minimum score to consider retrieval successful
    "CONFIDENCE_MAP": {
        "High": 0.75,
        "Medium": 0.50,
        "Low": 0.0,
    }
}
# --------------------------------

# Determine the device for PyTorch and LLM
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

Device: cuda


In [3]:
# Cell 3: Knowledge base data
KNOWLEDGE_BASE = [
  {
    "id": "doc1",
    "title": "Shoplite User Registration Process",
    "content": "To create a Shoplite account, users visit the registration page and provide their email address, password, and basic profile information. Email verification is required within 24 hours of registration. Users can choose between buyer accounts (free) or seller accounts (requires business verification). Passwords must meet complexity rules: minimum 12 characters, at least one number, one uppercase letter, and one symbol. Multi‑factor authentication (MFA) is optional but recommended; SMS and authenticator‑app methods are supported. Username handles are unique platform‑wide and can be changed once every 90 days. Profile completion unlocks perks such as saved searches and personalized deals. Suspicious sign‑ups trigger a cool‑down and manual review. Account recovery requires email access plus a backup code or a verified phone number. Session timeouts are 30 minutes for web and 7 days for trusted devices. For minors, guardian consent is required in regions where applicable. Enterprise teams can request a consolidated billing profile after verification."
  },
  {
    "id": "doc2",
    "title": "Product Search and Filtering",
    "content": "Shoplite’s search supports keyword, category, brand, and structured filters. Filters include price range with currency awareness, shipping speed, seller rating thresholds, stock availability, and attributes like color/size. Query understanding expands bare terms (e.g., “tee” to “t‑shirt”) and auto‑corrects common typos. Results default to “Relevance,” but users can sort by price, rating, or newest. Faceted filters update counts dynamically. Saved searches alert users when new items match criteria, using daily digests to reduce notification fatigue. On mobile, chip‑style filters keep context visible as users scroll. The search index updates within 60 seconds of inventory changes for top sellers and within 5 minutes for others. Banned terms and restricted items are filtered according to regional law. Search logs are anonymized for quality metrics like click‑through rate and add‑to‑cart rate. Developers can use the Search API with pagination and a maximum of 50 results per page to maintain latency."
  },
  {
    "id": "doc3",
    "title": "Shopping Cart and Checkout",
    "content": "The Shoplite cart accepts items from multiple sellers and auto‑groups them by fulfillment method. Users can adjust quantities, save items for later, and apply one promo code per order. Taxes are estimated by shipping address and updated at payment step once the full address is confirmed. Shipping options show expected delivery windows based on SLA and carrier scans. Stock is soft‑reserved for 15 minutes after starting checkout to reduce oversells; for flash sales, reservation compresses to 5 minutes. Address book supports multiple addresses with nickname labels. The checkout supports guest mode, but coupon redemption requires login. Fraud signals (device fingerprint, velocity checks) may require step‑up verification. Digital goods are delivered instantly with license keys stored in the user vault. For partial shipments, each sub‑order gets its own tracking number. Email confirmations include a tax invoice PDF when applicable. If payment fails, the cart is restored and the user sees actionable error hints."
  },
  {
    "id": "doc4",
    "title": "Payment Methods and Security",
    "content": "Shoplite supports major cards, Apple Pay/Google Pay, PayPal, and region‑specific wallets. PCI‑DSS Level 1 compliance is maintained; sensitive card data never touches Shoplite servers and is tokenized at the gateway. 3‑D Secure (2.0) is applied using risk‑based rules. Users can store payment tokens for one‑click checkout, protected by device binding and MFA. Refunds are processed to the original payment method; store credit is offered when card rails do not support immediate refunds. The platform monitors chargeback ratios per seller with tiered penalties. Payment webhooks implement idempotency keys to prevent double‑posting. Developers integrating the Payments API should sign requests with HMAC and rotate keys every 90 days. Security audits include quarterly ASV scans and annual penetration testing. Suspicious transactions trigger manual review queues and may delay shipment. For subscriptions, vaulted tokens are scoped per merchant and cannot be reused across sellers."
  },
  {
    "id": "doc5",
    "title": "Order Tracking and Delivery",
    "content": "Customers can track orders via the Orders page, email links, or the mobile app’s push notifications. Tracking status flows through stages: Confirmed, Packed, Shipped, Out for Delivery, Delivered, or Exception (e.g., address issue). Each package includes carrier, tracking ID, latest scan time, and an estimated delivery date. Delivery estimates consider cutoff times and weekends; expedited options show narrowed windows. Failed delivery attempts automatically schedule re‑delivery when supported by the carrier. If no scans are recorded within 72 hours after label creation, the system nudges the seller to ship or cancels automatically depending on policy. Pickup‑point deliveries require a government‑issued ID. Buyers can reschedule delivery or change the pickup point before Out for Delivery stage. International shipments include customs status where available. Proof of delivery may include a signature or geotagged photo in regions where permitted."
  },
  {
    "id": "doc6",
    "title": "Return and Refund Policies",
    "content": "Shoplite offers a 30‑day return window for most categories starting from the delivery date. Certain categories (perishables, intimate apparel, digital downloads) are final sale unless defective. Returns require a return authorization (RA) number, which buyers can request from the Orders page. Prepaid return labels are provided for domestic returns; international returns may require buyer‑paid shipping depending on policy. Items must be unused and in original packaging; serial‑numbered goods are verified upon receipt. Refunds are issued within 5 business days of warehouse confirmation. Exchanges are supported when inventory is available. Abuse prevention includes return rate monitoring and mis‑ship claims checks. For marketplace orders, sellers may set stricter policies, but not looser than Shoplite’s baseline. If the carrier marks a package as delivered but the buyer disputes, support follows a claims process requiring an affidavit and any available evidence."
  },
  {
    "id": "doc7",
    "title": "Product Reviews and Ratings",
    "content": "Reviews can be left only by verified purchasers within 90 days of delivery. Ratings range from 1 to 5 stars and include optional text, photos, or videos. Shoplite uses an anti‑spam filter and human moderation. Sellers may reply publicly but cannot remove negative reviews; they can flag reviews that violate guidelines. The default sort is “Most Helpful,” calculated from recency, upvotes, and reviewer credibility. Review aggregation displays per‑attribute scores (fit, material quality) when sellers provide product attributes. Incentivized reviews must be disclosed and are weighted lower. Users can follow reviewers to see future posts. Offensive content is hidden pending review. Developers can access aggregated ratings via the Reviews API with caching recommended for high‑traffic pages. Attempts to game ratings (vote brigading) are rate‑limited and audited."
  },
  {
    "id": "doc8",
    "title": "Seller Account Setup and Management",
    "content": "Sellers register via the Shoplite Seller Portal, providing business name, contact details, tax ID, and payout bank account. Business verification typically completes in 2–3 business days. KYC (Know Your Customer) checks include legal entity validation and beneficial‑owner screening. Once approved, sellers configure shipping templates, return addresses, and handling times. The dashboard presents order queues, cancellation rates, late shipment metrics, and buyer messages. Payouts are weekly by default, with daily payouts available for low‑risk sellers after 60 days. Policy violations (e.g., listing prohibited items) trigger warnings and potential suspension. Multi‑user access with roles allows staff accounts for operations, catalog, and support. Two‑factor authentication is enforced for admin roles. Sellers can set vacation mode to pause listings while preserving search rank. Bulk listing tools and a CSV importer help migrate catalogs."
  },
  {
    "id": "doc9",
    "title": "Inventory Management for Sellers",
    "content": "Inventory can be managed via web UI, CSV imports, or Inventory API. Sellers define SKUs, barcodes, and variant attributes. Real‑time stock decrements on checkout prevent oversells; back‑in‑stock notifications collect waitlist emails. Low‑stock thresholds trigger alerts. For FBS (Fulfilled by Shoplite) inventory, inbound shipments require box content information and ASN (advanced shipment notice). Cycle counts reconcile discrepancies between physical and system stock. For bundled products, virtual SKUs map to component SKUs to ensure availability math is consistent. Out‑of‑stock items can be hidden or shown with an expected restock date. Auto‑archive kicks in after 90 days of zero stock and zero views. Sellers can bulk‑update prices with minimum/maximum guardrails to prevent errors."
  },
  {
    "id": "doc10",
    "title": "Commission and Fee Structure",
    "content": "Shoplite charges a commission on each sale, varying by category from 5% to 15%, plus a flat $0.30 transaction fee. High‑risk categories may include additional risk surcharges. FBS storage fees are billed monthly by cubic foot; long‑term storage fees apply after 365 days. Advertising features (Sponsored Listings) bill per click with a second‑price auction model. Refund administration may incur a small fee to cover payment gateway costs. Sellers receive detailed monthly statements, downloadable as CSV. Disputed charges can be appealed within 30 days. Promotions funded by Shoplite do not affect the seller’s commission. Fee changes are announced with at least 30 days’ notice except in cases of legal or security urgency."
  },
  {
    "id": "doc11",
    "title": "Customer Support Procedures",
    "content": "Support is available via chat, email, and phone, with priority queues for Plus members. The triage system categorizes tickets into payments, delivery, returns, account access, and policy. First‑response SLA is 4 hours for chat/email during business hours. Agents use internal tools to view order history, message logs, and fraud flags; sensitive fields are masked. Escalations go to specialized teams with target resolution times. Refund exceptions require supervisor approval. Support avoids making policy exceptions that could set precedents. For safety and harassment issues, a dedicated Trust & Safety team handles reports. Agents follow structured prompts to ensure consistent messaging and must link to policy documents in customer responses. Customer satisfaction (CSAT) surveys follow each contact, and QA audits a sample of interactions weekly."
  },
  {
    "id": "doc12",
    "title": "Mobile App Features",
    "content": "The Shoplite mobile app offers personalized home feed, barcode scanning for price comparison, and offline carts that sync when connected. Push notifications cover price drops, back‑in‑stock, delivery updates, and abandoned carts. Biometric login simplifies authentication. In‑app chat connects buyers to sellers for product questions. The app uses a compact UI with bottom navigation: Home, Search, Cart, Orders, Profile. Native share sheets let users send product links. App updates are rolled out progressively to 10%, then 50%, then 100% to monitor crash rates. Background fetch updates order statuses even if the app is not foregrounded. Deep links open directly to product, cart, or order pages. Data usage is minimized with on‑device caching and compressed images."
  },
  {
    "id": "doc13",
    "title": "API Documentation for Developers",
    "content": "Developers can access Shoplite APIs for Search, Orders, Inventory, and Reviews. Authentication uses OAuth 2.0 with scopes per API. Rate limits are per client ID: 100 requests/minute for standard and 500 requests/minute for partners. Webhooks are available for order events (created, shipped, refunded) and are signed with HMAC‑SHA256. SDKs are provided in JavaScript and Python with examples for pagination and retries. Error responses follow RFC 7807 problem+json with machine‑readable codes. Sandbox and production endpoints are separate; never use production credentials in test. Idempotency keys are required for write operations to avoid duplicate effects. API changelogs document deprecations with a 90‑day sunset. Support forums are moderated by Shoplite staff."
  },
  {
    "id": "doc14",
    "title": "Security and Privacy Policies",
    "content": "Shoplite practices data minimization: collect only what is necessary, retain only as long as needed, and encrypt data at rest and in transit. Access controls follow least privilege and are reviewed quarterly. Audit logs are immutable and stored separately. Privacy choices include data export and account deletion within 30 days. Cookies respect Do Not Track where legally mandated. Third‑party processors undergo security review and contractual DPAs. Breach notifications follow legal timelines and include scope and mitigations. Children’s data is protected with heightened controls. Internal red‑team exercises test incident response. The platform supports bug bounty submissions with coordinated disclosure. For ML features, differential privacy may be applied to aggregate metrics."
  },
  {
    "id": "doc15",
    "title": "Promotional Codes and Discounts",
    "content": "Shoplite supports percent‑off, amount‑off, and free‑shipping codes. Promo codes can be limited by category, seller, minimum spend, or first‑order only. Stacking is disabled; the best eligible discount is applied automatically. Expired codes are rejected with reasons. Abuse prevention checks velocity (codes per user per day) and linkage across devices. Sellers can fund coupons for their own catalog. Seasonal campaigns use controlled start/end times with the cart showing a countdown. Price‑slash promotions display the reference price and the discounted price with clear labeling. Developers should note that promotions are evaluated at checkout; cart totals may change after address confirmation if taxes or shipping rates shift."
  }
]

In [27]:
# Cell 4: YAML prompts (inline -> dict)
PROMPTS = {
  "base_retrieval_prompt": {
    "role": "You are a helpful Shoplite customer service assistant.",
    "goal": "Provide accurate answers using only the provided Shoplite documentation.",
    "context_guidelines": [
      "Use only information from the provided document snippets",
      "Cite specific documents when possible",
      "Prefer precise, actionable steps over vague summaries"
    ],
    "response_format": "Answer: <text>\nSources: <titles>\nConfidence: <High|Medium|Low>"
  },
  "multi_doc_synthesis": {
    "role": "Expert support analyst who combines information from multiple Shoplite docs.",
    "goal": "Synthesize a coherent answer merging policy + procedure across documents.",
    "context_guidelines": [
      "Aggregate across at least two documents when the question spans topics",
      "Call out dependencies (e.g., requires login)",
      "Avoid speculation; ask targeted follow‑ups if needed"
    ],
    "response_format": "Answer: <text>\nSources: <titles>\nKey Requirements: <bullets>"
  },
  "no_context_refusal": {
    "role": "Careful assistant who refuses beyond provided context.",
    "goal": "Prevent hallucinations when retrieval returns no relevant snippets.",
    "context_guidelines": [
      "If top‑k retrieval has low similarity or is empty, refuse with guidance"
    ],
    "response_format": "Answer: <refusal>\nNext Steps: <guidance>\nSources: None"
  }
}

In [None]:
# --- Cell 5: LLM Loading and Prompt Utility ---
HF_TOKEN = input("Enter your Hugging Face token (press Enter to skip and use fallback): ").strip()

def load_llm(repo_id: str, token: str):
    """Loads the LLM and tokenizer with 4-bit quantization on CUDA if available."""
    try:
        tok = AutoTokenizer.from_pretrained(repo_id, use_auth_token=(token or None))

        # Consistent model configuration
        model_kwargs = {
            "device_map": "auto",
            "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
            "use_auth_token": (token or None),
        }

        if torch.cuda.is_available():
            # BitsAndBytes (bnb) settings for 4-bit quantization
            model_kwargs.update({
                "load_in_4bit": True,
                "bnb_4bit_quant_type": "nf4",
                "bnb_4bit_use_double_quant": True,
                "bnb_4bit_compute_dtype": torch.float16,
            })

        model = AutoModelForCausalLM.from_pretrained(repo_id, **model_kwargs)
        return tok, model
    except Exception as e:
        print(f"Failed to load {repo_id}. This is expected if the token is missing/invalid or resources are scarce.")
        raise

# LLM Loading Logic with Fallback
try:
    tokenizer, llm = load_llm(RAG_CONFIG["PREFERRED_LLM"], HF_TOKEN)
    active_model = RAG_CONFIG["PREFERRED_LLM"]
except Exception:
    print(f"Attempting fallback to {RAG_CONFIG['FALLBACK_LLM']}")
    tokenizer, llm = load_llm(RAG_CONFIG["FALLBACK_LLM"], HF_TOKEN)
    active_model = RAG_CONFIG["FALLBACK_LLM"]

print("\nLoaded model:", active_model)


def create_rag_messages(prompt_cfg: Dict[str, Any], question: str, ctx_text: str) -> List[Dict[str, str]]:
    """Formats the RAG prompt into the Llama 3.1 chat template structure."""

    # 1. System Prompt: Role, Goal, and Guidelines
    system_prompt = f"{prompt_cfg['role']}\n\nGoal: {prompt_cfg['goal']}\n\nContext Guidelines:\n"
    system_prompt += "\n".join([f"- {g}" for g in prompt_cfg['context_guidelines']])

    # 2. User Prompt: Context, Question, and CRITICAL format instruction
    user_prompt = (
        f"DOCUMENTATION CONTEXT:\n{ctx_text}\n\n"
        f"CRITICAL: You MUST use the provided context to answer. If the context does not contain the answer, you must state you cannot answer based on the provided documents.\n\n"
        f"USER QUESTION: {question}\n\n"
        f"Your response MUST follow this structure:\n{prompt_cfg['response_format']}"
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    return messages


In [6]:
# --- Cell 6: RAG Pipeline and Index Building ---

# --- RAG Utility Functions ---
def split_text_into_chunks(text: str, chunk_size: int, overlap: int) -> List[str]:
    """Splits text by words with overlap, preventing context breakage."""
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size - overlap)]

# def embed_chunks(chunks: List[str]) -> tuple[np.ndarray, SentenceTransformer]:
#     """Encodes chunks and loads the embedder model."""
#     embedder = SentenceTransformer(RAG_CONFIG["EMBEDDING_MODEL"])
#     embeddings = embedder.encode(chunks, show_progress_bar=True)
#     return np.array(embeddings).astype('float32'), embedder

# def build_faiss_index(embeddings: np.ndarray) -> faiss.Index:
#     """Creates a FAISS IndexFlatIP (Inner Product) after L2 normalization (cosine similarity)."""
#     dim = embeddings.shape[1]
#     index = faiss.IndexFlatIP(dim)
#     faiss.normalize_L2(embeddings)  # Normalize for Cosine Similarity
#     index.add(embeddings)
#     return index
def embed_chunks(chunks: List[str]) -> tuple[np.ndarray, SentenceTransformer]:
    """Encodes chunks and loads the embedder model with cosine normalization."""
    embedder = SentenceTransformer(RAG_CONFIG["EMBEDDING_MODEL"])
    # Normalize at encode time for cosine similarity
    embeddings = embedder.encode(chunks, show_progress_bar=True, normalize_embeddings=True)
    return np.array(embeddings).astype('float32'), embedder

def build_faiss_index(embeddings: np.ndarray) -> faiss.Index:
    """Creates a FAISS IndexFlatIP for cosine similarity."""
    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)  # inner product
    index.add(embeddings)  # embeddings already normalized
    return index


def prepare_documents_for_embedding(documents: List[Dict[str, str]]):
    """Chunks documents and stores metadata."""
    all_chunks, chunked_doc_ids, chunked_doc_titles = [], [], []

    for doc in documents:
        # Include title in the chunk content for better embeddings
        doc_text = f"Title: {doc['title']}\n\n{doc['content']}"
        chunks = split_text_into_chunks(doc_text, RAG_CONFIG["CHUNK_SIZE"], RAG_CONFIG["CHUNK_OVERLAP"])

        all_chunks.extend(chunks)
        chunked_doc_ids.extend([doc['id']] * len(chunks))
        chunked_doc_titles.extend([doc['title']] * len(chunks))

    return all_chunks, chunked_doc_ids, chunked_doc_titles

# def retrieve(query: str, k: int = RAG_CONFIG["RETRIEVAL_K"]):
#     """Retrieves the top-k most relevant document chunks based on the query."""
#     # Global variables are available after the cell runs
#     query_embedding = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype("float32")

#     sims, idxs = index.search(query_embedding, k)

#     results = []
#     for j in range(k):
#         i = idxs[0][j]
#         results.append({
#             "title": chunked_doc_titles[i],
#             "id": chunked_doc_ids[i],
#             "score": float(sims[0][j]),
#             "content": all_chunks[i]
#         })

#     return results
def retrieve(query: str, k: int = RAG_CONFIG["RETRIEVAL_K"]):
    """Retrieves the top-k most relevant document chunks based on cosine similarity."""
    # Normalize at encode time
    query_embedding = embedder.encode([query], normalize_embeddings=True).astype("float32")

    sims, idxs = index.search(query_embedding, k)

    results = []
    for j in range(k):
        i = idxs[0][j]
        results.append({
            "title": chunked_doc_titles[i],
            "id": chunked_doc_ids[i],
            "score": float(sims[0][j]),
            "content": all_chunks[i]
        })

    return results

# --- Index Initialization (Executed only once) ---
print("Preparing documents and building index...")
all_chunks, chunked_doc_ids, chunked_doc_titles = prepare_documents_for_embedding(KNOWLEDGE_BASE)
embeddings, embedder = embed_chunks(all_chunks)
index = build_faiss_index(embeddings)
print(f"FAISS Index built successfully with {len(all_chunks)} chunks using {RAG_CONFIG['EMBEDDING_MODEL']}.")

Preparing documents and building index...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

FAISS Index built successfully with 15 chunks using sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2.


In [28]:
# --- Cell 7: Generation Function ---

# def format_context(ctx: List[Dict[str, Any]], max_chars: int = 1800) -> str:
#     """Formats retrieved context snippets into a string, respecting max_chars."""
#     blocks, total = [], 0
#     for c in ctx:
#         # Add source title to the snippet
#         snippet = f"[{c['title']}]\n{c['content']}"

#         # Context throttling
#         if total + len(snippet) > max_chars:
#             remaining_chars = max(0, max_chars - total)
#             snippet = snippet[:remaining_chars]
#             if remaining_chars > 0:
#                  blocks.append(snippet + "...") # Indicate truncation

#         if total < max_chars:
#             blocks.append(snippet)

#         total += len(snippet)
#         if total >= max_chars: break

#     return "\n\n".join(blocks)

def get_confidence(max_score: float) -> str:
    """Maps the max similarity score to a confidence level."""
    if max_score >= RAG_CONFIG["CONFIDENCE_MAP"]["High"]:
        return "High"
    elif max_score >= RAG_CONFIG["CONFIDENCE_MAP"]["Medium"]:
        return "Medium"
    else:
        return "Low"

def generate_answer(prompt_cfg: Dict[str, Any], question: str, ctx: List[Dict[str, Any]]) -> tuple[str, List[str]]:
    """Generates an answer using the LLM and the Llama 3.1 chat template."""

    # # # 1. Format Context and Messages
    # ctx_text = format_context(ctx)
    messages = create_rag_messages(prompt_cfg, question, ctx)

    # 2. Apply Chat Template and Tokenize
    input_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True # Appends the start of the assistant's response
    )
    inputs = tokenizer(input_text, return_tensors="pt").to(llm.device)

    # 3. Define Generation Config
    gen_config = GenerationConfig(
        max_new_tokens=512,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        # Stop generation at the Llama 3 end-of-turn token
        stop_strings=['<|eot_id|>'],
    )

    # 4. Generate
    with torch.no_grad():
        output = llm.generate(
            **inputs,
            generation_config=gen_config,
            tokenizer=tokenizer # Pass the tokenizer here
        )

    # 5. Decode and Extract Completion
    prompt_len = inputs['input_ids'].shape[1]
    completion_tokens = output[0][prompt_len:]
    completion = tokenizer.decode(completion_tokens, skip_special_tokens=True).strip()

    # Collect unique source titles
    titles = list(set(c["title"] for c in ctx))
    return completion, titles

In [8]:
pip install flask-cors


Collecting flask-cors
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Downloading flask_cors-6.0.1-py3-none-any.whl (13 kB)
Installing collected packages: flask-cors
Successfully installed flask-cors-6.0.1


In [34]:
from flask import Flask, request, jsonify
from flask_cors import CORS  # Import CORS

app = Flask(__name__)

# Enable CORS for all routes (global CORS)
CORS(app)


@app.route("/health")
def health():
    """Returns a simple status check and the active LLM name."""
    # Use the global active_model variable defined in Cell 5
    return jsonify(status="ok", model=active_model)

import logging

# Configure logging to show debug messages
logging.basicConfig(level=logging.DEBUG)

@app.route('/ping', methods=['POST'])
def ping():
    """Simple raw LLM response endpoint for basic connectivity/latency test."""
    try:
        data = request.get_json(force=True)
        prompt = data.get("prompt", "Hello from Shoplite RAG!")
        logging.debug(f"Received prompt: {prompt}")

        inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
        with torch.no_grad():
            out = llm.generate(**inputs, max_new_tokens=64, do_sample=True, top_p=0.9, temperature=0.7, pad_token_id=tokenizer.eos_token_id)

        prompt_len = inputs['input_ids'].shape[1]
        completion = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
        logging.debug(f"Generated completion: {completion}")

        return jsonify(output=completion)

    except Exception as e:
        logging.error(f"An error occurred: {e}")
        return jsonify(error=f"An error occurred: {e}"), 500



@app.route("/chat", methods=["POST"])
def chat():
    """RAG-enabled chat endpoint. Retrieves context, generates answer, and applies refusal logic."""
    data = request.get_json(force=True)
    question = data.get("question", "").strip()

    if not question:
        return jsonify(error="question is required"), 400

    # 1. Retrieval
    ctx = retrieve(question, k=RAG_CONFIG["RETRIEVAL_K"])
    max_score = max([c["score"] for c in ctx])
    print("Retrieved context:", ctx) # Debug print
    print("Max similarity score:", max_score) # Debug print
    print("not ctx:",(not ctx))

    # 2. Refusal Logic (Low Similarity/No Context)
    if not ctx or max_score < RAG_CONFIG["MIN_SIMILARITY_THRESHOLD"]:
        # Use the specialized refusal prompt template
        cfg = PROMPTS["no_context_refusal"]

        # Craft a polite refusal message to pass to the LLM (for templating)
        refusal_q = "The context is insufficient to answer the user's question. Formulate a polite refusal."

        refusal_answer, _ = generate_answer(cfg, refusal_q, ctx)

        # Include ctx in the response even for low confidence
        return jsonify(answer=refusal_answer, sources=[], confidence="Low", ctx=ctx), 200

    # 3. Generation
    cfg = PROMPTS["base_retrieval_prompt"]
    answer, titles = generate_answer(cfg, question, ctx)

    # 4. Confidence Mapping
    confidence = get_confidence(max_score)

    return jsonify(answer=answer, sources=titles, confidence=confidence,ctx=ctx)


def run_app():
    """Function to run the Flask application in the main thread."""
    # Disable reloader in a threaded environment to prevent double execution
    app.run(debug=True, port=8000, use_reloader=True, reloader_type='stat')

In [35]:

from pyngrok import ngrok
import threading

# Start Flask in background
threading.Thread(target=run_app, daemon=True).start()

 * Serving Flask app '__main__'
 * Debug mode: on


Address already in use
Port 8000 is in use by another program. Either identify and stop that program, or start the server with a different port.


In [24]:


# Cell 9: ngrok tunnel
ngrok_token = input("Enter your ngrok token: ").strip()
if ngrok_token:
    ngrok.set_auth_token(ngrok_token)

public_url = ngrok.connect(8000, "http").public_url
print("Public URL:", public_url)



 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:8000
 * Running on http://172.28.0.12:8000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


Enter your ngrok token:  33TIy09MwZggQukR0il7WCOfH6Q_2mNTnjGok5impmeUFdSxG
Public URL: https://94c777f0af3e.ngrok-free.app


In [25]:
# Test /ping endpoint
import requests
try:
    ping_payload = {"prompt": "hi"}
    r = requests.post(public_url + "/ping", json=ping_payload)
    print("POST /ping:", r.json())
except Exception as e:
    print("POST /ping failed:", e)

INFO:werkzeug:127.0.0.1 - - [03/Oct/2025 02:25:56] "POST /ping HTTP/1.1" 200 -


POST /ping: {'output': "everyone, i'm so glad you could join me today. i want to talk about a very important topic that affects us all, regardless of where we come from or what our background is. that topic is mental health.\nmental health is not just about feeling sad or blue, it's about how we take care of our"}


In [13]:
# Test /chat endpoint
import requests
try:
    chat_payload = {"question": "How do I create a seller account on Shoplite?"}
    r = requests.post(public_url + "/chat", json=chat_payload)
    print("POST /chat:", r.json())
except Exception as e:
    print("POST /chat failed:", e)



POST /chat failed: Expecting value: line 1 column 1 (char 0)


In [None]:
# Test /health endpoint
import requests
try:
    r = requests.get(public_url + "/health")
    print("GET /health:", r.json())
except Exception as e:
    print("GET /health failed:", e)

INFO:werkzeug:127.0.0.1 - - [02/Oct/2025 13:36:01] "GET /health HTTP/1.1" 200 -


GET /health: {'model': 'unsloth/Meta-Llama-3.1-8B-Instruct', 'status': 'ok'}


In [None]:

# Cell 10: Quick test
import requests, time
time.sleep(2)
try:
    r = requests.get(public_url + "/health")
    print("Health:", r.json())
except Exception as e:
    print("Health check failed:", e)


INFO:werkzeug:127.0.0.1 - - [02/Oct/2025 13:28:22] "GET /health HTTP/1.1" 200 -


Health: {'model': 'unsloth/Meta-Llama-3.1-8B-Instruct', 'status': 'ok'}


In [43]:
import torch

def test_retrieval_and_generation(prompt_cfg: Dict[str, Any], question: str):
    """
    Test the retrieval and generation process by logging the retrieved context and
    checking the similarity scores before generating an answer.
    """

    # Step 1: Retrieve context based on the question
    ctx = retrieve(question, k=4)  # Assuming `retrieve()` function is already defined

    # Step 2: Log the retrieved context
    print("Retrieved Context:")
    for c in ctx:
        print(f"Title: {c['title']}\nScore: {c['score']}\nContent: {c['content'][:200]}...")  # Print a preview of the content

    # Step 3: Check if context has enough relevance
    scores = [c["score"] for c in ctx]
    print("Similarity Scores:", scores)

    # Optional: Check if context passes the threshold
    SIM_THRESHOLD = 0.6# Adjust the threshold value as needed
    if max(scores) < SIM_THRESHOLD:
       # 2. Refusal Logic (Low Similarity/No Context)

        # Use the specialized refusal prompt template
        prompt_cfg = PROMPTS["no_context_refusal"]

    # # Step 4: Format the context and messages
    # ctx_text = format_context(ctx)
    messages = create_rag_messages(prompt_cfg, question, ctx)

    # Step 5: Apply the chat template and tokenize the input
    input_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = tokenizer(input_text, return_tensors="pt").to(llm.device)

    # Step 6: Define generation config
    gen_config = GenerationConfig(
        max_new_tokens=512,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        stop_strings=['<|eot_id|>'],
    )

    # Step 7: Generate the answer
    with torch.no_grad():
        output = llm.generate(
            **inputs,
            generation_config=gen_config,
            tokenizer=tokenizer  # Ensure tokenizer is passed here
        )

    # Step 8: Decode and extract the completion
    prompt_len = inputs['input_ids'].shape[1]
    completion_tokens = output[0][prompt_len:]
    completion = tokenizer.decode(completion_tokens, skip_special_tokens=True).strip()

    # Step 9: Collect unique source titles
    titles = list(set(c["title"] for c in ctx))

    # Step 10: Return the generated answer along with the relevant source titles
    return completion, titles

# Test the function with a sample question
sample_question = "What is the commission fee structure ?"
prompt_cfg = {
    "role": "You are a helpful Shoplite customer service assistant.",
    "goal": "Provide accurate answers using only the provided Shoplite documentation.",
    "context_guidelines": ["Use only information from the provided document snippets"],
    "response_format": "Answer: <text>\nSources: <titles>\nConfidence: <High|Medium|Low>",
}

# Call the test function
answer, sources = test_retrieval_and_generation(prompt_cfg, sample_question)

# Output the result
print("\nGenerated Answer:\n", answer)



Retrieved Context:
Title: Commission and Fee Structure
Score: 0.6679459810256958
Content: Title: Commission and Fee Structure Shoplite charges a commission on each sale, varying by category from 5% to 15%, plus a flat $0.30 transaction fee. High‑risk categories may include additional risk ...
Title: Payment Methods and Security
Score: 0.3782304525375366
Content: Title: Payment Methods and Security Shoplite supports major cards, Apple Pay/Google Pay, PayPal, and region‑specific wallets. PCI‑DSS Level 1 compliance is maintained; sensitive card data never touche...
Title: Return and Refund Policies
Score: 0.320715069770813
Content: Title: Return and Refund Policies Shoplite offers a 30‑day return window for most categories starting from the delivery date. Certain categories (perishables, intimate apparel, digital downloads) are ...
Title: Seller Account Setup and Management
Score: 0.31586652994155884
Content: Title: Seller Account Setup and Management Sellers register via the Shoplite Sel