In [62]:
!pip install bs4
!pip install sentence-transformers faiss-cpu transformers langdetect deep-translator google-genai streamlit


Collecting streamlit
  Downloading streamlit-1.46.0-py3-none-any.whl.metadata (9.0 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Downloading altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.5.0 (from streamlit)
  Using cached blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting pyarrow>=7.0 (from streamlit)
  Downloading pyarrow-20.0.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-win_amd64.whl.metadata (44 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.44-py3-none-any.whl.metadata (13 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting narwhals>=1.14.2 (from altair<6,>=4.0->streamlit)
  Downloading narwhals-1.44.0-py3-none-any.whl.metadata (11 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4.0.12-py3-none

In [58]:
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import requests, html, json, tldextract
from deep_translator import GoogleTranslator, exceptions

# --- Paraphrasing with FLAN-T5 via Hugging Face Inference API ---
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load the model once
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

def flan_paraphrase(text):
    prompt = f"Paraphrase this in a friendly, natural tone: {text}"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    outputs = model.generate(
        **inputs,
        max_length=512,
        num_beams=5,
        temperature=1.0,
        top_k=50,
        top_p=0.95
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)



# --- Safe Translation with Deep Translator Fallback ---
def safe_translate(text, target='hi'):
    try:
        return GoogleTranslator(source='auto', target=target).translate(text)
    except exceptions.TranslationNotFound:
        print(f"⚠️ Could not translate: {text}")
        return text
    except Exception as e:
        print(f"❌ Translation error: {e}")
        return text

# --- CRAWLER + FAQ PARSER ---
BASE_URL = "https://jupiter.money"
visited = set()
faq_data = {}

def extract_topics_from_url(url):
    path = urlparse(url).path.lower()
    keywords = ["credit-card", "credit-cards", "payment", "payments", "account", "accounts", "upi", "bill", "bills",
                "magic-spends", "rewards", "loan", "loans", "money", "invest"]
    topics = []
    for kw in keywords:
        if (f"/{kw}" in path or f"-{kw}" in path or f"{kw}-" in path or path.endswith(kw) or path.endswith(f"{kw}/")):
            topic = kw.replace("-", " ").rstrip("s").title()
            if topic not in topics:
                topics.append(topic)
    return topics or ["General"]

def is_internal(url):
    base = tldextract.extract(BASE_URL).domain
    link = tldextract.extract(url).domain
    return base == link or url.startswith("/")

def get_all_links(soup, base_url):
    links = set()
    for a in soup.find_all('a', href=True):
        full = urljoin(base_url, a['href'])
        if is_internal(full):
            p = urlparse(full)
            links.add(f"{p.scheme}://{p.netloc}{p.path}")
    return links

def clean_text(text):
    return ' '.join(html.unescape(text).split())

def extract_faq_from_page(url):
    try:
        res = requests.get(url, timeout=10)
        if res.status_code != 200:
            return {}
        soup = BeautifulSoup(res.text, 'html.parser')
    except:
        return {}

    items = soup.select('.faq-item')
    if not items:
        return {}

    section = "FAQs"
    for tag in ['h1', 'h2', 'h3']:
        h = soup.find(tag, string=lambda t: t and 'faq' in t.lower())
        if h:
            section = clean_text(h.get_text())
            break

    topics = extract_topics_from_url(url)
    qas = []
    for item in items:
        q_tag = item.select_one('.faq-header span')
        a_tag = item.select_one('.faq-answer')
        if q_tag and a_tag:
            q_text = clean_text(q_tag.get_text())
            a_text = clean_text(a_tag.get_text(separator=' '))
            qas.append({
                "question": q_text,
                "answer": a_text,
                "topics": topics
            })

    return {section: qas}

def crawl(url, max_depth=2, depth=0):
    if depth > max_depth or url in visited:
        return
    visited.add(url)
    print(f"Crawling: {url}")
    try:
        res = requests.get(url, timeout=10)
        soup = BeautifulSoup(res.text, 'html.parser')
    except:
        return

    page_faqs = extract_faq_from_page(url)
    for section, qas in page_faqs.items():
        faq_data.setdefault(section, [])
        for qa in qas:
            if qa not in faq_data[section]:
                faq_data[section].append(qa)

    for link in get_all_links(soup, url):
        crawl(link, max_depth, depth + 1)

# Run crawler
crawl(BASE_URL)

# Translate + Rephrase
for section in faq_data.values():
    for qa in section:
        q_en, a_en = qa["question"], qa["answer"]
        qa["answer"] = flan_paraphrase(a_en)
        qa["question_hi"] = safe_translate(q_en, 'hi')
        qa["answer_hi"] = safe_translate(a_en, 'hi')

# Save to JSON
with open("jupiter_faqs_multilingual.json", "w", encoding='utf-8') as f:
    json.dump(faq_data, f, ensure_ascii=False, indent=2)

print("✅ Done: Crawled, paraphrased, and translated.")


Crawling: https://jupiter.money
Crawling: https://jupiter.money/pay-via-upi
Crawling: https://jupiter.money/calculators/personal-loan-emi-calculator/
Crawling: https://life.jupiter.money/
Crawling: https://jupiter.money/edge-csb-rupay-credit-card/
Crawling: https://jupiter.money/savings-account
Crawling: https://jupiter.money/corporate-salary-account
Crawling: https://jupiter.money/investments
Crawling: https://jupiter.money/calculators/credit-card-emi-calculator/
Crawling: https://jupiter.money/careers
Crawling: https://jupiter.money/magic-spends
Crawling: https://jupiter.money/
Crawling: https://jupiter.money/calculators/employee-provident-fund-calculator/
Crawling: https://jupiter.money/payments
Crawling: https://jupiter.money/google-api-disclosure
Crawling: https://jupiter.money/edge-visa-credit-card/
Crawling: https://jupiter.money/contact-us/
Crawling: https://jupiter.money/communication-guidelines
Crawling: https://jupiter.money/terms-and-conditions/
Crawling: https://jupiter.mo

The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Crawling: https://web.jupiter.money/rupay-csb/web-ob/landing


The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=

✅ Done: Crawled, paraphrased, and translated.
