# Building a Human-Friendly FAQ Bot from Jupiter Help Centre

## Scraping the official Jupiter site and extracting the FAQs

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import time
from urllib.parse import urljoin, urlparse
from collections import defaultdict


# Root URL of the site being crawled; used as the crawl entry point and to
# recognise internal links.
BASE_URL = "https://jupiter.money"
# URLs that crawl_faq_pages has already fetched (shared, module-level state).
visited = set()

# Step 1: Get all internal links from the homepage
def get_internal_links(url):
    """Download `url` and collect the same-site links found in its <a href> tags.

    Args:
        url: Address of the page to fetch and scan.

    Returns:
        set[str]: Absolute http(s) URLs, with any "#fragment" removed, whose host
        equals the host of BASE_URL and which are not already in the module-level
        `visited` set. An empty (or partial) set is returned if the request or
        parsing fails; the error is printed rather than raised.
    """
    links = set()
    site_host = urlparse(BASE_URL).netloc
    try:
        res = requests.get(url, timeout=10)
        # Do not harvest links from an HTTP error page.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
        for a in soup.find_all("a", href=True):
            # Resolve relative hrefs against the page they were found on, and drop
            # the fragment so "/page#a" and "/page#b" count as one page.
            parsed = urlparse(urljoin(url, a['href']))._replace(fragment="")
            # Compare hosts exactly: a substring test on the whole URL also matches
            # look-alike hosts and URLs that merely mention the site in a query.
            if parsed.scheme not in ("http", "https") or parsed.netloc != site_host:
                continue
            full_url = parsed.geturl()
            # `visited` stores full URLs, so compare the full URL (not just the path).
            if full_url not in visited:
                links.add(full_url)
    except Exception as e:
        print(f"[Error] Failed to get links from {url}: {e}")
    return links

# Step 2: Check whether the page looks like an FAQ page (h1/h2 heading or <strong> "FAQs" marker)
def is_faq_page(soup):
    """Return True when the parsed page carries an FAQ marker, else False.

    A page qualifies if any <h1> or <h2> contains "frequently asked questions"
    (case-insensitive) or any <strong> contains the exact text "FAQs".
    """
    h1_tags = soup.find_all("h1")
    h2_tags = soup.find_all("h2")
    strong_tags = soup.find_all("strong")

    heading_marker = "frequently asked questions"
    for heading_group in (h1_tags, h2_tags):
        for heading in heading_group:
            if heading and heading_marker in heading.text.strip().lower():
                return True

    # The <strong> check is case-sensitive, unlike the heading check.
    for bold in strong_tags:
        if bold and "FAQs" in bold.text.strip():
            return True

    return False

def get_category_name_from_url(url):
    """Derive a category label from a URL: its last path segment, or "home".

    Leading and trailing slashes of the path are ignored, so
    ".../calculators/emi-calculator/" yields "emi-calculator" and a URL with an
    empty path yields "home".
    """
    trimmed_path = urlparse(url).path.strip('/')
    # For a path without any '/', rsplit returns the path itself as the only element.
    last_segment = trimmed_path.rsplit('/', 1)[-1]
    return last_segment if last_segment else "home"
    

def extract_faqs(soup):
    """Collect question/answer pairs from a parsed page.

    Two layouts are recognised, in this order:
      1. Collapsible FAQs: <div class="faq-header"> elements paired positionally
         with <div class="faq-answer"> elements.
      2. <h3> headings containing "?" followed by a sibling <p>.

    Returns:
        list[dict]: Items of the form {"question": str, "answer": str}; pairs with
        an empty question or answer are left out.
    """
    collected = []

    # --- Layout 1: collapsible FAQs ---
    header_divs = soup.find_all("div", class_="faq-header")
    answer_divs = soup.find_all("div", class_="faq-answer")
    # zip pairs by position and stops at the shorter of the two lists.
    for header_div, answer_div in zip(header_divs, answer_divs):
        question = header_div.get_text(strip=True)
        answer = answer_div.get_text(strip=True)
        if not (question and answer):
            continue
        collected.append({"question": question, "answer": answer})

    # --- Layout 2: <h3> question followed by a <p> answer ---
    for heading in soup.find_all("h3"):
        question = heading.get_text(strip=True)
        if not question or "?" not in question:
            continue
        paragraph = heading.find_next_sibling("p")
        if not paragraph:
            continue
        answer = paragraph.get_text(strip=True)
        if answer:
            collected.append({"question": question, "answer": answer})

    return collected


def crawl_faq_pages():
    """Visit every internal link found on the homepage and gather FAQs by category.

    Each link returned by get_internal_links(BASE_URL) is fetched once (tracked in
    the module-level `visited` set). Pages accepted by is_faq_page are passed to
    extract_faqs, and the resulting pairs are stored under the category derived
    from the page URL. Progress is printed, and the loop pauses one second after
    each fetched page.

    Returns:
        defaultdict(list): category name -> list of {"question", "answer"} dicts.
    """
    categorized_faqs = defaultdict(list)
    links_to_visit = get_internal_links(BASE_URL)
    print(f"Found {len(links_to_visit)} internal links")

    for link in links_to_visit:
        if link in visited:
            continue
        visited.add(link)

        try:
            print(f"\n🔍 Visiting: {link}")
            res = requests.get(link, timeout=10)
            # Treat 4xx/5xx responses as failures (reported by the except branch)
            # instead of parsing the error page as if it were site content.
            res.raise_for_status()
            soup = BeautifulSoup(res.text, 'html.parser')

            if is_faq_page(soup):
                print("✅ FAQ page found")
                faqs = extract_faqs(soup)
                if faqs:
                    print(f"✔️ Extracted {len(faqs)} Q&A pairs")
                    category = get_category_name_from_url(link)
                    categorized_faqs[category].extend(faqs)
                else:
                    print("⚠️ FAQ structure found, but no pairs extracted")
            else:
                print("🚫 Skipped (not FAQ page)")
        except Exception as e:
            print(f"[Error] Failed to parse {link}: {e}")
        # Pause between requests, whether or not the page could be processed.
        time.sleep(1)
    return categorized_faqs

# Step 5: Save result
if __name__ == "__main__":
    data = crawl_faq_pages()
    with open("targeted_jupiter_faqs.json", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    # `data` maps category -> list of Q&A pairs, so len(data) is the number of
    # categories; the FAQ count is the total length of those lists.
    total_faqs = sum(len(pairs) for pairs in data.values())
    print(f"\n🎉 Done! Extracted {total_faqs} FAQs across {len(data)} categories and saved to 'targeted_jupiter_faqs.json'")


Found 42 internal links

🔍 Visiting: https://jupiter.money/calculators/emi-calculator/
✅ FAQ page found
✔️ Extracted 11 Q&A pairs

🔍 Visiting: https://jupiter.money/edge-csb-rupay-credit-card/
🚫 Skipped (not FAQ page)

🔍 Visiting: https://jupiter.money/calculators/personal-loan-emi-calculator/
✅ FAQ page found
✔️ Extracted 7 Q&A pairs

🔍 Visiting: https://jupiter.money/terms-and-conditions/
🚫 Skipped (not FAQ page)

🔍 Visiting: https://jupiter.money/money
✅ FAQ page found
✔️ Extracted 7 Q&A pairs

🔍 Visiting: https://jupiter.money/calculators/
🚫 Skipped (not FAQ page)

🔍 Visiting: https://jupiter.money/calculators/employee-provident-fund-calculator/
🚫 Skipped (not FAQ page)

🔍 Visiting: https://jupiter.money/rewards
✅ FAQ page found
✔️ Extracted 5 Q&A pairs

🔍 Visiting: https://jupiter.money/corporate-salary-account
✅ FAQ page found
✔️ Extracted 12 Q&A pairs

🔍 Visiting: https://jupiter.money/contact-us/
🚫 Skipped (not FAQ page)

🔍 Visiting: https://jupiter.money/flexi-fd
🚫 Skipped (no

## Preprocessing and Cleaning the extracted FAQs

In [2]:
import json
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from bs4 import BeautifulSoup
import numpy as np

# Load data
# Read the file from the working directory, which is where the scraping cell
# writes it, rather than from a Kaggle-specific absolute path.
with open("targeted_jupiter_faqs.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Step 1: Cleaning function
def clean_text(text):
    """Return `text` without HTML tags and with whitespace runs collapsed.

    The text is passed through BeautifulSoup's html.parser to drop any markup,
    every run of whitespace becomes a single space, and leading/trailing
    whitespace is removed. Letter case is left unchanged.
    """
    without_tags = BeautifulSoup(text, "html.parser").get_text()
    single_spaced = re.sub(r'\s+', ' ', without_tags)
    return single_spaced.strip()

# Step 2: Deduplicate questions using cosine similarity
def deduplicate_faqs(faqs, threshold=0.9):
    """Drop FAQs whose question is a near-duplicate of an earlier one.

    Questions are vectorised with TF-IDF and compared by cosine similarity. The
    list is scanned in order: each FAQ not yet marked as a duplicate is kept, and
    every FAQ whose question similarity to it exceeds `threshold` is marked so it
    will be skipped later.

    Args:
        faqs: List of dicts with "question" and "answer" keys.
        threshold: Similarity above which two questions count as duplicates.

    Returns:
        list[dict]: The kept FAQs, with question and answer passed through
        clean_text, in their original order.
    """
    cleaned = [
        {"question": clean_text(faq["question"]), "answer": clean_text(faq["answer"])}
        for faq in faqs
    ]
    # Nothing to compare with fewer than two entries; this also avoids the
    # "empty vocabulary" ValueError that TfidfVectorizer raises on an empty list.
    if len(cleaned) < 2:
        return cleaned

    questions = [item["question"] for item in cleaned]
    tfidf_matrix = TfidfVectorizer().fit_transform(questions)
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Indices of questions already covered by an earlier, kept question
    used = set()
    deduped_faqs = []

    for i, item in enumerate(cleaned):
        if i in used:
            continue
        deduped_faqs.append(item)
        similar_idxs = np.where(similarity_matrix[i] > threshold)[0]
        used.update(similar_idxs.tolist())

    return deduped_faqs

# Step 3: Apply cleaning + deduplication
# Normalise every FAQ, drop entries whose question or answer ends up empty,
# then deduplicate within each category.
cleaned_data = {}

for category, faqs in raw_data.items():
    normalized_pairs = (
        {"question": clean_text(faq["question"]), "answer": clean_text(faq["answer"])}
        for faq in faqs
    )
    cleaned_faqs = [
        pair for pair in normalized_pairs if pair["question"] and pair["answer"]
    ]
    cleaned_data[category] = deduplicate_faqs(cleaned_faqs)

# Step 4: Save cleaned data
# Persist the cleaned, deduplicated FAQs as UTF-8 JSON (non-ASCII characters
# are written as-is rather than escaped).
with open("cleaned_faqs.json", "w", encoding="utf-8") as out_file:
    json.dump(cleaned_data, out_file, ensure_ascii=False, indent=2)

print("\n✅ Cleaning & deduplication complete! Saved to 'cleaned_faqs.json'")



✅ Cleaning & deduplication complete! Saved to 'cleaned_faqs.json'


## Building the bot with the Mistral Instruct model

In [3]:
%%capture
!pip install openai sentence-transformers faiss-cpu

In [4]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never hard-code an access token in a notebook. Any token that has
# been saved or shared inside a notebook should be treated as leaked and revoked.
# The token is read from the HF_TOKEN environment variable, with an interactive
# prompt as a fallback.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

In [5]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json

# Load cleaned FAQs
with open("cleaned_faqs.json", "r", encoding="utf-8") as faq_file:
    faq_data = json.load(faq_file)

# One flat record per FAQ, tagged with the category it was stored under
faq_list = [
    {
        "question": item["question"],
        "answer": item["answer"],
        "category": category,
    }
    for category, faqs in faq_data.items()
    for item in faqs
]

# Create embeddings for questions
question_texts = [faq["question"] for faq in faq_list]
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# One embedding row per question, in the same order as faq_list
question_embeddings = embedder.encode(question_texts, convert_to_numpy=True)

# Flat L2 index sized to the embedding width; row i corresponds to faq_list[i]
embedding_dim = question_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(question_embeddings)


def find_similar_question(user_query, top_k=3):
    """Return the stored FAQs whose questions are nearest to `user_query`.

    Args:
        user_query: Free-text question from the user.
        top_k: Maximum number of FAQs to return.

    Returns:
        list[dict]: Up to `top_k` entries of faq_list, in the order returned by
        the index search. Empty when the index holds no vectors or `top_k` is
        not positive.
    """
    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with -1, and faq_list[-1] would silently return the last FAQ.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = embedder.encode([user_query])
    _, neighbor_ids = index.search(query_embedding, k)
    return [faq_list[i] for i in neighbor_ids[0] if i >= 0]

2025-06-27 07:58:24.536019: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751011104.729626      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751011104.788163      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Model name
# Checkpoint used for rephrasing answers
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Load tokenizer and model (GPU recommended)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto"  # load the weights in the dtype recorded by the checkpoint
)

# Set up the inference pipeline
mistral_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
    # `temperature` only takes effect in sampling mode; without do_sample=True
    # generation is greedy and the value is ignored.
    do_sample=True,
    temperature=0.7,
    # Set explicitly to the value generate() would otherwise fall back to,
    # which avoids the "Setting `pad_token_id` to `eos_token_id`" notice per call.
    pad_token_id=tokenizer.eos_token_id,
    return_full_text=False
)


def rephrase_answer_mistral(question, answer, user_query):
    """Ask the Mistral pipeline to rewrite a stored FAQ answer in a friendly tone.

    Args:
        question: The matched FAQ question.
        answer: The stored answer to rephrase.
        user_query: The user's own wording, included in the prompt as context.

    Returns:
        str: The generated text with surrounding whitespace removed.
    """
    # No literal "<s>" in the prompt: the tokenizer already prepends the BOS token
    # when the pipeline encodes a plain string, so writing it out duplicates it.
    prompt = f"""[INST] 
You are a helpful support bot for Jupiter, a finance app. Rephrase the following answer to be friendly, human-like, and easy to understand.

Question: {question}
Original Answer: {answer}
User Query: {user_query}

Friendly Response: [/INST]
"""

    response = mistral_pipeline(prompt)[0]["generated_text"]
    return response.strip()



tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda:0


## Demonstrating the solution

In [8]:
# Interactive demo: type "exit" or "quit" to stop.
while True:
    # Strip surrounding whitespace so inputs such as "quit " still end the loop.
    query = input("\nAsk me anything about Jupiter: ").strip()
    if not query:
        # Nothing to search for; prompt again instead of querying the model.
        continue
    if query.lower() in ("exit", "quit"):
        break

    matches = find_similar_question(query, top_k=1)
    if not matches:
        # Guard against an empty result list instead of failing on [0].
        print("\n🤖 Bot: Sorry, I couldn't find an FAQ related to that.")
        continue

    match = matches[0]
    rephrased = rephrase_answer_mistral(
        match["question"], match["answer"], user_query=query
    )
    print(f"\n🤖 Bot (category: {match['category']}): {rephrased}")


Ask me anything about Jupiter:  How to use an EMI calculator?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



🤖 Bot (category: emi-calculator): Hi there! Using an EMI (Equated Monthly Installment) calculator is a breeze. Here's a simple step-by-step guide:

1. First, enter the total amount of your loan in Indian Rupees.
2. Next, add the interest rate as a percentage.
3. Lastly, input the loan tenure in years or months. That's it! The calculator will then compute the monthly EMI for you.

If you need any assistance or have further questions, feel free to ask! 😊



Ask me anything about Jupiter:  quit
