#  Scrape FAQs from Jupiter Contact Page using BeautifulSoup 

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://jupiter.money/contact/"
resp = requests.get(url)
soup = BeautifulSoup(resp.content, "html.parser")

faq_data = []
ul = soup.find('ul', attrs={'data-controller': 'faq'})
if ul:
    for li in ul.find_all('li'):
        btn = li.find('button')
        if btn:
            question_span = btn.find('span', attrs={'class': lambda x: x and 'text-black100' in x})
            question = question_span.get_text(" ", strip=True) if question_span else btn.get_text(" ", strip=True)
        else:
            continue
        answer_p = li.find('p', attrs={'data-faq-target': True})
        answer = answer_p.get_text(" ", strip=True) if answer_p else ""
        if question and answer:
            faq_data.append({'question': question, 'answer': answer})

df = pd.DataFrame(faq_data)
print(f"✅ Scraped {len(df)} FAQs")
print(df)
df.to_csv("all_files/final_jupiter_faqs.csv", index=False)


✅ Scraped 11 FAQs
                                             question  \
0                              What is Jupiter Money?   
1   Is 'Jupiter Bank' approved by\n            the...   
2      How can I open a Savings\n            account?   
3                         How can I get a Debit card?   
4   How to deposit cash in the\n            Saving...   
5   How can I transfer money from\n            Jup...   
6                                  Is Jupiter a Bank?   
7     What is Jupiter’s Whatsapp\n            number?   
8   How can I apply for a credit\n            card...   
9   How can I get a Federal Bank\n            pass...   
10  How can I set a PIN for my\n            Debit ...   

                                               answer  
0   Jupiter is the 1-app for everything money that...  
1   Jupiter is itself not a bank and doesn’t hold ...  
2   To open a free Savings or Salary Bank Account ...  
3   You can order a new physical Debit Card by tap...  
4   To deposit ca

#  Clean and preprocess scraped FAQ data 

In [6]:
import pandas as pd
import re

# 1. Load raw data
raw_df = pd.read_csv("all_files/final_jupiter_faqs.csv")

# 2. Data Quality Assessment
missing_q = raw_df['question'].isnull().sum()
missing_a = raw_df['answer'].isnull().sum()
print(f"Missing questions: {missing_q}, Missing answers: {missing_a}")

# 3. Cleaning and Normalization
def clean(text):
    text = str(text)
    text = re.sub(r'\s+', ' ', text)          # normalize whitespace
    text = re.sub(r'[+\-]', '', text)         # remove stray + or -
    text = re.sub(r'<.*?>', '', text)         # remove HTML tags
    text = text.strip()
    return text

raw_df['question'] = raw_df['question'].apply(clean)
raw_df['answer']   = raw_df['answer'].apply(clean)

# 4. Deduplication
dedup_df = raw_df.drop_duplicates(subset='question').reset_index(drop=True)

# 5. Remove empty or very short Q&A pairs
final_df = dedup_df[
    (dedup_df['question'].str.len() > 10) &
    (dedup_df['answer'].str.len() > 15)
].copy()

# 6. Final Save & Inspection
final_df.to_csv("all_files/jupiter_faqs_cleaned.csv", index=False)
print(f"Rows after full cleaning: {len(final_df)}")
display(final_df.head())


Missing questions: 0, Missing answers: 0
Rows after full cleaning: 11


Unnamed: 0,question,answer
0,What is Jupiter Money?,Jupiter is the 1app for everything money that ...
1,Is 'Jupiter Bank' approved by the RBI?,Jupiter is itself not a bank and doesn’t hold ...
2,How can I open a Savings account?,To open a free Savings or Salary Bank Account ...
3,How can I get a Debit card?,You can order a new physical Debit Card by tap...
4,How to deposit cash in the Savings account?,To deposit cash into your Savings or Salary Ba...


#  Generate sentence embeddings for each FAQ question 

In [7]:


import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

# Load your cleaned FAQ CSV
df = pd.read_csv("all_files/jupiter_faqs_cleaned.csv")

# Initialize the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Fast, accurate, works well for FAQ

# Create embeddings for each question
questions = df['question'].tolist()
embeddings = model.encode(questions, show_progress_bar=True)

# Optionally save embeddings to file for later use
np.save("all_files/faq_question_embeddings.npy", embeddings)

print(f"Created {len(embeddings)} embeddings.")


Batches: 100%|██████████| 1/1 [00:00<00:00, 33.56it/s]

Created 11 embeddings.





#  Define function to retrieve FAQ using semantic search (NumPy) 

In [2]:


import numpy as np

# Load embeddings (if not already in memory)
faq_embeddings = np.load("all_files/faq_question_embeddings.npy")

# Your original DataFrame
# df = pd.read_csv("jupiter_faqs_cleaned.csv")  # Already loaded previously

def retrieve_faq(user_query, model, faq_embeddings, questions, threshold=0.60, top_k=1):
    # Embed user query
    user_emb = model.encode([user_query])[0]
    # Compute cosine similarities
    sims = np.dot(faq_embeddings, user_emb) / (np.linalg.norm(faq_embeddings, axis=1) * np.linalg.norm(user_emb))
    # Get top-k most similar indices
    top_indices = np.argsort(sims)[-top_k:][::-1]
    results = []
    for idx in top_indices:
        results.append({
            "question": questions[idx],
            "answer": df.iloc[idx]["answer"],
            "similarity": sims[idx]
        })
    # Check if the best similarity is above threshold
    if results[0]["similarity"] < threshold:
        return None, None, results[0]["similarity"]
    return results[0]["question"], results[0]["answer"], results[0]["similarity"]

# Example usage:
user_query = "How do I order a credit card?"
retrieved_q, retrieved_a, similarity = retrieve_faq(
    user_query, model, faq_embeddings, questions, threshold=0.60
)

print("Retrieved Question:", retrieved_q)
print("Retrieved Answer:", retrieved_a)
print("Similarity Score:", similarity)


Retrieved Question: How can I get a Debit card?
Retrieved Answer: You can order a new physical Debit Card by tapping on the ‘Card’ tab on the Jupiter app. While you can get a virtual Debit Card for free, you will be charged a onetime fee when ordering a physical Debit Card
Similarity Score: 0.6820835


#  Build a FAISS index for scalable semantic search 

In [3]:
import faiss
import numpy as np

faq_embeddings = np.load("all_files/faq_question_embeddings.npy").astype('float32')
faq_embeddings = faq_embeddings / np.linalg.norm(faq_embeddings, axis=1, keepdims=True)

index = faiss.IndexFlatIP(faq_embeddings.shape[1])
index.add(faq_embeddings)

# Save index to file
faiss.write_index(index, "faq_faiss.index")
print("FAISS index saved to faq_faiss.index")


FAISS index saved to faq_faiss.index


# FAISS-based retrieval function for FAQs 

In [24]:


def retrieve_faq_faiss(user_query, model, index, df, faq_embeddings, threshold=0.65, top_k=1):
    user_emb = model.encode([user_query]).astype('float32')
    user_emb = user_emb / np.linalg.norm(user_emb)
    D, I = index.search(user_emb.reshape(1, -1), top_k)  # D = similarities, I = indices
    similarity = float(D[0][0])
    idx = int(I[0][0])
    if similarity < threshold:
        return None, None, similarity
    return df.iloc[idx]["question"], df.iloc[idx]["answer"], similarity

# Example usage
user_query = "Is Jupiter bank?"
retrieved_q, retrieved_a, similarity = retrieve_faq_faiss(
    user_query, model, index, df, faq_embeddings, threshold=0.65
)

print("Similarity:", similarity)
print("Question:", retrieved_q)
print("Answer:", retrieved_a)


Similarity: 0.9623639583587646
Question: Is Jupiter a Bank?
Answer: Jupiter is itself not a bank and doesn’t hold or claim to hold a banking license. The Savings Account and VISA Debit Card are provided by Federal Bank and follows all security standards as applicable. All funds in your account are insured up to the approved limit by DICGC. Your money is always safe with Federal Bank Jupiter enables you to make smart money decisions every day using easy, intuitive, and personalized money management tools that help eliminate the stress, fear and confusion that comes with managing money. You can start using Jupiter by doing any of the following… Opening a free no minimum balance Savings Bank Account on Jupiter  powered by Federal Bank  in 3 minutes Get 5x Cashback on spends* Or, opening a free Salary Bank Account on Jupiter  powered by Federal Bank Withdraw your salary any day with OnDemand Salary Get free health insurance up to Rs. 2,00,000 Get 5x Cashback on spends*


# Retrieve FAQ and use LLM to rephrase the answer 

In [8]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import openai

# Load your cleaned FAQ data and embeddings
df = pd.read_csv("all_files/jupiter_faqs_cleaned.csv")
faq_embeddings = np.load("all_files/faq_question_embeddings.npy")
questions = df['question'].tolist()

# Load the same embedding model as before
model = SentenceTransformer('all-MiniLM-L6-v2')

def retrieve_faq(user_query, model, faq_embeddings, questions, threshold=0.65, top_k=1):
    user_emb = model.encode([user_query])[0]
    sims = np.dot(faq_embeddings, user_emb) / (np.linalg.norm(faq_embeddings, axis=1) * np.linalg.norm(user_emb))
    top_indices = np.argsort(sims)[-top_k:][::-1]
    results = []
    for idx in top_indices:
        results.append({
            "question": questions[idx],
            "answer": df.iloc[idx]["answer"],
            "similarity": sims[idx]
        })
    if results[0]["similarity"] < threshold:
        return None, None, results[0]["similarity"]
    return results[0]["question"], results[0]["answer"], results[0]["similarity"]

def openai_llm_answer(user_query, retrieved_q, retrieved_a, api_key, model_name="gpt-3.5-turbo"):
    if retrieved_q is None or retrieved_a is None:
        return "Sorry, I don't know the answer to that. Please contact Jupiter support!"
    prompt = (
        "You are a helpful, friendly FAQ assistant for Jupiter Money (the Indian fintech app, not the planet). "
        "Rephrase the FAQ answer below in a user-friendly, conversational tone, but make sure you keep all the details and information present in the original answer. "
        "Do not omit any facts or steps. "
        "If the answer is already clear, just return it as is. "
        "If there is no relevant answer, politely say you don't know.\n\n"
        f"User asked: {user_query}\n"
        f"FAQ question: {retrieved_q}\n"
        f"FAQ answer: {retrieved_a}\n"
        "Final user-facing answer:"
    )
    client = openai.OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=300,
        temperature=0.3
    )
    return response.choices[0].message.content.strip()

# Example test
openai_api_key = input("Enter your OpenAI API key: ")

user_query = "How can I get debit card?"

retrieved_q, retrieved_a, similarity = retrieve_faq(
    user_query, model, faq_embeddings, questions, threshold=0.65
)

print("Similarity Score:", similarity)
print("Retrieved Question:", retrieved_q)
print("Retrieved Answer:", retrieved_a)
print("\nLLM Bot's Answer:")
print(openai_llm_answer(user_query, retrieved_q, retrieved_a, api_key=openai_api_key))


Similarity Score: 0.9864107
Retrieved Question: How can I get a Debit card?
Retrieved Answer: You can order a new physical Debit Card by tapping on the ‘Card’ tab on the Jupiter app. While you can get a virtual Debit Card for free, you will be charged a onetime fee when ordering a physical Debit Card

LLM Bot's Answer:
Sure thing! To get a Debit Card, simply open the Jupiter app and tap on the ‘Card’ tab. You can order a virtual Debit Card for free, but if you want a physical Debit Card, there will be a one-time fee.


# Compare retrieval vs LLM-based answers on accuracy and latency without Multilingual support

In [None]:
import numpy as np
import pandas as pd
import time
from sentence_transformers import SentenceTransformer
import openai

# ---- CONFIG ----
openai_api_key =  input("Enter your OpenAI API key: ") # <-- Set your key here for testing
openai_llm_model = "gpt-3.5-turbo"

# Load data and embeddings
df = pd.read_csv("all_files/jupiter_faqs_cleaned.csv")
faq_embeddings = np.load("all_files/faq_question_embeddings.npy")
questions = df['question'].tolist()
model = SentenceTransformer('all-MiniLM-L6-v2')

def retrieval_only_answer(user_query, model, faq_embeddings, questions, df, threshold=0.65):
    user_emb = model.encode([user_query])[0]
    sims = np.dot(faq_embeddings, user_emb) / (np.linalg.norm(faq_embeddings, axis=1) * np.linalg.norm(user_emb))
    idx = np.argmax(sims)
    similarity = sims[idx]
    if similarity < threshold:
        return None, None, similarity
    return questions[idx], df.iloc[idx]['answer'], similarity

def openai_llm_answer(user_query, retrieved_q, retrieved_a, api_key, model_name="gpt-3.5-turbo"):
    if retrieved_q is None or retrieved_a is None:
        return "Sorry, I don't know the answer to that. Please contact Jupiter support!"
    prompt = (
        "You are a helpful, friendly FAQ assistant for Jupiter Money (the Indian fintech app, not the planet). "
        "Rephrase the FAQ answer below in a user-friendly, conversational tone, but make sure you keep all the details and information present in the original answer. "
        "Do not omit any facts or steps. "
        "If the answer is already clear, just return it as is. "
        "If there is no relevant answer, politely say you don't know.\n\n"
        f"User asked: {user_query}\n"
        f"FAQ question: {retrieved_q}\n"
        f"FAQ answer: {retrieved_a}\n"
        "Final user-facing answer:"
    )
    client = openai.OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=300,
        temperature=0.3
    )
    return response.choices[0].message.content.strip()

# --- List of diverse test queries ---
test_queries = [
    "How do I order a Jupiter debit card?",
    "What is the KYC process for a new account?",
    "Can I get cashback on purchases?",
    "What are my daily transfer limits?",
    "How to set a PIN for my card?",
    "Is Jupiter an RBI-approved bank?",
    "How can I deposit cash into my account?",
    "How do I block my debit card if lost?",
    "How to contact Jupiter support?",
    "What's the process to update my address?",
]

# --- Evaluate both approaches and compare ---
results = []
for user_query in test_queries:
    # Retrieval-only baseline
    t0 = time.time()
    ret_q, ret_a, ret_sim = retrieval_only_answer(
        user_query, model, faq_embeddings, questions, df, threshold=0.65
    )
    t1 = time.time()
    retrieval_time = (t1 - t0) * 1000  # ms

    # LLM-based answer (OpenAI)
    t2 = time.time()
    llm_answer = openai_llm_answer(user_query, ret_q, ret_a, api_key=openai_api_key, model_name=openai_llm_model)
    t3 = time.time()
    llm_time = (t3 - t2) * 1000  # ms

    results.append({
        "user_query": user_query,
        "retrieval_question": ret_q,
        "retrieval_answer": ret_a,
        "retrieval_similarity": ret_sim,
        "retrieval_time_ms": int(retrieval_time),
        "llm_answer": llm_answer,
        "llm_time_ms": int(llm_time),
    })

comparison_df = pd.DataFrame(results)
print(comparison_df[["user_query", "retrieval_answer", "retrieval_similarity", "retrieval_time_ms", "llm_answer", "llm_time_ms"]])
comparison_df.to_csv("faq_comparison_results.csv", index=False)


                                   user_query  \
0        How do I order a Jupiter debit card?   
1  What is the KYC process for a new account?   
2            Can I get cashback on purchases?   
3          What are my daily transfer limits?   
4               How to set a PIN for my card?   
5            Is Jupiter an RBI-approved bank?   
6     How can I deposit cash into my account?   
7       How do I block my debit card if lost?   
8             How to contact Jupiter support?   
9    What's the process to update my address?   

                                    retrieval_answer  retrieval_similarity  \
0  You can order a new physical Debit Card by tap...              0.731035   
1                                               None              0.385424   
2                                               None              0.359884   
3                                               None              0.358048   
4  You can set/ reset your Debit Card PIN by tapp...              0.88

## Retrieval vs LLM-Based FAQ Bot: Accuracy and Latency

- **Retrieval-only** (semantic search) provides accurate answers nearly instantly.
- **LLM-based** (OpenAI) gives friendlier, more conversational answers, with slightly higher latency due to the API call.


See the `faq_comparison_results.csv` for detailed results.
