In [None]:
#@title 1. Installing Required Libraries
# Install necessary libraries for web scraping, language model interaction, and vector search.

!pip install googletrans==4.0.0rc1 # Install specific compatible version
!pip install requests beautifulsoup4 pandas
!pip install accelerate # Add accelerate for faster loading
!pip install transformers torch
!pip install -U sentence-transformers
!pip install faiss-cpu



#@title 2. Scrape FAQs from Jupiter Money Website
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def scrape_jupiter_faqs():
    """
    Scrapes the FAQ section from the Jupiter Money contact page.

    The page is expected to contain a ``<ul data-controller="faq">`` whose
    ``<li>`` items each hold the question in ``<button><span>`` and the
    answer in a ``<p>``. Items that do not match that shape are skipped.

    Returns:
        list: A list of dictionaries, where each dictionary represents a category
              and contains the category title and a list of Q&A pairs.
              Returns an empty list if scraping fails.
    """
    print("Starting FAQ scraping...")
    URL = "https://jupiter.money/contact/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        # Timeouts are RequestException subclasses, so they are handled below.
        response = requests.get(URL, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    faq_data = []
    faq_list = soup.find('ul', attrs={'data-controller': 'faq'})

    if not faq_list:
        print("No FAQ list found with 'data-controller=faq'. The website structure might have changed.")
        return []

    # The page exposes a single flat list, so everything goes under one category.
    category_title = "Frequently Asked Questions"
    qna_pairs = []

    faq_items = faq_list.find_all('li')

    print(f"Found {len(faq_items)} FAQ items.")

    for item in faq_items:
        # An <li> without a <button> must be skipped, not crash the whole scrape.
        button_tag = item.find('button')
        question_tag = button_tag.find('span') if button_tag else None
        answer_tag = item.find('p')

        if not (question_tag and answer_tag):
            continue

        # Collapse internal runs of whitespace left over from the HTML layout.
        question = re.sub(r'\s+', ' ', question_tag.get_text(strip=True)).strip()
        answer = re.sub(r'\s+', ' ', answer_tag.get_text(strip=True)).strip()

        if question and answer:
            qna_pairs.append({"question": question, "answer": answer})

    if qna_pairs:
        faq_data.append({
            "category": category_title,
            "qna": qna_pairs
        })
        print(f"  - Scraped {len(qna_pairs)} Q&As from '{category_title}'")

    print("Scraping finished successfully.")
    return faq_data

# Run the scraper once; everything downstream works from `df_faqs`.
scraped_data = scrape_jupiter_faqs()

if not scraped_data:
    print("\nCould not scrape any data. Please check the URL and website structure.")
    # An empty frame lets the later cells test `.empty` instead of failing on a missing name.
    df_faqs = pd.DataFrame()
else:
    # Flatten the category -> Q&A nesting into one row per question.
    flat_data = [
        {
            "category": category_info['category'],
            "question": qna['question'],
            "answer": qna['answer'],
        }
        for category_info in scraped_data
        for qna in category_info['qna']
    ]

    df_faqs = pd.DataFrame(flat_data)
    print("\nFAQ data converted to DataFrame (first 5 rows):")
    print(df_faqs.head())
    print(f"\nTotal FAQs scraped: {len(df_faqs)}")

#@title 3. Preprocess and Clean the Data
def preprocess_and_clean(df):
    """
    Cleans and preprocesses the FAQ DataFrame.

    Strips HTML markup and collapses whitespace in the 'question' and
    'answer' columns, then drops rows whose question duplicates an earlier
    one. The frame is modified in place and the same object is returned;
    an empty frame is returned untouched.
    """
    if df.empty:
        print("DataFrame is empty. Skipping preprocessing.")
        return df

    def _tidy(text):
        # Remove any HTML first, then normalise whitespace in what is left.
        plain = BeautifulSoup(text, "html.parser").get_text()
        return re.sub(r'\s+', ' ', plain).strip()

    print("\nStarting preprocessing and cleaning...")
    for column in ('question', 'answer'):
        df[column] = df[column].apply(_tidy)

    rows_before = len(df)
    df.drop_duplicates(subset=['question'], inplace=True, keep='first')
    rows_after = len(df)

    print(f"Removed {rows_before - rows_after} duplicate questions.")
    print("Preprocessing finished.")
    return df

if df_faqs.empty:
    # Nothing was scraped; keep `df_cleaned` defined for the cells below.
    df_cleaned = pd.DataFrame()
else:
    # Clean a copy so the raw scrape in `df_faqs` stays available.
    df_cleaned = preprocess_and_clean(df_faqs.copy())
    print("\nCleaned DataFrame sample:")
    print(df_cleaned.head())
    df_cleaned.to_csv("jupiter_faqs_cleaned.csv", index=False)
    print("\nCleaned data saved to 'jupiter_faqs_cleaned.csv'")


#@title 4. Build Semantic Search Index with FAISS
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os

def build_faiss_index(df):
    """
    Builds a FAISS index for semantic search using question embeddings.

    Every value in ``df['question']`` is embedded with the
    'all-MiniLM-L6-v2' SentenceTransformer and added to a flat L2 index in
    row order.

    Returns:
        tuple: ``(index, model)``, or ``(None, None)`` when the frame is
               empty or has no 'question' column.
    """
    if df.empty or 'question' not in df.columns:
        print("DataFrame is empty or missing 'question' column. Cannot build index.")
        return None, None

    print("\nBuilding FAISS index for semantic search...")
    model_name = 'all-MiniLM-L6-v2'

    # Only used to announce a download; loading works the same either way.
    cache_root = os.path.expanduser('~/.cache/torch/sentence_transformers')
    model_path = os.path.join(cache_root, model_name.replace('/', '_'))
    if not os.path.exists(model_path):
        print(f"Downloading SentenceTransformer model: {model_name}...")

    model = SentenceTransformer(model_name)

    print("Generating embeddings for all questions...")
    question_vectors = model.encode(
        df['question'].tolist(),
        convert_to_tensor=True,
        show_progress_bar=True,
    )

    # The index is fed a float32 numpy matrix.
    vectors_np = question_vectors.cpu().numpy().astype('float32')

    index = faiss.IndexFlatL2(vectors_np.shape[1])
    index.add(vectors_np)

    print(f"FAISS index built successfully with {index.ntotal} vectors.")
    return index, model

# Build the index only when there is cleaned data; otherwise leave both handles as None.
faiss_index, embedding_model = (
    (None, None) if df_cleaned.empty else build_faiss_index(df_cleaned)
)

def search_faqs(query_in_english, index, model, df, top_k=3):
    """
    Searches for relevant FAQs using the FAISS index.

    Args:
        query_in_english: Query text to embed and look up.
        index: Index exposing ``search(vectors, k)``; its ids are used as
               row positions into ``df``.
        model: Embedding model exposing ``encode``.
        df: DataFrame with 'question' and 'answer' columns.
        top_k: Maximum number of matches to return.

    Returns:
        list: Up to ``top_k`` dicts with 'question', 'answer' and 'distance',
              in the order the index returned them. Empty when the index or
              model is missing.
    """
    if index is None or model is None:
        print("FAISS index or embedding model not initialized. Cannot perform search.")
        return []

    query_embedding = model.encode([query_in_english], convert_to_tensor=True).cpu().numpy().astype('float32')

    distances, indices = index.search(query_embedding, top_k)

    results = []
    for row_pos, distance in zip(indices[0], distances[0]):
        # FAISS pads the result with -1 when fewer than top_k vectors exist;
        # df.iloc[-1] would silently return the last row instead.
        if row_pos < 0:
            continue
        row = df.iloc[int(row_pos)]
        results.append({
            "question": row['question'],
            "answer": row['answer'],
            "distance": distance
        })
    return results

#@title 5. Setup the LLM (microsoft/Phi-4-mini-instruct) for Response Generation
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from googletrans import Translator

# Single module-level googletrans client. FAQBot's methods look this name up
# at call time for language detection and translation.
google_translator = Translator()

def setup_llm_pipeline():
    """
    Sets up the LLM pipeline for generating answers.
    Uses 'microsoft/Phi-4-mini-instruct'.

    Returns:
        A transformers "text-generation" pipeline wrapping the loaded model
        and tokenizer, or None when loading either of them raises.
    """
    print("Setting up the LLM...")
    model_name = "microsoft/Phi-4-mini-instruct"

    try:
        llm = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype="auto",
            trust_remote_code=True,
        )
        llm_tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Fall back to the EOS token when the tokenizer defines no pad token.
        if llm_tokenizer.pad_token is None:
            llm_tokenizer.pad_token = llm_tokenizer.eos_token
    except Exception as e:
        # Any load failure is reported and signalled to the caller with None.
        print(f"Error loading model '{model_name}': {e}")
        print("Please ensure the model name is correct, you have sufficient resources (RAM/GPU),")
        print("and you have accepted its license on the Hugging Face Hub if it's a gated model.")
        return None

    text_generator = pipeline("text-generation", model=llm, tokenizer=llm_tokenizer)
    print("LLM pipeline ready.")
    return text_generator

# Initialize the LLM once at module level. setup_llm_pipeline() returns None
# when loading fails, so later cells must check for None before using it.
llm_pipeline = setup_llm_pipeline()

#@title 6. Build the RAG-based FAQ Bot with Suggestion Capabilities
import time # Keep for internal profiling, remove if truly not needed anywhere
import math

class FAQBot:
    """
    A class that encapsulates the entire FAQ Bot functionality, with googletrans handling
    auto-detection, translation to English for search, and translation back to user's language.
    Includes functionality for suggesting related queries based on user history and directly from FAQs.

    Conversation state kept between calls to ``ask``:
      * ``query_history`` - one dict per answered/suggested query with keys
        'query', 'embedding', 'language' and 'english_search_query'.
      * ``pending_direct_faq_suggestion`` - None, or a dict with keys
        'original_query', 'suggested_faq_q_a' and 'detected_language' while the
        bot is waiting for a yes/no reply to a "Did you mean?" prompt.
    """
    def __init__(self, df, index, model, pipe):
        """
        Args:
            df: DataFrame with 'question' and 'answer' columns; row positions
                must match the ids stored in ``index``.
            index: Vector index exposing ``search(vectors, k)``.
            model: Embedding model exposing ``encode``.
            pipe: Text-generation pipeline used for the final answer.

        Raises:
            ValueError: If any component is missing or ``df`` is empty.
        """
        self.df = df
        self.index = index
        self.embedding_model = model
        self.llm_pipe = pipe

        if self.df.empty or self.index is None or self.embedding_model is None:
            raise ValueError("Bot components are not initialized. Please run previous cells successfully.")
        if self.llm_pipe is None:
            raise ValueError("LLM pipeline not initialized. Cannot run bot without it.")

        # List of {'query': str (original), 'embedding': np.array (English),
        #          'language': str (detected), 'english_search_query': str}
        self.query_history = []
        # None, or {'original_query': str, 'suggested_faq_q_a': dict, 'detected_language': str}
        self.pending_direct_faq_suggestion = None
        self.direct_faq_threshold = 0.85 # Cosine similarity threshold for direct FAQ suggestion

    def _auto_detect_and_translate_to_english(self, query):
        """
        Uses googletrans to detect the language of the query and translate it to English.
        Returns a tuple: (detected_language_code, translated_english_query)

        On any detection/translation error the query is treated as English
        and returned unchanged.
        """
        try:
            detected_lang_code = google_translator.detect(query).lang
            if detected_lang_code == 'en':
                return 'en', query
            translated_text = google_translator.translate(query, dest='en').text
            return detected_lang_code, translated_text
        except Exception as e:
            print(f"Error during language detection or translation to English: {e}")
            return 'en', query # Fallback

    def _translate_english_response_back_and_combine(self, english_response, target_language_code):
        """
        Translates an English response back to the target language if not English,
        and combines both English and the translated version into a single string.
        If translation fails, an apology followed by the English text is returned.
        """
        if target_language_code == 'en':
            return english_response

        try:
            translated_text = google_translator.translate(english_response, dest=target_language_code).text
        except Exception as e:
            print(f"Error translating response back to {target_language_code}: {e}")
            return f"Sorry, I am having trouble translating my answer to your language right now, but here is the English response:\n\n{english_response}"

        return (
            f"English: {english_response}\n\n"
            f"Your language ({target_language_code}): {translated_text}"
        )

    def _generate_english_llm_response(self, original_query_for_context, context, llm_pipe):
        """
        Generates a conversational answer in English using the LLM with RAG.
        This function explicitly instructs the LLM to generate in English.

        Args:
            original_query_for_context: The user's question as inserted into the prompt.
            context: List of dicts with 'question' and 'answer' keys.
            llm_pipe: Text-generation pipeline; must expose ``tokenizer``.

        Returns:
            str: The generated answer, or a fixed fallback message when the
                 pipeline is missing, the context is empty, or generation fails.
        """
        if not llm_pipe:
            return "Sorry, my brain isn't working right now. Please try again later."
        if not context:
            return "I couldn't find any specific information about that in the Jupiter Help Centre. Can you try asking in a different way?"

        context_str = "\n\n".join([f"Question: {c['question']}\nAnswer: {c['answer']}" for c in context])

        prompt = f"""
You are a friendly and helpful customer support assistant for Jupiter, a digital banking app.
Your job is to answer the user's question in a simple, conversational way, based ONLY on the provided context from the Jupiter Help Centre.
Do not make up information. If the context does not contain the answer, clearly state that you couldn't find the information.
It is crucial that your response is in **English**.

Here is the relevant information I found from the Jupiter Help Centre:
--- CONTEXT ---
{context_str}
--- END CONTEXT ---

Now, please answer the following user's question.

User's Question: "{original_query_for_context}"

Your Answer (in conversational and friendly English):
"""

        generation_args = {
            "max_new_tokens": 250,
            "return_full_text": False,
            "temperature": 0.5,
            "do_sample": True,
            "top_p": 0.9,
            "eos_token_id": llm_pipe.tokenizer.eos_token_id,
        }

        try:
            output = llm_pipe(prompt, **generation_args)
            response = output[0]['generated_text'].strip()

            # Strip prompt labels the model sometimes echoes back.
            response = re.sub(r'Your Answer \(in conversational and friendly English\):', '', response, flags=re.IGNORECASE).strip()
            response = re.sub(r'English:', '', response, flags=re.IGNORECASE).strip()

        except Exception as e:
            print(f"Error during LLM generation: {e}")
            response = "I am having trouble generating a response right now. Please try again after some time."

        return response

    def _encode(self, text):
        """Embeds one string and returns a float32 numpy array of shape (1, dim)."""
        return self.embedding_model.encode([text], convert_to_tensor=True).cpu().numpy().astype('float32')

    @staticmethod
    def _cosine_similarity(vec_a, vec_b):
        """Returns the cosine similarity of two vectors, or 0.0 if either has zero norm."""
        a = np.asarray(vec_a, dtype='float32').ravel()
        b = np.asarray(vec_b, dtype='float32').ravel()
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0:
            return 0.0
        return float(np.dot(a, b) / denom)

    def _search_faq_for_direct_suggestion(self, query_in_english):
        """
        Searches the FAQ questions directly for a very close semantic match.

        Returns:
            tuple: ``({'question': ..., 'answer': ...}, cosine_similarity)`` when the
                   nearest FAQ question reaches ``self.direct_faq_threshold``,
                   otherwise ``(None, 0.0)``. Always a 2-tuple.
        """
        if self.index is None or self.embedding_model is None:
            print("FAISS index or embedding model not initialized. Cannot perform direct FAQ search.")
            # Must be a 2-tuple: callers unpack exactly two values.
            return None, 0.0

        query_embedding = self._encode(query_in_english)
        _, indices = self.index.search(query_embedding, 1)

        matched_idx = int(indices[0][0])
        if matched_idx == -1:
            return None, 0.0

        matched_row = self.df.iloc[matched_idx]
        matched_question = matched_row['question']
        matched_answer = matched_row['answer']

        # The index reports L2 distance; the threshold is defined on cosine
        # similarity, so the matched question is re-embedded and compared directly.
        cosine_sim = self._cosine_similarity(query_embedding, self._encode(matched_question))

        if cosine_sim >= self.direct_faq_threshold:
            return {"question": matched_question, "answer": matched_answer}, cosine_sim
        return None, 0.0

    def suggest_related_queries(self, current_query_english_embedding, num_suggestions=3, min_similarity=0.75):
        """
        Suggests related queries based on historical queries that are semantically similar
        to the current query. Excludes the current query itself and deduplicates suggestions.

        The last entry of ``self.query_history`` is taken to be the current
        query; all earlier entries are candidates.

        Returns:
            list: Up to ``num_suggestions`` original query strings with cosine
                  similarity >= ``min_similarity``, most similar first.
        """
        if len(self.query_history) < 2:
            return []

        past_entries = self.query_history[:-1]
        current_text = self.query_history[-1]['query'].lower().strip()

        current_vec = np.asarray(current_query_english_embedding, dtype='float32').reshape(-1)
        history_matrix = np.array([entry['embedding'] for entry in past_entries]).astype('float32')

        # Cosine similarity against every past query in one vectorised pass;
        # rows with a zero norm keep similarity 0.0.
        denominators = np.linalg.norm(history_matrix, axis=1) * np.linalg.norm(current_vec)
        similarities = np.zeros(len(past_entries), dtype='float32')
        usable = denominators > 0
        similarities[usable] = (history_matrix[usable] @ current_vec) / denominators[usable]

        suggestions = []
        seen = set()
        for idx in np.argsort(-similarities, kind='stable'):
            if similarities[idx] < min_similarity:
                break  # sorted descending: nothing further can qualify
            candidate = past_entries[idx]['query']
            key = candidate.lower().strip()
            # Avoid suggesting the query the user just asked, or repeats of it.
            if key == current_text or key in seen:
                continue
            seen.add(key)
            suggestions.append(candidate)
            if len(suggestions) >= num_suggestions:
                break

        return suggestions

    def _is_affirmative(self, reply, language_code):
        """
        Returns True when ``reply`` is "yes" in English or in ``language_code``.
        A failed translation of "yes" leaves only the English comparison.
        """
        normalized = reply.lower().strip()
        if normalized == 'yes':
            return True
        if language_code == 'en':
            return False
        try:
            translated_yes = google_translator.translate("yes", dest=language_code).text.lower().strip()
        except Exception as e:
            print(f"Warning: Could not translate 'yes'/'no' for comparison due to {e}. Using English.")
            return False
        return normalized == translated_yes

    def _format_direct_faq_suggestion(self, matched_faq_q_a, language_code):
        """Builds the "Did you mean?" prompt, localised to ``language_code`` when possible."""
        suggested_q_translated = matched_faq_q_a['question']
        yes_option = "yes"
        no_option = "no"

        if language_code != 'en':
            try:
                translated_q = google_translator.translate(matched_faq_q_a['question'], dest=language_code).text
                translated_yes = google_translator.translate("yes", dest=language_code).text
                translated_no = google_translator.translate("no", dest=language_code).text
            except Exception as e:
                print(f"Warning: Could not translate suggestion to {language_code} due to {e}. Using English for suggestion.")
            else:
                # Only switch once all three pieces translated, so the prompt
                # is never half-localised.
                suggested_q_translated = translated_q
                yes_option = translated_yes
                no_option = translated_no

        return (
            f"Did you mean: **'{suggested_q_translated}'**?\n"
            f"(English: '{matched_faq_q_a['question']}')\n\n"
            f"Please type '{yes_option}' for Yes, or '{no_option}' for No."
        )

    def ask(self, user_query):
        """
        Handles a user query from end to end, with auto-detection, translation for search,
        and provides direct FAQ answers or RAG-generated answers.
        Manages the direct FAQ suggestion flow.

        Flow:
          1. If a "Did you mean?" suggestion is pending, ``user_query`` is the
             reply to it. "yes" returns the suggested FAQ answer. Anything
             else is treated as a rejection and the *previous* question is
             answered through RAG, without offering a suggestion again.
          2. Otherwise the query is logged and, if an FAQ question is a close
             enough match, a "Did you mean?" prompt is returned.
          3. Otherwise the answer is generated from the top retrieved FAQs.

        Returns:
            str: The final bot response.
        """
        query_to_answer = user_query
        resuming_after_rejection = False

        # --- Handle pending direct FAQ suggestion ---
        pending = self.pending_direct_faq_suggestion
        if pending:
            # Clear the pending state: this call consumes the user's reply.
            self.pending_direct_faq_suggestion = None
            prev_detected_language = pending['detected_language']

            if self._is_affirmative(user_query, prev_detected_language):
                return self._translate_english_response_back_and_combine(
                    pending['suggested_faq_q_a']['answer'], prev_detected_language
                )

            # Rejected: answer the question that triggered the suggestion, not
            # the literal reply, and do not re-run the direct-match check,
            # which would offer the same suggestion again.
            query_to_answer = pending['original_query']
            resuming_after_rejection = True

        detected_language, english_query = self._auto_detect_and_translate_to_english(query_to_answer)
        query_embedding_np = self._encode(english_query)[0]

        if not resuming_after_rejection:
            # Log the query and its English embedding for suggestion capabilities.
            # A rejected query was already logged when the suggestion was issued.
            self.query_history.append({
                'query': query_to_answer,
                'embedding': query_embedding_np,
                'language': detected_language,
                'english_search_query': english_query
            })

            # --- Try to find a direct FAQ match for suggestion ---
            matched_faq_q_a, similarity = self._search_faq_for_direct_suggestion(english_query)
            if matched_faq_q_a and similarity >= self.direct_faq_threshold:
                self.pending_direct_faq_suggestion = {
                    'original_query': query_to_answer,
                    'suggested_faq_q_a': matched_faq_q_a,
                    'detected_language': detected_language
                }
                return self._format_direct_faq_suggestion(matched_faq_q_a, detected_language)

        # --- If no direct FAQ suggestion, or if handling a rejected suggestion, proceed with RAG ---
        retrieved_context = search_faqs(english_query, self.index, self.embedding_model, self.df, top_k=3)

        english_llm_response = self._generate_english_llm_response(query_to_answer, retrieved_context, self.llm_pipe)
        final_answer_str = self._translate_english_response_back_and_combine(english_llm_response, detected_language)

        # --- Add historical suggestions (if any) ---
        suggestions = self.suggest_related_queries(query_embedding_np)
        if suggestions:
            final_answer_str += "\n\n💡 Perhaps you also want to know:\n" + "".join(
                f"- {i}. {s}\n" for i, s in enumerate(suggestions, start=1)
            )

        return final_answer_str


# Create the bot only when the data, index, embedding model and LLM all loaded.
jupiter_bot = None
if df_cleaned.empty or any(
    part is None for part in (faiss_index, embedding_model, llm_pipeline)
):
    print("\nBot could not be initialized due to errors in previous steps. Please check the logs above.")
else:
    try:
        jupiter_bot = FAQBot(df_cleaned, faiss_index, embedding_model, llm_pipeline)
    except ValueError as e:
        # FAQBot validates its own components and raises ValueError if one is missing.
        print(f"\nBot initialization failed: {e}")
    else:
        print("\nJupiter FAQ Bot is ready to answer your questions!")

#@title 7. Simple Command-Line Interface (CLI)

# Interactive loop: read a line, hand it to the bot, print the reply.
if jupiter_bot:
    print("\nType your question, or 'quit' to exit.")
    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C (or a closed stdin) ends the chat cleanly
            # instead of surfacing a traceback.
            print("\nExiting chat. Goodbye!")
            break

        # Blank lines carry no question; do not spend a translation/LLM call on them.
        if not user_input:
            continue

        if user_input.lower() == 'quit':
            print("Exiting chat. Goodbye!")
            break

        response = jupiter_bot.ask(user_input)
        print(f"Bot: {response}")
else:
    print("\nCannot start chat. Bot was not initialized successfully.")