In [2]:
!pip install transformers sentence-transformers pandas

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [2]:
import json

# Load data (change path if using Google Drive)
# The path is relative to the notebook's working directory.
with open('nust_bank_data (1).json', 'r') as f:
    bank_data = json.load(f)

# Print the first product and its first Q&A pair
# As accessed below, bank_data is keyed by product name and each value has a
# 'details' list whose items carry 'question' and 'answer'.
first_product = list(bank_data.keys())[0]
print("First Product:", first_product)
print("\nFirst Question:", bank_data[first_product]['details'][0]['question'])
print("\nFirst Answer:", bank_data[first_product]['details'][0]['answer'])

First Product: Little Champs Account

First Question: I would like to open an account with my son, do u have any product for kids?

First Answer: Yes our product is Little Champs Account. It is designed specifically for minors (individuals below the age of 18 years). A child requires the help of a parental/legal guardian to open this account and avail its facilities. Little Champs get a Debit Card and chequebook which is free the first time


In [3]:
import json
import pandas as pd

# Load JSON file
# NOTE(review): this re-reads the same file the previous cell already loaded.
with open('nust_bank_data (1).json', 'r') as f:
    bank_data = json.load(f)

# Convert to a pandas DataFrame
# Flatten the nested product -> details structure into one record per Q&A pair.
qa_list = []
for product, details in bank_data.items():
    for qa in details['details']:
        qa_list.append({
            'product': product,
            'question': qa['question'],
            'answer': qa['answer']
        })

# One row per Q&A pair with columns: product, question, answer.
df = pd.DataFrame(qa_list)
print(f"Total Q&A pairs: {len(df)}")
df.head(3)

Total Q&A pairs: 179


Unnamed: 0,product,question,answer
0,Little Champs Account,"I would like to open an account with my son, d...",Yes our product is Little Champs Account. It i...
1,Little Champs Account,What other Value added features does the Littl...,"Attractive returns on savings account, SMS ale..."
2,Little Champs Account,What is the account type of Little Champs Acco...,This account is offered both in current and sa...


In [4]:
import re

def clean_text(text):
    """Normalise whitespace, tighten punctuation spacing and lowercase ``text``.

    Runs of whitespace collapse to one space, leading/trailing whitespace is
    stripped, a space directly before ``,``, ``.`` or ``?`` is removed, and
    the result is lowercased.
    """
    normalized = re.sub(r'\s+', ' ', text).strip()
    # Same replacement order as a chained .replace() call: comma, period, question mark.
    for spaced, tight in ((' ,', ','), (' .', '.'), (' ?', '?')):
        normalized = normalized.replace(spaced, tight)
    return normalized.lower()

# Apply cleaning to questions and answers
# Adds two new columns; the raw 'question' / 'answer' columns are left untouched.
df['clean_question'] = df['question'].apply(clean_text)
df['clean_answer'] = df['answer'].apply(clean_text)

# Show before/after examples
print("Original Question:", df['question'][0])
print("Cleaned Question:", df['clean_question'][0])

Original Question: I would like to open an account with my son, do u have any product for kids?
Cleaned Question: i would like to open an account with my son, do u have any product for kids?


In [5]:
def advanced_clean(text):
    """Tighten currency/percent spacing and strip most punctuation from ``text``.

    Steps, applied in order:
      1. Remove whitespace between a currency marker (``Rs.``, ``PKR``, ``USD``)
         and the digits that follow it.
      2. Remove whitespace between digits and a following ``%``.
      3. Replace every character that is not a word character, whitespace,
         ``.``, ``,``, ``%`` or one of ``$€£¥₹`` with a space.
      4. Collapse whitespace runs and strip the ends.

    Case is preserved. No date handling is performed.
    """
    substitutions = (
        (r'(Rs\.|PKR|USD)\s*(\d+)', r'\1\2'),
        (r'(\d+)\s*%', r'\1%'),
        (r'[^\w\s.,%$€£¥₹]', ' '),
        (r'\s+', ' '),
    )
    result = text
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()

# NOTE(review): this overwrites the 'clean_answer' column created in the previous
# cell with clean_text; after this cell 'clean_question' is lowercased but
# 'clean_answer' keeps its original case.
df['clean_answer'] = df['answer'].apply(advanced_clean)
print("Protected Example:", df['clean_answer'][1])  # Check for preserved Rs.100/20%

Protected Example: Attractive returns on savings account, SMS alert service on digital transactions, I Net banking services, Free education insurance plan Rs.5,000 per month for 5 years on savings account Rs.10,000 per month for 5 years on current account in case of death of the guardian


In [6]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

def tokenize(text):
    """Return the tokens produced by ``nltk.word_tokenize`` for ``text``."""
    return nltk.word_tokenize(text)

# Tokenize the cleaned columns; each cell of the new columns holds a list of tokens.
df['tokenized_question'] = df['clean_question'].apply(tokenize)
df['tokenized_answer'] = df['clean_answer'].apply(tokenize)

print("Tokenized Example:", df['tokenized_answer'][0][:10])  # First 10 tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Tokenized Example: ['Yes', 'our', 'product', 'is', 'Little', 'Champs', 'Account', '.', 'It', 'is']


In [7]:
# Save to CSV
# NOTE(review): the tokenized_* columns hold Python lists; CSV presumably stores
# them as their string form, so they will not come back as lists from read_csv
# -- confirm that is acceptable.
df.to_csv('cleaned_bank_qa.csv', index=False)

# Save to JSON
# One dict per row, keyed by column name.
cleaned_data = df.to_dict('records')
with open('cleaned_bank_qa.json', 'w') as f:
    json.dump(cleaned_data, f)

print("Saved cleaned data!")

Saved cleaned data!


In [8]:
!pip install sentence-transformers pandas numpy



In [9]:
import pandas as pd

# Load the cleaned data (use the file you saved earlier)
df = pd.read_csv('cleaned_bank_qa.csv')  # or pd.read_json('cleaned_bank_qa.json')
print(f"Loaded {len(df)} Q&A pairs")
# NOTE(review): this is not the last expression of the cell, so nothing is
# rendered for it (the recorded output shows no table). Wrap it in display()
# or move it to its own cell if the preview is wanted.
df.head(2)  # Show first 2 rows
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2')
import numpy as np

# Convert questions to embeddings
# One vector per cleaned question, in DataFrame row order.
questions = df['clean_question'].tolist()
question_embeddings = model.encode(questions, show_progress_bar=True)

# Save embeddings (so you don't need to recompute)
np.save('question_embeddings.npy', question_embeddings)
# NOTE(review): the recorded output below shows (179, 384), but this cell loads
# 'all-mpnet-base-v2', which presumably yields 768-dim vectors; the output looks
# like it came from a run with a 384-dim model -- re-run and confirm.
print(f"Embeddings shape: {question_embeddings.shape}")  # Expected (num_questions, embedding_dim)
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_questions(query, top_k=3):
    """Print the ``top_k`` stored questions most similar to ``query``.

    The query is embedded with the notebook-level ``model`` and compared by
    cosine similarity against ``question_embeddings``; matching rows are read
    from ``df``. Nothing is returned -- results are printed.

    Args:
        query: Free-text user question.
        top_k: Number of matches to show. Values <= 0 show no matches.
    """
    # Embed the query
    query_embedding = model.encode([query])

    # Compute similarities
    similarities = cosine_similarity(query_embedding, question_embeddings)[0]

    # Best-first ordering, then truncate. Slicing after the reversal avoids the
    # `[-0:]` pitfall, where top_k=0 would select every row.
    top_indices = similarities.argsort()[::-1][:max(top_k, 0)]

    # Display results
    print(f"Query: '{query}'\nTop {top_k} similar questions:")
    for idx in top_indices:
        # argsort yields positions, so look rows up positionally rather than by label.
        row = df.iloc[idx]
        print(f"- {row['clean_question']} (similarity: {similarities[idx]:.2f})")
        print(f"  Product: {row['product']}\n")

# Test with a sample question
find_similar_questions("How to open a kids account?")
# Save the DataFrame
# NOTE(review): df has no embedding column at this point; the vectors live in
# `question_embeddings` (saved separately via np.save above), so this pickle
# contains only the tabular data.
df.to_pickle('bank_qa_with_embeddings.pkl')

Loaded 179 Q&A pairs


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Embeddings shape: (179, 384)
Query: 'How to open a kids account?'
Top 3 similar questions:
- i would like to open an account with my son, do u have any product for kids? (similarity: 0.73)
  Product: Little Champs Account

- who can open a roshan digital account? (similarity: 0.57)
  Product: RDA Digital Customer Onboarding

- i would like to inquire about opening a current account for individuals with your bank. please tell me what options i have? (similarity: 0.56)
  Product: Value Plus Current Account (Individual) (VPCA)



In [10]:
!pip install transformers torch sentence-transformers



In [11]:
import torch
from transformers import pipeline

# Load T5 for question-answering
# Text-to-text generation pipeline used by generate_answer to phrase the reply.
qa_model = pipeline(
    "text2text-generation",
    model="google-t5/t5-small",
    device=0 if torch.cuda.is_available() else -1  # Use GPU if available
)

print("Model loaded!")
def generate_answer(user_query):
    """Generate a reply to ``user_query`` grounded in the closest stored answer.

    The query is embedded with ``model`` and matched against
    ``question_embeddings``. The ``clean_answer`` of the best-matching row of
    ``df`` is placed in a prompt, which is passed to ``qa_model``.

    Args:
        user_query: Free-text user question.

    Returns:
        The ``generated_text`` of the first result returned by ``qa_model``.
    """
    # Find most relevant question (using embeddings)
    query_embedding = model.encode([user_query])
    similarities = cosine_similarity(query_embedding, question_embeddings)[0]
    best_idx = similarities.argmax()

    # Get the corresponding official answer as context
    context = df.iloc[best_idx]['clean_answer']

    # Rephrase it conversationally for the LLM
    prompt = f"""
    You are a helpful bank customer service assistant.
    Based on this context: '{context}'
    Answer this question: '{user_query}'
    """

    # `temperature` was dropped: sampling is not enabled (no do_sample=True), so
    # the value was not used for decoding and only triggered a warning.
    generated_answer = qa_model(
        prompt,
        max_length=200
    )[0]['generated_text']

    return generated_answer
# Example 1
print(generate_answer("How do I open a kids account?"))

# Example 2
print(generate_answer("What's the interest rate for savings accounts?"))
# Example 3
print(generate_answer("What documents are needed to open a kids account?"))
def improved_answer(user_query, similarity_threshold=0.65):
    """Answer ``user_query`` only when a sufficiently similar stored question exists.

    Args:
        user_query: Free-text user question.
        similarity_threshold: Minimum best cosine similarity required before
            delegating to ``generate_answer``.

    Returns:
        The generated answer, or a fixed fallback message when the best
        similarity is below ``similarity_threshold``.
    """
    query_vector = model.encode([user_query])
    scores = cosine_similarity(query_vector, question_embeddings)[0]
    top_score = scores[scores.argmax()]

    # Guard clause: bail out early on weak matches.
    if top_score < similarity_threshold:
        return "I couldn't find specific information about that. Could you rephrase or ask about another banking topic?"

    return generate_answer(user_query)

# Test with off-topic question
print(improved_answer("How to cook pasta?"))
# Test with an on-topic question
print(improved_answer("What documents are required for NAA?"))


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cpu


Model loaded!
A child requires the help of a parental legal guardian to open this account and avail its facilities




'Current profit rate is 20.50% which is paid Semi Annually




Form B, Birth Certificate or Student ID card as appropriate of the minor
I couldn't find specific information about that. Could you rephrase or ask about another banking topic?




'Resident Pakistani individuals who do not maintain have any other account single or joint in NUST Bank Limited are eligible to open the NAA in Pak rupees as a single joint account


In [12]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.30.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.

SyntaxError: 'return' outside function (<ipython-input-16-e9643f60ec0d>, line 3)

In [14]:
import gradio as gr
def gradio_wrapper(message, history = None):
    """Adapt ``improved_answer`` to a ``(message, history)`` chat callback.

    ``history`` is accepted but not used; only ``message`` is forwarded.
    """
    return improved_answer(message)

# Launch the chat UI
# Every submitted message is routed through gradio_wrapper.
demo = gr.ChatInterface(
    fn=gradio_wrapper,
    title="Bank Customer Support Bot",
    description="Ask me about accounts, loans, or policies!",
    examples=["What is the eligibility criteria for NAA?", "What documents are required to open a Little Champs Account?"],
    theme="soft",
)

demo.launch(share=True)  # Creates a public link

  self.chatbot = Chatbot(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1691ea2469ce12e76b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [17]:
import json
import pandas as pd
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
import nltk
from nltk.tokenize import word_tokenize
import os
import tempfile
import logging
import time

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Download NLTK resources
try:
    nltk.download('punkt', quiet=True)
except Exception as e:
    logger.warning(f"NLTK download error: {e}. Continuing without it.")

# Global variables to store model and data
model = None
df = None
question_embeddings = None
bank_data = {}

# PII detection patterns
# Maps a PII label to the regex that detects it. mask_pii() applies the patterns
# in insertion order and uses the upper-cased label in the replacement text.
PII_PATTERNS = {
    # The TLD class used to be `[A-Z|a-z]`; inside a character class `|` is a
    # literal, so strings such as "a@b.c|m" were accepted as e-mail addresses.
    'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
    'phone': r'\b(?:\+\d{1,3}[- ]?)?\(?\d{3}\)?[- ]?\d{3}[- ]?\d{4}\b',
    'credit_card': r'\b(?:\d{4}[- ]?){3}\d{4}\b',
    'cnic': r'\b\d{5}[- ]?\d{7}[- ]?\d{1}\b',  # Pakistani CNIC format
    'passport': r'\b[A-Z]{2}\d{7}\b',
    'account_number': r'\b\d{10,18}\b'
}

def mask_pii(text):
    """Replace every PII_PATTERNS match in ``text`` with a ``[MASKED <TYPE>]`` tag.

    Non-string input yields an empty string. Patterns are applied one after
    another, each on the output of the previous substitution.
    """
    if isinstance(text, str):
        result = text
        for label, regex in PII_PATTERNS.items():
            result = re.sub(regex, f"[MASKED {label.upper()}]", result)
        return result
    return ""

# Load data
def load_data(json_path='nust_bank_data (1).json'):
    """Read the bank knowledge base from a JSON file.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content, or an empty dict if the file cannot be read
        or parsed (the error is logged, not raised).
    """
    try:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale-dependent default encoding.
        with open(json_path, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)
            logger.info(f"Successfully loaded data from {json_path}")
            return raw_data
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        return {}

# Process JSON data for new document uploads
def process_json_upload(file_path):
    """Merge an uploaded JSON knowledge file into the running system.

    The file is parsed, merged over the current ``bank_data`` (uploaded
    products replace same-named existing ones), re-processed into a DataFrame
    and, when a model is loaded, re-embedded. The module-level ``bank_data``,
    ``df`` and ``question_embeddings`` are only replaced once every step has
    succeeded, so a bad upload leaves the previous state intact.

    Args:
        file_path: Path of the JSON file to ingest.

    Returns:
        A human-readable status string (success or error description).
    """
    global bank_data, df, model, question_embeddings

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            new_data = json.load(f)

        # The rest of the pipeline iterates product -> details, so reject
        # lists/scalars up front with a clear message.
        if not isinstance(new_data, dict):
            raise ValueError("top-level JSON value must be an object keyed by product name")

        # Build the merged view without touching the live globals yet.
        merged_data = {**bank_data, **new_data}

        # Re-process data
        new_df = process_data(merged_data)

        # Recreate embeddings
        new_embeddings = question_embeddings
        if model is not None:
            new_embeddings = create_embeddings_from_model(new_df, model)
            if new_embeddings is None:
                raise RuntimeError("could not create embeddings for the uploaded data")

        # Commit all three pieces of state together.
        bank_data, df, question_embeddings = merged_data, new_df, new_embeddings

        return f"Successfully processed JSON with {len(new_data)} products"
    except Exception as e:
        logger.error(f"Error processing JSON upload: {e}")
        return f"Error processing JSON file: {str(e)}"

# Clean text
def clean_text(text):
    """Normalise ``text`` for embedding: tighten currency/percent, collapse spaces, lowercase.

    Non-string input yields an empty string.
    """
    if isinstance(text, str):
        steps = (
            # Join a currency marker to the amount that follows it.
            (r'(Rs\.|PKR|USD)\s+(\d+)', r'\1\2'),
            # Join a number to a following percent sign.
            (r'(\d+)\s+%', r'\1%'),
            # Collapse any whitespace run to a single space.
            (r'\s+', ' '),
        )
        for pattern, replacement in steps:
            text = re.sub(pattern, replacement, text)
        return text.strip().lower()
    return ""

# Process data
def process_data(data):
    """Flatten the product -> Q&A mapping into a cleaned, PII-masked DataFrame.

    Args:
        data: Mapping of product name to a dict whose optional ``'details'``
            entry is a list of dicts with ``'question'`` and ``'answer'``.

    Returns:
        A DataFrame with columns ``product``, ``question``, ``answer``,
        ``clean_question`` and ``clean_answer``. With no Q&A pairs the frame
        is empty but still has all five columns.
    """
    qa_list = []
    for product, details in data.items():
        for qa in details.get('details', []):
            # Mask any PII in questions and answers
            masked_question = mask_pii(qa.get('question', ''))
            masked_answer = mask_pii(qa.get('answer', ''))

            qa_list.append({
                'product': product,
                'question': masked_question,
                'answer': masked_answer
            })

    # Naming the columns explicitly keeps them present when qa_list is empty;
    # otherwise df['question'] below raises KeyError.
    df = pd.DataFrame(qa_list, columns=['product', 'question', 'answer'])

    # Clean the text
    df['clean_question'] = df['question'].apply(clean_text)
    df['clean_answer'] = df['answer'].apply(clean_text)

    logger.info(f"Processed {len(df)} Q&A pairs")
    return df

# Load model and create embeddings
def load_model():
    """Instantiate the ``all-MiniLM-L6-v2`` sentence encoder.

    Returns:
        The loaded ``SentenceTransformer``, or ``None`` if loading raised
        (the error is logged).
    """
    try:
        started_at = time.time()
        encoder = SentenceTransformer('all-MiniLM-L6-v2')
        elapsed = time.time() - started_at
        logger.info(f"Model loaded in {elapsed:.2f} seconds")
    except Exception as e:
        logger.error(f"Error loading model: {e}")
        return None
    return encoder

def create_embeddings_from_model(df, model):
    """Encode the ``clean_question`` column of ``df`` with ``model``.

    Returns:
        Whatever ``model.encode`` returns for the list of questions, or
        ``None`` if anything raised (the error is logged).
    """
    try:
        started_at = time.time()
        vectors = model.encode(df['clean_question'].tolist(), show_progress_bar=True)
        elapsed = time.time() - started_at
        logger.info(f"Embeddings created in {elapsed:.2f} seconds")
    except Exception as e:
        logger.error(f"Error creating embeddings: {e}")
        return None
    return vectors

# Function to detect off-topic or harmful queries
def is_banking_related(query, embeddings, threshold=0.4):
    """Decide whether ``query`` looks like a banking question.

    A query qualifies if it contains one of the banking keywords as a whole
    word or, failing that, if its best cosine similarity against
    ``embeddings`` (computed with the module-level ``model``) reaches
    ``threshold``.

    Args:
        query: User text.
        embeddings: Matrix of known-question embeddings, or ``None``.
        threshold: Minimum similarity for the embedding fallback.

    Returns:
        Truthy when the query is considered banking-related, falsy otherwise.
    """
    # Banking-related keywords
    banking_keywords = {
        "account", "bank", "loan", "credit", "debit", "transaction", "balance",
        "deposit", "withdraw", "interest", "transfer", "card", "atm", "savings",
        "current", "statement", "fee", "charge", "branch", "online", "banking",
        "cheque", "check", "payment", "fund", "finance", "money", "cash", "nust"
    }

    # Split on non-word characters instead of whitespace so punctuation glued to
    # a word ("balance?", "ATM,") no longer hides the keyword.
    query_words = re.findall(r"\w+", query.lower())
    if any(word in banking_keywords for word in query_words):
        return True

    # If no direct keywords, check embedding similarity with known banking questions.
    # An empty matrix has nothing to compare against and would make
    # cosine_similarity raise, so treat it as "not related".
    if model is not None and embeddings is not None and len(embeddings) > 0:
        query_embedding = model.encode([query])
        similarities = cosine_similarity(query_embedding, embeddings)[0]
        max_similarity = similarities.max()

        return max_similarity >= threshold

    return False

# Function to detect potential jailbreak or prompt injection attempts
def detect_harmful_query(query):
    """Flag queries that look like jailbreak or prompt-injection attempts.

    Matching is a case-insensitive substring test against a fixed phrase
    list, plus a check for "system" combined with "instruction", "prompt" or
    "message". Hits are logged as warnings.

    Args:
        query: User text.

    Returns:
        ``True`` if the query matches a blocked pattern, else ``False``.
    """
    # List of potentially harmful patterns/keywords
    harmful_patterns = [
        "bypass", "override", "ignore", "instructions", "previous", "constraint",
        "forget", "pretend", "disregard", "system prompt", "real AI", "don't follow",
        "don't obey", "classified", "confidential", "secret", "sensitive", "private",
        "prompt injection", "jailbreak", "hack", "exploit", "authentication", "password",
        "admin", "unauthorized", "security", "breach", "backdoor"
    ]

    # The query is lowercased, so the patterns must be too; otherwise a
    # mixed-case entry such as "real AI" can never match.
    query_lower = query.lower()
    if any(pattern.lower() in query_lower for pattern in harmful_patterns):
        logger.warning(f"Harmful pattern detected in query: {query}")
        return True

    # Check for attempts to extract system instructions
    if "system" in query_lower and any(word in query_lower for word in ["instruction", "prompt", "message"]):
        logger.warning(f"System instruction extraction attempt detected: {query}")
        return True

    return False

# Function to get an answer based on the user query
def get_answer(user_query, top_k=3, similarity_threshold=0.55):
    """Answer a user question from the stored Q&A pairs.

    Pipeline: reject empty input, wait for initialisation, reject harmful and
    off-topic queries, then retrieve the most similar stored questions and
    compose a reply from their answers.

    Args:
        user_query: Raw user text.
        top_k: How many nearest questions to consider (at least one is used).
        similarity_threshold: Minimum best similarity required to answer.
            Secondary matches need at least 80% of this value.

    Returns:
        The reply text, or one of the fixed fallback messages.
    """
    global df, model, question_embeddings

    start_time = time.time()

    # Handle empty query
    if not user_query or not user_query.strip():
        return "Please ask a question about our banking services."

    # Handle missing data or model
    if df is None or model is None or question_embeddings is None:
        return "The system is still initializing. Please try again shortly."

    # Check for harmful queries
    if detect_harmful_query(user_query):
        return "I cannot respond to that type of request. I'm designed to help with banking-related questions only."

    no_info_message = "I don't have specific information about that. Could you please ask about our banking products or services?"

    # With an empty knowledge base there is nothing to match, and
    # cosine_similarity would raise on the empty matrix.
    if len(df) == 0 or len(question_embeddings) == 0:
        return no_info_message

    # Clean the user query
    clean_query = clean_text(user_query)

    # Check if the query is banking-related
    if not is_banking_related(clean_query, question_embeddings):
        return "I'm a banking assistant and can only help with banking-related questions. Could you please ask something about our banking products or services?"

    # Embed the query
    query_embedding = model.encode([clean_query])

    # Compute similarities
    similarities = cosine_similarity(query_embedding, question_embeddings)[0]

    # Check if the best match is good enough
    best_similarity = similarities.max()
    if best_similarity < similarity_threshold:
        return "I don't have specific information about that. Could you please rephrase or ask about one of our banking products like accounts, loans, or services?"

    # Best-first ordering, then truncate (avoids `[-0:]` selecting everything).
    top_indices = similarities.argsort()[::-1][:max(top_k, 1)]

    # Keep only reasonably similar matches. argsort gives positions, so rows are
    # read positionally with .iloc.
    cutoff = similarity_threshold * 0.8
    relevant_info = [
        (df['product'].iloc[idx], df['answer'].iloc[idx])
        for idx in top_indices
        if similarities[idx] >= cutoff
    ]

    if relevant_info:
        # Use the best match as primary response
        primary_product, primary_answer = relevant_info[0]
        response = f"Based on information about {primary_product}: {primary_answer}"

        # Append at most one supplementary answer: the most similar one that adds
        # new text. Walking the ranked list keeps the choice deterministic; the
        # earlier set-based selection depended on hash ordering.
        for product, answer in relevant_info[1:]:
            if answer not in primary_answer:
                response += "\n\n" + f"Additionally, regarding {product}: {answer}"
                break
    else:
        response = no_info_message

    logger.info(f"Query answered in {time.time() - start_time:.2f} seconds")
    return response

# Function to handle file uploads
def handle_file_upload(files):
    """Ingest uploaded knowledge files and report one status line per file.

    Each upload may be a filesystem path (``str`` / ``os.PathLike``) or an
    object exposing the path as ``.name``. JSON files are passed to
    ``process_json_upload``; anything else is reported as unsupported.

    Args:
        files: A single upload or a list/tuple of uploads; may be empty or ``None``.

    Returns:
        Newline-joined status messages, or ``"No file uploaded"``.
    """
    if not files:
        return "No file uploaded"

    # Accept a single upload as well as a list of uploads.
    if not isinstance(files, (list, tuple)):
        files = [files]

    results = []
    for file in files:
        # Resolve the on-disk path without assuming the upload is a file object.
        if isinstance(file, (str, os.PathLike)):
            file_path = os.fspath(file)
        else:
            file_path = getattr(file, 'name', None)

        if not file_path:
            results.append("Unsupported upload object: no file path available")
            continue

        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext == '.json':
            # The upload is already on disk, so it is processed in place. The
            # previous copy-to-temp step reopened the same path for writing
            # whenever the name was absolute, truncating the upload.
            results.append(process_json_upload(file_path))
        else:
            results.append(f"Unsupported file type: {file_ext}")

    return "\n".join(results)

# Initialize the system
def initialize_system():
    """Populate the module-level model, data, DataFrame and embeddings.

    Returns:
        ``True`` when the model loaded and, if data was available, embeddings
        were created; ``False`` if the model or the embeddings could not be
        produced. Missing data is not a failure: the system starts with an
        empty DataFrame and an empty embeddings array.
    """
    global bank_data, df, model, question_embeddings

    logger.info("Initializing system...")

    # The encoder is required for everything else, so fail fast without it.
    model = load_model()
    if model is None:
        logger.error("Failed to load model. System initialization failed.")
        return False

    bank_data = load_data()
    if bank_data:
        # Data present: build the frame and embed its questions.
        df = process_data(bank_data)
        question_embeddings = create_embeddings_from_model(df, model)
        if question_embeddings is None:
            logger.error("Failed to create embeddings.")
            return False
    else:
        # No data: start empty and wait for uploads.
        logger.warning("No initial bank data loaded.")
        df = pd.DataFrame(columns=['product', 'question', 'answer', 'clean_question', 'clean_answer'])
        question_embeddings = np.array([])

    logger.info("System initialization completed successfully.")
    return True

# Create the Gradio interface with document upload capability
def create_interface():
    """Build the Gradio Blocks UI: chat panel, JSON upload panel, examples and an About section.

    Returns:
        The assembled ``gr.Blocks`` object (not yet launched).
    """
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Bank Customer Support Bot")
        gr.Markdown("Ask questions about banking products and services. You can also upload banking data in JSON format.")

        with gr.Row():
            # Left column: conversation view, input box and reset button.
            with gr.Column(scale=3):
                chatbot = gr.Chatbot(height=500)
                msg = gr.Textbox(label="Ask a question", placeholder="What is the Little Champs Account?")
                clear = gr.Button("Clear")

            # Right column: knowledge-base upload and canned example questions.
            with gr.Column(scale=1):
                file_upload = gr.File(
                    label="Upload Banking Data (JSON format)",
                    file_types=[".json"],
                    file_count="multiple"
                )
                upload_button = gr.Button("Process Uploaded Files")
                upload_status = gr.Textbox(label="Upload Status", interactive=False)

                with gr.Accordion("Examples", open=False):
                    # Selecting an example fills the question textbox.
                    examples = gr.Examples(
                        examples=[
                            "What is the Little Champs Account?",
                            "What documents are required to open a Little Champs Account?",
                            "How can a minor operate an account?",
                            "Is there any insurance benefit with Little Champs Account?"
                        ],
                        inputs=msg
                    )

        # Define chat function
        def respond(message, chat_history):
            """Answer ``message``, append the (user, bot) pair to the history and clear the input."""
            bot_message = get_answer(message)
            chat_history.append((message, bot_message))
            # The empty string resets the textbox; the history refreshes the chat view.
            return "", chat_history

        # Connect UI components with functions
        msg.submit(respond, [msg, chatbot], [msg, chatbot])
        # Returning None for the chatbot output clears the conversation.
        clear.click(lambda: None, None, chatbot, queue=False)
        upload_button.click(handle_file_upload, inputs=[file_upload], outputs=[upload_status])

        # Information tab
        with gr.Accordion("About", open=False):
            gr.Markdown("""
            ## Bank Customer Support Bot

            This system is designed to answer banking-related questions based on the provided data.

            ### Features:
            - Answers questions about banking products and services
            - Allows uploading new banking data in JSON format
            - Detects and prevents harmful queries
            - Masks sensitive information automatically

            ### Usage Tips:
            - Be specific with your questions
            - You can upload additional banking data in JSON format
            - Focus on banking-related queries for best results
            """)

    return demo

# Main function to run the application
def main():
    """Initialise the system and, on success, launch the Gradio UI with a public share link."""
    if initialize_system():
        # Create and launch the interface
        interface = create_interface()
        interface.launch(share=True)
    else:
        logger.error("System initialization failed. Exiting.")

if __name__ == "__main__":
    main()

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

  chatbot = gr.Chatbot(height=500)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://054e21efac1f09aba6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [22]:
import json
import pandas as pd
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
import nltk
from nltk.tokenize import word_tokenize
import os
import logging
import time

# ─── Logging ────────────────────────────────────────────────────────────────────
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# ─── Download NLTK ─────────────────────────────────────────────────────────────
try:
    nltk.download('punkt', quiet=True)
except Exception as e:
    logger.warning(f"NLTK download error: {e}. Continuing without it.")

# ─── Globals ───────────────────────────────────────────────────────────────────
model = None
df = None
question_embeddings = None
bank_data = {}

# PII detection patterns
# Maps a PII label to the regex that detects it. mask_pii() applies the patterns
# in insertion order and uses the upper-cased label in the replacement text.
PII_PATTERNS = {
    # The TLD class used to be `[A-Z|a-z]`; inside a character class `|` is a
    # literal, so a pipe was accepted as part of the top-level domain.
    'email':      r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
    'phone':      r'\b(?:\+\d{1,3}[- ]?)?\(?\d{3}\)?[- ]?\d{3}[- ]?\d{4}\b',
    'credit_card':r'\b(?:\d{4}[- ]?){3}\d{4}\b',
    'cnic':       r'\b\d{5}[- ]?\d{7}[- ]?\d{1}\b',
    'passport':   r'\b[A-Z]{2}\d{7}\b',
    'account_number': r'\b\d{10,18}\b'
}

def mask_pii(text):
    """Return ``text`` with every PII_PATTERNS match replaced by ``[MASKED <TYPE>]``.

    Anything that is not a ``str`` is mapped to the empty string.
    """
    if isinstance(text, str):
        masked = text
        for label in PII_PATTERNS:
            placeholder = f"[MASKED {label.upper()}]"
            masked = re.sub(PII_PATTERNS[label], placeholder, masked)
        return masked
    return ""

# ─── Data Loading & Processing ─────────────────────────────────────────────────
def load_data(json_path='nust_bank_data (1).json'):
    """Read the bank knowledge base from ``json_path``.

    Returns:
        The parsed JSON content, or an empty dict when reading or parsing
        fails (the error is logged instead of raised).
    """
    try:
        # Decode explicitly as UTF-8 rather than the locale-dependent default.
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        logger.info(f"Loaded data from {json_path}")
        return data
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        return {}

def clean_text(text):
    """Lowercased, whitespace-normalised copy of ``text`` with tight currency/percent spacing.

    Returns an empty string for non-string input.
    """
    if isinstance(text, str):
        # Currency marker + amount, e.g. "Rs. 500" -> "Rs.500".
        tightened = re.sub(r'(Rs\.|PKR|USD)\s+(\d+)', r'\1\2', text)
        # Number + percent sign, e.g. "20 %" -> "20%".
        tightened = re.sub(r'(\d+)\s+%', r'\1%', tightened)
        single_spaced = re.sub(r'\s+', ' ', tightened)
        return single_spaced.strip().lower()
    return ""

def process_data(data):
    """Flatten the product -> Q&A mapping into a cleaned, PII-masked DataFrame.

    Args:
        data: Mapping of product name to a dict whose optional ``'details'``
            entry is a list of dicts with ``'question'`` and ``'answer'``.

    Returns:
        A DataFrame with columns ``product``, ``question``, ``answer``,
        ``clean_question`` and ``clean_answer``; empty input yields an empty
        frame that still carries all five columns.
    """
    rows = []
    for product, details in data.items():
        for qa in details.get('details', []):
            q = mask_pii(qa.get('question', ''))
            a = mask_pii(qa.get('answer', ''))
            rows.append({'product': product, 'question': q, 'answer': a})
    # Explicit columns keep the schema when `rows` is empty; without them the
    # df['question'] lookup below raises KeyError.
    df = pd.DataFrame(rows, columns=['product', 'question', 'answer'])
    df['clean_question'] = df['question'].apply(clean_text)
    df['clean_answer']   = df['answer'].apply(clean_text)
    logger.info(f"Processed {len(df)} Q&A pairs")
    return df

# ─── Embedding Model ───────────────────────────────────────────────────────────
def load_model():
    """Load the ``all-MiniLM-L6-v2`` sentence encoder, or return ``None`` on failure.

    Load time and failures are reported through ``logger``.
    """
    try:
        t0 = time.time()
        encoder = SentenceTransformer('all-MiniLM-L6-v2')
        elapsed = time.time() - t0
        logger.info(f"Model loaded in {elapsed:.2f}s")
    except Exception as e:
        logger.error(f"Error loading model: {e}")
        return None
    return encoder

def create_embeddings_from_model(df, model):
    """Encode every entry of ``df['clean_question']`` with ``model``.

    Args:
        df: DataFrame holding a ``clean_question`` column.
        model: Object exposing ``encode(list_of_str, show_progress_bar=...)``.

    Returns:
        Whatever ``model.encode`` returns, or ``None`` if anything in the
        process raises (the error is logged).
    """
    try:
        started_at = time.time()
        questions = df['clean_question'].tolist()
        vectors = model.encode(questions, show_progress_bar=True)
        logger.info(f"Embeddings created in {time.time()-started_at:.2f}s")
    except Exception as exc:
        logger.error(f"Error creating embeddings: {exc}")
        return None
    return vectors

# ─── Query Filters ──────────────────────────────────────────────────────────────
def detect_harmful_query(q):
    """Flag queries that look like prompt-injection or abuse attempts.

    A query is flagged when its lower-cased text contains any blocked term
    as a substring, or when it contains ``"system"`` together with one of
    ``"instruction"``, ``"prompt"`` or ``"message"``. Flagged queries are
    logged as warnings.

    Args:
        q: Raw user query.

    Returns:
        ``True`` if the query is flagged, ``False`` otherwise.
    """
    blocked_terms = (
        "bypass", "override", "ignore", "forget", "pretend", "disregard",
        "system prompt", "jailbreak", "hack", "exploit", "password",
        "unauthorized",
    )
    lowered = q.lower()

    for term in blocked_terms:
        if term in lowered:
            logger.warning(f"Harmful query detected: {q}")
            return True

    if "system" in lowered:
        for hint in ("instruction", "prompt", "message"):
            if hint in lowered:
                logger.warning(f"System instruction attempt: {q}")
                return True

    return False

def is_banking_related(query, embeddings, threshold=0.4):
    """Decide whether ``query`` is about banking.

    A keyword shortcut is tried first; if no keyword is present the query is
    embedded with the module-level ``model`` and compared against the known
    question embeddings.

    Args:
        query: User query text.
        embeddings: Embeddings of the known questions, or ``None``.
        threshold: Minimum best cosine similarity for the query to count as
            banking-related when no keyword matched.

    Returns:
        ``True`` when a keyword matches or the best similarity reaches
        ``threshold``; ``False`` otherwise, including when no model or no
        embeddings are available.
    """
    keywords = {
        "account", "bank", "loan", "credit", "transaction", "balance", "deposit",
        "withdraw", "interest", "transfer", "card", "atm", "savings", "statement",
        "branch", "online", "banking", "cheque", "payment", "fund", "money", "nust"
    }
    # Tokenise on letter runs instead of whitespace so punctuation glued to
    # a word ("account?", "loan,") does not hide the keyword.
    tokens = set(re.findall(r"[a-z]+", query.lower()))
    if tokens & keywords:
        return True
    # An empty embedding matrix cannot be compared against; treat it like
    # "no data" rather than letting cosine_similarity raise.
    if model is not None and embeddings is not None and len(embeddings) > 0:
        sim = cosine_similarity(model.encode([query]), embeddings)[0].max()
        return bool(sim >= threshold)
    return False

# ─── Chat Logic ────────────────────────────────────────────────────────────────
def get_answer(user_query, top_k=3, similarity_threshold=0.55):
    """Answer a user query from the Q&A knowledge base.

    Reads the module-level ``df``, ``model`` and ``question_embeddings``.

    Pipeline: reject empty input -> check the system is ready -> refuse
    flagged queries -> refuse off-topic queries -> embed the cleaned query
    and rank the stored questions by cosine similarity -> answer with the
    best match, optionally followed by the next-best one.

    Args:
        user_query: Raw text typed by the user.
        top_k: Maximum number of candidate matches to consider.
        similarity_threshold: Minimum similarity the best match must reach.
            Secondary matches are kept down to 80% of this value.

    Returns:
        The response text (an answer or one of the fixed fallback messages).
    """
    global df, model, question_embeddings
    # `not user_query` also covers None, which has no .strip().
    if not user_query or not user_query.strip():
        return "Please ask a question about our banking services."
    # An empty embedding matrix (no data loaded yet) is treated as "not
    # ready": cosine_similarity cannot be computed against it.
    if (df is None or model is None or question_embeddings is None
            or len(question_embeddings) == 0):
        return "System is initializing, please try again shortly."
    if detect_harmful_query(user_query):
        return "I cannot respond to that type of request. I'm designed to help with banking-related questions only."
    cq = clean_text(user_query)
    if not is_banking_related(cq, question_embeddings):
        return "I'm a banking assistant—please ask about banking products or services."
    q_emb = model.encode([cq])
    sims = cosine_similarity(q_emb, question_embeddings)[0]
    if sims.max() < similarity_threshold:
        return "I don't have specific info on that. Could you rephrase or ask about accounts, loans, or services?"
    # Reverse first, then truncate: a `[-top_k:]` slice would select every
    # row when top_k is 0.
    top_idxs = sims.argsort()[::-1][:max(top_k, 0)]
    infos = [(df.iloc[i]['product'], df.iloc[i]['answer'], sims[i]) for i in top_idxs if sims[i] >= similarity_threshold * 0.8]
    if not infos:
        return "I don't have detailed info—could you ask about another product?"
    primary = infos[0]
    resp = f"Based on information about {primary[0]}: {primary[1]}"
    # Append exactly one follow-up: the next-best match. `infos` is ordered
    # by descending similarity, so this choice is deterministic.
    if len(infos) > 1:
        extra_product, extra_answer, _ = infos[1]
        resp += "\n\n" + f"Additionally, regarding {extra_product}: {extra_answer}"
    return resp

# ─── JSON Upload Handling ──────────────────────────────────────────────────────
def process_json_upload(file_path):
    """Merge an uploaded JSON file into the live knowledge base.

    The merged data, its DataFrame and its embeddings are all built first
    and only then assigned to the module-level ``bank_data``, ``df`` and
    ``question_embeddings``, so a failure at any step leaves the previously
    loaded state untouched.

    Args:
        file_path: Path of the uploaded JSON file.

    Returns:
        A human-readable status line: a success message with the number of
        products in the file, or an error message.
    """
    global bank_data, df, model, question_embeddings
    try:
        # Decode explicitly as UTF-8 rather than the platform locale.
        with open(file_path, 'r', encoding='utf-8') as f:
            new_data = json.load(f)
        if not isinstance(new_data, dict):
            raise ValueError("top-level JSON value must be an object keyed by product name")
        # Work on a merged copy; products in the upload override existing
        # entries with the same name, as dict.update would.
        merged = {**bank_data, **new_data}
        new_df = process_data(merged)
        new_embeddings = create_embeddings_from_model(new_df, model)
        # The embedding helper signals failure by returning None; do not
        # replace working embeddings with None while reporting success.
        if new_embeddings is None:
            raise RuntimeError("could not create embeddings for the uploaded data")
        bank_data, df, question_embeddings = merged, new_df, new_embeddings
        return f"Successfully processed JSON with {len(new_data)} products"
    except Exception as e:
        logger.error(f"Error processing JSON: {e}")
        return f"Error processing JSON file: {e}"

def handle_file_upload(files):
    """Process a batch of uploaded files and report one status line per file.

    Args:
        files: Iterable of file paths given as strings; may be empty or
            ``None``.

    Returns:
        ``"No file uploaded."`` when nothing was supplied, otherwise the
        per-file status messages joined with newlines. ``.json`` files
        (extension compared case-insensitively) go to
        ``process_json_upload``; anything else is reported as unsupported.
    """
    if not files:
        return "No file uploaded."

    def _status_for(path):
        # Only the extension decides how a file is handled.
        extension = os.path.splitext(path)[1].lower()
        if extension != '.json':
            return f"Unsupported file type: {extension}"
        return process_json_upload(path)

    statuses = [_status_for(path) for path in files]
    return "\n".join(statuses)

# ─── Initialization ───────────────────────────────────────────────────────────
def initialize_system():
    """Populate the module-level chatbot state.

    Sets, in this order: ``model`` (via ``load_model``), ``bank_data`` (via
    ``load_data``), ``df`` (processed Q&A frame, or an empty frame with the
    expected columns when no data was loaded) and ``question_embeddings``
    (encoded questions, or an empty array when ``df`` has no rows).

    Returns:
        Always ``True``.
    """
    global bank_data, df, model, question_embeddings
    logger.info("Initializing system…")

    model = load_model()
    bank_data = load_data()

    if bank_data:
        df = process_data(bank_data)
    else:
        df = pd.DataFrame(columns=['product','question','answer','clean_question','clean_answer'])

    if df.empty:
        question_embeddings = np.array([])
    else:
        question_embeddings = create_embeddings_from_model(df, model)

    logger.info("Initialization complete.")
    return True

# ─── Gradio Interface ─────────────────────────────────────────────────────────
# Create the Gradio interface with document upload capability
def create_interface():
    """Build the Gradio Blocks app for the support bot.

    The UI consists of a title and intro text, a row with two columns (chat
    widgets in the first, JSON upload widgets and example questions in the
    second) and a collapsed "About" accordion. Chat submissions are answered
    by ``get_answer``; uploads are handled by ``handle_file_upload``.

    Returns:
        The ``gr.Blocks`` object (not yet launched).
    """
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Bank Customer Support Bot")
        gr.Markdown("Ask questions about banking products and services. You can also upload banking data in JSON format.")

        with gr.Row():
            # First column (scale=3): conversation view, input box, clear button.
            with gr.Column(scale=3):
                chatbot = gr.Chatbot(height=500)
                msg = gr.Textbox(label="Ask a question", placeholder="What is the Little Champs Account?")
                clear = gr.Button("Clear")

            # Second column (scale=1): JSON upload controls and example questions.
            with gr.Column(scale=1):
                file_upload = gr.File(
                    label="Upload Banking Data (JSON format)",
                    file_types=[".json"],
                    file_count="multiple"
                )
                upload_button = gr.Button("Process Uploaded Files")
                upload_status = gr.Textbox(label="Upload Status", interactive=False)

                # Example questions; selecting one fills the `msg` textbox.
                with gr.Accordion("Examples", open=False):
                    examples = gr.Examples(
                        examples=[
                            "What is the Little Champs Account?",
                            "What documents are required to open a Little Champs Account?",
                            "How can a minor operate an account?",
                            "Is there any insurance benefit with Little Champs Account?"
                        ],
                        inputs=msg
                    )

        # Chat callback, wired to the textbox's submit event below.
        def respond(message, chat_history):
            """Answer ``message`` and record the exchange.

            Appends the ``(message, reply)`` tuple to ``chat_history`` and
            returns ``""`` (new textbox value) plus the updated history.
            """
            bot_message = get_answer(message)
            chat_history.append((message, bot_message))
            return "", chat_history

        # Connect UI components with functions
        # Submitting the textbox: inputs [msg, chatbot] -> outputs [msg, chatbot].
        msg.submit(respond, [msg, chatbot], [msg, chatbot])
        # Clear button: takes no inputs and writes None to the chatbot component.
        clear.click(lambda: None, None, chatbot, queue=False)
        # Upload button: passes the selected files to handle_file_upload and
        # shows its returned status text.
        upload_button.click(handle_file_upload, inputs=[file_upload], outputs=[upload_status])

        # Collapsible "About" section (an Accordion, closed by default)
        with gr.Accordion("About", open=False):
            gr.Markdown("""
            ## Bank Customer Support Bot

            This system is designed to answer banking-related questions based on the provided data.

            ### Features:
            - Answers questions about banking products and services
            - Allows uploading new banking data in JSON format
            - Detects and prevents harmful queries
            - Masks sensitive information automatically

            ### Usage Tips:
            - Be specific with your questions
            - You can upload additional banking data in JSON format
            - Focus on banking-related queries for best results
            """)

    return demo

def main():
    """Initialise the chatbot state, build the Gradio UI and launch it.

    The app is launched with ``share=True``.
    """
    initialize_system()
    create_interface().launch(share=True)


# Start the app only when this code runs as the entry point.
if __name__ == "__main__":
    main()


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

  chatbot = gr.Chatbot(height=500)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c53908156763a8f28b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
