In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns
import os
import warnings

In [None]:
# Notebook-wide presentation defaults: seaborn grid theme, default figure size,
# and suppression of every Python warning for the rest of the session.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})
warnings.filterwarnings('ignore')

In [None]:
# Remote CSV locations of the two dataset splits; both live in the same folder.
_BANKING_DATA_ROOT = "https://raw.githubusercontent.com/PolyAI-LDN/task-specific-datasets/master/banking_data"
TRAIN_URL = f"{_BANKING_DATA_ROOT}/train.csv"
TEST_URL = f"{_BANKING_DATA_ROOT}/test.csv"

In [None]:
# Progress banner. The leading symbol was stored as mojibake (UTF-8 bytes of the
# down-arrow decoded as cp1252); it is restored to the intended character.
print("⬇  Downloading data directly from Source...")

In [None]:
# Read both splits straight from their remote CSV files.
train_df, test_df = (pd.read_csv(csv_url) for csv_url in (TRAIN_URL, TEST_URL))

In [None]:
# Give the label column the name used throughout the rest of the notebook.
label_rename = {'category': 'intent_name'}
for frame in (train_df, test_df):
    frame.rename(columns=label_rename, inplace=True)

In [None]:
# Persist a local copy of each split (index column omitted), creating the
# target folder first if it does not exist yet.
os.makedirs('../data', exist_ok=True)
for frame, csv_path in (
    (train_df, '../data/banking77_train.csv'),
    (test_df, '../data/banking77_test.csv'),
):
    frame.to_csv(csv_path, index=False)

In [None]:
# Quick sanity report: dimensions of each split and number of distinct labels.
print(" Success! Data loaded and saved.")
for split_label, frame in (("Train Shape:", train_df), ("Test Shape: ", test_df)):
    print(f"   {split_label} {frame.shape} (Rows, Cols)")
print(f"   Total Distinct Intents: {train_df['intent_name'].nunique()}")

In [None]:
# Show five reproducibly sampled (intent, query) pairs from the training split.
print("--- Random Sample of User Queries ---")
sample = train_df[['intent_name', 'text']].sample(5, random_state=42)

for intent, query in zip(sample['intent_name'], sample['text']):
    print(f" Intent: {intent}")
    print(f'   Query:  "{query}"\n')

In [None]:
# Number of training queries per intent, most frequent first.
intent_counts = train_df.loc[:, 'intent_name'].value_counts()

In [None]:
# Horizontal bar chart of the 20 intents with the most training examples.
top_intents = intent_counts.head(20)

plt.figure(figsize=(12, 8))
sns.barplot(y=top_intents.index, x=top_intents.values, palette='viridis')
plt.title("Top 20 Most Frequent Support Issues (High Volume)")
plt.xlabel("Number of Tickets")
plt.show()

In [None]:
# Class-imbalance summary: largest vs. smallest intent and their size ratio.
max_count, min_count = intent_counts.max(), intent_counts.min()
most_frequent, least_frequent = intent_counts.idxmax(), intent_counts.idxmin()
imbalance = max_count / min_count

print(f"Most Frequent: {most_frequent} ({max_count})")
print(f"Least Frequent: {least_frequent} ({min_count})")
print(f"Imbalance Ratio: 1:{imbalance:.1f}")

In [None]:
# Whitespace-delimited token count of each raw training query.
query_tokens = train_df['text'].str.split()
train_df['word_count'] = query_tokens.str.len()

In [None]:
# Histogram (with KDE) of query lengths, with the mean marked by a dashed line.
query_lengths = train_df['word_count']
mean_length = query_lengths.mean()

plt.figure(figsize=(10, 5))
sns.histplot(query_lengths, bins=30, kde=True, color='purple')
plt.axvline(mean_length, color='red', linestyle='--', label=f"Mean: {mean_length:.1f}")
plt.title("Distribution of Query Length (How much context do we have?)")
plt.xlabel("Number of Words")
plt.legend()
plt.show()

In [None]:
# Headline figure for the length analysis.
avg_query_words = train_df['word_count'].mean()
print(f"Insight: The average query is only {avg_query_words:.1f} words long.")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def plot_top_ngrams(texts, n=2, title="Top Bigrams", top_k=10):
    """Plot the most frequent word n-grams of a text collection as a bar chart.

    English stop words are removed by the vectorizer before n-grams are formed.

    Args:
        texts: Iterable of text strings (for example a pandas Series).
        n: Size of the n-grams to count; 2 means bigrams.
        title: Title placed above the chart.
        top_k: How many of the most frequent n-grams to show. Defaults to 10,
            the value that used to be hard-coded.

    Returns:
        None. The figure is rendered via ``plt.show()``. If no n-gram can be
        extracted (no documents, or the vectorizer rejects the input), a
        message is printed instead and nothing is drawn.
    """
    docs = list(texts)
    if not docs:
        # e.g. the caller filtered on an intent name that does not exist
        print(f"No text available for '{title}' - nothing to plot.")
        return

    vec = CountVectorizer(ngram_range=(n, n), stop_words='english')
    try:
        # Single pass instead of a separate fit() followed by transform().
        bag_of_words = vec.fit_transform(docs)
    except ValueError as err:
        # Raised by scikit-learn when the vocabulary ends up empty; report
        # the reason rather than aborting the whole notebook run.
        print(f"Could not extract {n}-grams for '{title}': {err}")
        return

    # Column sums = corpus-wide frequency of each n-gram.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(ngram, int(totals[col])) for ngram, col in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda pair: pair[1], reverse=True)[:top_k]

    df_ngram = pd.DataFrame(words_freq, columns=['Ngram', 'Count'])
    plt.figure(figsize=(8, 4))
    sns.barplot(x='Count', y='Ngram', data=df_ngram, palette='Blues_d')
    plt.title(title)
    plt.show()

In [None]:
# Pair of intents whose vocabulary is compared in the bigram plots.
intent_a, intent_b = 'card_arrival', 'card_delivery_estimate'

In [None]:
# Draw one bigram chart per intent so their wording can be compared.
print(f"Comparing: '{intent_a}' vs '{intent_b}'")
for intent in (intent_a, intent_b):
    intent_texts = train_df[train_df['intent_name'] == intent]['text']
    plot_top_ngrams(intent_texts, n=2, title=f"Bigrams: {intent}")

In [None]:
# Substrings that mark an intent name as security/fraud related.
risk_keywords = "lost stolen fraud hack compromised".split()

In [None]:
# Keep every distinct intent whose name contains at least one risk keyword.
risk_intents = []
for intent in train_df['intent_name'].unique():
    if any(keyword in intent for keyword in risk_keywords):
        risk_intents.append(intent)

In [None]:
# Report how many intents were flagged, followed by their names.
n_risk_intents = len(risk_intents)
print(f" High-Risk Intents Identified ({n_risk_intents}):")
print(risk_intents)

In [None]:
# Training rows that belong to a flagged intent, and their share of the data.
is_risk_row = train_df['intent_name'].isin(risk_intents)
risk_df = train_df[is_risk_row]
risk_share = len(risk_df) / len(train_df) * 100
print(f"\nTotal Risk Samples: {len(risk_df)} ({risk_share:.1f}% of data)")

In [None]:
# Row counts of each flagged intent, bars ordered from most to least frequent.
risk_order = risk_df['intent_name'].value_counts().index

plt.figure(figsize=(10, 4))
sns.countplot(y='intent_name', data=risk_df, order=risk_order, palette='Reds_r')
plt.title("Volume of Security/Fraud Queries")
plt.show()

In [None]:
import re
import string

In [None]:
# Symbols kept by the cleaner; every other ASCII punctuation mark is removed.
_PRESERVED_SYMBOLS = "$%#"
_PUNCT_TABLE = str.maketrans(
    '', '', ''.join(ch for ch in string.punctuation if ch not in _PRESERVED_SYMBOLS)
)
# Phrase rewrites applied to the lower-cased text. The former 'atm', 'pin' and
# 'stolen' entries mapped each word onto itself and were therefore no-ops.
_PHRASE_REWRITES = (
    (re.compile(r"\bcard lost\b"), "lost card"),
)
_DIGIT_RUN = re.compile(r'\d+')


def senior_cleaning_pipeline(text):
    """Normalise one raw user query for downstream feature extraction.

    Steps, in order:
      1. lower-case the text;
      2. rewrite "card lost" to "lost card";
      3. remove punctuation except ``$``, ``%`` and ``#``;
      4. replace each run of digits with the placeholder ``<NUM>``;
      5. collapse all whitespace runs to single spaces and trim the ends.

    Digits are masked *after* punctuation removal so the angle brackets of the
    placeholder survive (previously they were stripped, leaving a bare
    ``NUM``). As a consequence digit groups separated only by removed
    punctuation, such as "1,000" or "1.5", become a single ``<NUM>``.

    Args:
        text: The raw query. Non-string values (``None``, NaN, ...) are treated
            as missing.

    Returns:
        The cleaned string; ``""`` for missing or empty input.
    """
    if not isinstance(text, str):
        # Missing cells reach this function as None/NaN via Series.apply.
        return ""

    text = text.lower()

    for pattern, replacement in _PHRASE_REWRITES:
        text = pattern.sub(replacement, text)

    text = text.translate(_PUNCT_TABLE)
    text = _DIGIT_RUN.sub('<NUM>', text)

    return " ".join(text.split())

In [None]:
# Before/after demonstration of the cleaning function on one hand-written query.
sample_query = "My card #1234 was stolen! I lost $500. It's NOT working at the atm."
print(f"Raw Query:      {sample_query}")
cleaned_sample = senior_cleaning_pipeline(sample_query)
print(f"Senior Cleaned: {cleaned_sample}")

In [None]:
# Add a cleaned copy of the query text to both splits.
print(" Applying Senior Preprocessing Pipeline...")
for frame in (train_df, test_df):
    frame['text_clean'] = frame['text'].map(senior_cleaning_pipeline)

In [None]:
# Length feature for both splits: whitespace-delimited token count of the raw text.
for frame in (train_df, test_df):
    frame['word_count'] = frame['text'].str.split().str.len()

In [None]:
# Binary feature: 1 when the cleaned query mentions any security-sensitive term.
risk_pattern = r'lost|stolen|compromised|fraud|unauthorized'
for frame in (train_df, test_df):
    # na=False makes a missing text count as "no match"; without it the mask
    # holds NaN for such rows and the integer cast below raises.
    risk_mask = frame['text_clean'].str.contains(risk_pattern, regex=True, na=False)
    frame['is_high_risk'] = risk_mask.astype(int)

In [None]:
# Show the raw text next to the engineered columns for the first three rows.
preview_columns = ['text', 'text_clean', 'word_count', 'is_high_risk']
print("\n--- Processed Data Preview ---")
display(train_df[preview_columns].head(3))

In [None]:
# Write the feature-enriched splits to disk (index column omitted) and report.
for frame, csv_path in (
    (train_df, '../data/train_processed.csv'),
    (test_df, '../data/test_processed.csv'),
):
    frame.to_csv(csv_path, index=False)

print("\n  Files saved to '../data/'.")
n_risk_queries = train_df['is_high_risk'].sum()
print(f"Risk queries in training set: {n_risk_queries}")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF over unigrams and bigrams, vocabulary capped at 5000 terms,
# with no stop-word list applied.
tfidf_params = {
    'ngram_range': (1, 2),
    'max_features': 5000,
    'stop_words': None,
}
tfidf = TfidfVectorizer(**tfidf_params)

In [None]:
# Learn the vocabulary on the training split only, then project both splits.
X_train_tfidf = tfidf.fit_transform(train_df['text_clean'])
X_test_tfidf = tfidf.transform(test_df['text_clean'])

print(" TF-IDF Matrix Created:")
print(f"   - Training shape: {X_train_tfidf.shape}")
n_tfidf_features = len(tfidf.get_feature_names_out())
print(f"   - Number of unique features (unigrams/bigrams): {n_tfidf_features}")

In [None]:
# EDA and preprocessing complete.