In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.inspection import permutation_importance

# 1. Load the Dataset
DATASET_FILENAME = '2025_Sterling_Financial_Dataset_clean.csv'
# Locations are tried in order: the project's data/ folder, then the working directory.
DATASET_SEARCH_PATHS = (f'data/{DATASET_FILENAME}', DATASET_FILENAME)

for dataset_path in DATASET_SEARCH_PATHS:
    try:
        df = pd.read_csv(dataset_path)
    except FileNotFoundError:
        continue  # not at this location; try the next one
    break  # loaded successfully
else:
    # Stop here instead of printing and carrying on: continuing without `df`
    # either fails later with an unrelated NameError (fresh kernel) or silently
    # reuses a stale `df` left over from a previous run.
    raise FileNotFoundError(
        f"File not found. Please upload '{DATASET_FILENAME}' "
        f"(looked in: {', '.join(DATASET_SEARCH_PATHS)})"
    )

print("Dataset loaded successfully.")
print("Columns found:", df.columns.tolist())

# 2. Define X (Features) and y (Target)
target = 'default_history'

# Excluded from the feature matrix: the target itself, the ID, the date and the
# raw feedback text (not useful for KNN). The 'sentiment' column is not in this
# list, so it stays in X while the raw 'customer_feedback' text is removed.
# NOTE(review): 'risk_category' also stays in X - confirm it is not derived from
# the target, otherwise it leaks label information into the model.
cols_to_drop = [target, 'customer_id', 'date', 'customer_feedback']

# Keep only the names that are present in df so the drop below cannot raise KeyError.
existing_drop_cols = list(filter(lambda name: name in df.columns, cols_to_drop))

y = df[target]
X = df.drop(existing_drop_cols, axis=1)

# 3. Split the data
# 20% of the rows are held out for testing. stratify=y asks for the same class
# proportions of y in both parts, and the fixed random_state makes the
# partition repeatable across runs.
holdout_split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = holdout_split

train_shape, test_shape = X_train.shape, X_test.shape
print(f"Training shape: {train_shape}")
print(f"Testing shape: {test_shape}")

Dataset loaded successfully.
Columns found: ['date', 'customer_id', 'location', 'business_sector', 'age', 'income', 'credit_score', 'savings_ratio', 'loan_amount', 'debt_to_income', 'credit_utilization', 'payment_punctuality', 'customer_feedback', 'default_history', 'sentiment', 'risk_category', 'customer_segment', 'feedback_topic']
Training shape: (1200, 14)
Testing shape: (300, 14)


Preprocessing Pipeline

In [5]:
# 1. Identify which columns are numbers vs text automatically
# 'number' selects any numeric dtype rather than only int64/float64, and
# 'category' is selected alongside 'object' so pandas categoricals are picked
# up as well. A column matching neither list would be left out of both groups.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

print(f"Found {len(numeric_features)} numeric features: {list(numeric_features)}")
print(f"Found {len(categorical_features)} categorical features: {list(categorical_features)}")

# A ColumnTransformer left at its default remainder='drop' discards every column
# that no transformer claims, so make any such column visible instead of letting
# it vanish silently.
unassigned_features = [
    col for col in X.columns
    if col not in numeric_features and col not in categorical_features
]
if unassigned_features:
    print(
        f"Warning: {len(unassigned_features)} feature(s) are neither numeric nor "
        f"categorical and will not be transformed: {unassigned_features}"
    )

# 2. Define the Transformers
# StandardScaler: standardises each numeric column to zero mean and unit
#   variance (not a 0-1 range - that would be MinMaxScaler). Putting features
#   on a comparable scale is CRITICAL for KNN, which is distance-based.
# OneHotEncoder: converts text categories like "Urban/Rural" into 0/1 indicator
#   columns; handle_unknown='ignore' encodes a category not seen during fit as
#   all zeros instead of raising an error.
numeric_scaler = StandardScaler()
categorical_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_scaler, numeric_features),
    ('cat', categorical_encoder, categorical_features),
])

# 3. Create the Main Pipeline
# The imbalanced-learn Pipeline is used (rather than sklearn's) because it
# accepts a resampling step: SMOTE runs only while fitting, so during
# cross-validation synthetic samples are generated from the training folds
# and never from the data being scored.
# NOTE(review): SMOTE interpolates between neighbouring rows, so synthetic
# samples can carry fractional values in the one-hot columns - consider SMOTENC
# if that matters for this dataset; confirm.
pipeline_steps = [
    ('preprocessor', preprocessor),          # scale numerics, one-hot encode categoricals
    ('smote', SMOTE(random_state=42)),       # generates synthetic samples for defaults
    ('knn', KNeighborsClassifier()),         # default hyper-parameters
]
knn_pipeline = ImbPipeline(pipeline_steps)

print("Pipeline created successfully.")

Found 8 numeric features: ['age', 'income', 'credit_score', 'savings_ratio', 'loan_amount', 'debt_to_income', 'credit_utilization', 'payment_punctuality']
Found 6 categorical features: ['location', 'business_sector', 'sentiment', 'risk_category', 'customer_segment', 'feedback_topic']
Pipeline created successfully.
