<a href="https://colab.research.google.com/github/Tejaswi37/2303A51944-Batch-27-/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd
import numpy as np


In [15]:
from pathlib import Path

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [20]:
# Load the dataset from the working directory. The canonical filename is tried
# first; the " (1)" variant is used only when the canonical file is absent.
# If neither exists, the canonical path is kept so pd.read_csv reports the
# missing file.
csv_candidates = (
    Path("synthetic_personal_finance_dataset.csv"),
    Path("synthetic_personal_finance_dataset (1).csv"),
)
csv_path = next(
    (candidate for candidate in csv_candidates if candidate.exists()),
    csv_candidates[0],
)
df = pd.read_csv(csv_path)


In [21]:
# Features: the numeric columns that must be non-null in every kept row and
# that together form the model input matrix X. Order is significant because it
# fixes the column order of X.
features = [
    # absolute monthly amounts (USD)
    "monthly_income_usd", "monthly_expenses_usd", "savings_usd",
    # ratios relative to income
    "debt_to_income_ratio", "savings_to_income_ratio",
]

In [22]:
# Keep only the rows in which every feature column is present, and take a copy
# so that later column assignments do not act on a view of the loaded frame.
has_all_features = df[features].notna().all(axis=1)
df = df.loc[has_all_features].copy()

In [23]:
# Quantiles: cut points at the 33rd and 67th percentiles of income and savings,
# plus the 67th percentile of the debt-to-income ratio (only an upper cut point
# is computed for debt).
tercile_bounds = [0.33, 0.67]
q_income_low, q_income_high = df["monthly_income_usd"].quantile(tercile_bounds)
q_sav_low, q_sav_high = df["savings_usd"].quantile(tercile_bounds)
q_dti_high = df["debt_to_income_ratio"].quantile(tercile_bounds[1])


In [41]:
def assign_financial_profile(income, savings, dti,
                             q_income_low, q_income_high,
                             q_sav_low, q_sav_high, q_dti_high):
    """Map one person's income, savings and debt-to-income ratio to a profile label.

    The rules are checked in order and the first match wins:

    * ``"HI_HS"`` - income at or above the high income cut, savings at or above
      the high savings cut, and debt ratio below the high debt cut.
    * ``"HI_HD"`` - income at or above the high income cut, debt ratio at or
      above the high debt cut, and savings below the high savings cut.
    * ``"MI_HD"`` - income in ``[q_income_low, q_income_high)`` and debt ratio at
      or above the high debt cut.
    * ``"LI_LS"`` - income at or below the low income cut and savings at or
      below the low savings cut.
    * ``"Other"`` - anything that matches none of the above.

    Args:
        income: Monthly income of the person.
        savings: Savings of the person.
        dti: Debt-to-income ratio of the person.
        q_income_low: Lower income cut point.
        q_income_high: Upper income cut point.
        q_sav_low: Lower savings cut point.
        q_sav_high: Upper savings cut point.
        q_dti_high: Upper debt-to-income cut point.

    Returns:
        One of ``"HI_HS"``, ``"HI_HD"``, ``"MI_HD"``, ``"LI_LS"`` or ``"Other"``.
    """
    if income >= q_income_high:
        # High earners: split by whether they are saving or carrying debt.
        if savings >= q_sav_high and dti < q_dti_high:
            return "HI_HS"
        if dti >= q_dti_high and savings < q_sav_high:
            return "HI_HD"
        # No early exit here: a high earner matching neither rule must still be
        # tested against the remaining rules (relevant when the income cut
        # points coincide).

    if q_income_low <= income < q_income_high and dti >= q_dti_high:
        return "MI_HD"

    # An income exactly equal to q_income_low reaches this rule only when the
    # mid-income rule above did not match.
    if income <= q_income_low and savings <= q_sav_low:
        return "LI_LS"

    return "Other"


In [42]:
def label_row(row):
    """Return the financial-profile label for one row of ``df``.

    Reads the row's income, savings and debt-to-income columns and classifies
    them with ``assign_financial_profile`` using the notebook-level quantile
    cut points.

    NOTE(review): the notebook called ``label_row`` without defining it, so the
    cell failed with ``NameError`` on a fresh kernel. This wrapper is a
    reconstruction based on the signature of ``assign_financial_profile`` -
    confirm it matches the original intent.
    """
    return assign_financial_profile(
        row["monthly_income_usd"],
        row["savings_usd"],
        row["debt_to_income_ratio"],
        q_income_low, q_income_high,
        q_sav_low, q_sav_high, q_dti_high,
    )


# Label every row; axis=1 passes each row to label_row as a Series.
df["profile_label"] = df.apply(label_row, axis=1)


In [43]:
# Restrict modelling to the four named profiles, unless that leaves fewer than
# 2000 rows, in which case every row (including the "Other" label) is kept.
focus_classes = ["HI_HS", "HI_HD", "MI_HD", "LI_LS"]
in_focus = df["profile_label"].isin(focus_classes)
df_focus = (df[in_focus] if in_focus.sum() >= 2000 else df).copy()

In [44]:
# Target is the profile label; inputs are the feature columns. Both are copies,
# so changing them later leaves df_focus untouched.
y = df_focus["profile_label"].copy()
X = df_focus.loc[:, features].copy()

In [45]:
# Split: hold out 20% of the rows for testing with a fixed seed. The split is
# stratified on the label only when there are at most 20 distinct labels and
# the rarest label has at least 2 rows; otherwise it is a plain random split.
label_counts = y.value_counts()
few_enough_classes = y.nunique() <= 20
rarest_class_ok = label_counts.min() >= 2
stratify = y if (few_enough_classes and rarest_class_ok) else None
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=stratify,
)

In [46]:
# Models: candidate classifiers keyed by the name printed in the evaluation
# report. Dictionary order is the order in which they are trained and reported.
def _scaled(classifier):
    """Wrap ``classifier`` in a pipeline that standardizes the features first.

    A fresh ``StandardScaler`` is created per call so no scaler is shared
    between models. Step names ("scaler", "clf") match the original pipelines.
    """
    return Pipeline([("scaler", StandardScaler()), ("clf", classifier)])


models = {
    # multi_class is left at its default: passing it explicitly is deprecated in
    # recent scikit-learn releases (FutureWarning, scheduled for removal), and
    # the default already selects the same behaviour as "auto".
    "LogisticRegression": _scaled(LogisticRegression(max_iter=200)),
    "KNN": _scaled(KNeighborsClassifier(n_neighbors=11)),
    "SVC_RBF": _scaled(SVC(kernel="rbf")),
    # Tree-based models are fitted on the unscaled features.
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "GaussianNB": _scaled(GaussianNB()),
    "LDA": _scaled(LinearDiscriminantAnalysis()),
}

In [47]:
# Train & Evaluate: fit each candidate on the training split, predict the
# held-out split, and print a per-class precision/recall/F1 report under a
# banner carrying the model's name.
banner, divider = "=" * 80, "-" * 80
for name, model in models.items():
    # fit() returns the fitted estimator, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    report = classification_report(y_test, y_pred, digits=3)
    print(banner, name, divider, report, sep="\n")




LogisticRegression
--------------------------------------------------------------------------------
              precision    recall  f1-score   support

       HI_HD      0.973     0.948     0.960       191
       HI_HS      0.997     1.000     0.999       672
       LI_LS      0.997     0.998     0.997       934
       MI_HD      0.980     0.984     0.982       547

    accuracy                          0.991      2344
   macro avg      0.987     0.982     0.984      2344
weighted avg      0.991     0.991     0.991      2344

KNN
--------------------------------------------------------------------------------
              precision    recall  f1-score   support

       HI_HD      0.989     0.932     0.960       191
       HI_HS      0.999     1.000     0.999       672
       LI_LS      0.993     0.994     0.993       934
       MI_HD      0.966     0.982     0.974       547

    accuracy                          0.988      2344
   macro avg      0.986     0.977     0.981      2344
