In [None]:
# 📥 Step 1: Import data from PostgreSQL (Publisher Database)
from sqlalchemy import create_engine
import pandas as pd

# Connect to Publisher (banking_db)
engine = create_engine('postgresql+psycopg2://postgres:postgres@localhost:5432/banking_db')

query = """
SELECT
    la.applicationid,
    la.loantype,
    la.requestedamount,
    la.status AS application_status,
    l.loanamount,
    l.interestrate,
    l.term,
    l.status AS loan_status
FROM
    loanapplications la
LEFT JOIN
    loan l ON la.applicationid = l.loanapplications_applicationid;
"""

df = pd.read_sql_query(query, engine)
df.head()

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_csv("loan_data_export.csv")
df.head()

Unnamed: 0,applicationid,loantype,requestedamount,application_status,loanamount,interestrate,term,loan_status
0,1,Home Loan,100000.0,Pending,100000.0,3.5,240.0,Active
1,2,Car Loan,20000.0,Approved,20000.0,5.0,60.0,Closed
2,3,Personal Loan,5000.0,Rejected,5000.0,6.5,36.0,Rejected
3,4,Home Loan,150000.0,Pending,150000.0,4.0,240.0,Pending
4,5,Car Loan,25000.0,Approved,25000.0,4.5,72.0,Active


In [3]:
df['loan_to_request_ratio'] = df.apply(lambda row: row['loanamount'] / row['requestedamount'] if row['requestedamount'] else 0, axis=1)
df['is_high_interest'] = df['interestrate'].apply(lambda x: 1 if x > 6 else 0)

def term_category(term):
    if term <= 60:
        return 0
    elif term <= 180:
        return 1
    else:
        return 2

df['loan_term_category'] = df['term'].apply(term_category)
df['log_requestedamount'] = np.log1p(df['requestedamount'])
df['log_loanamount'] = np.log1p(df['loanamount'])

features = ['log_requestedamount', 'log_loanamount', 'interestrate', 'loan_to_request_ratio', 'is_high_interest', 'loan_term_category', 'loantype_encoded']
X = df[features]
y = df['high_risk']

KeyError: "['loantype_encoded'] not in index"

In [None]:
# Check shape, types, missing values
print(df.shape)
print(df.dtypes)
print(df.isnull().sum())


In [None]:
print(df.columns)


In [None]:
plt.figure(figsize=(10, 6))
sns.heatmap(df[features + ['high_risk']].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap with Target')
plt.show()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

In [None]:
models = {
    'RandomForest': RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42),
    'XGBoost': XGBClassifier(learning_rate=0.05, max_depth=4, n_estimators=200, use_label_encoder=False, eval_metric='logloss'),
    'NaiveBayes': GaussianNB(),
    'DecisionTree': DecisionTreeClassifier(max_depth=10, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

results = {}

for name, model in models.items():
    model.fit(X_train_bal, y_train_bal)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else [0] * len(y_pred)

    results[name] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_proba),
        "report": classification_report(y_test, y_pred),
        "conf_matrix": confusion_matrix(y_test, y_pred)
    }


In [None]:
from sklearn.ensemble import VotingClassifier

# Instantiate individual models
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
xgb = XGBClassifier(learning_rate=0.05, max_depth=4, n_estimators=200, use_label_encoder=False, eval_metric='logloss')
nb = GaussianNB()
dt = DecisionTreeClassifier(max_depth=10, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)

# Combine into a VotingClassifier (soft voting preferred)
ensemble = VotingClassifier(
    estimators=[
        ('RandomForest', rf),
        ('XGBoost', xgb),
        ('NaiveBayes', nb),
        ('DecisionTree', dt),
        ('KNN', knn)
    ],
    voting='soft',  # Use 'hard' for majority voting
    n_jobs=-1
)

# Fit on balanced training data
ensemble.fit(X_train_bal, y_train_bal)

# Predict and evaluate
y_pred_ens = ensemble.predict(X_test)
y_proba_ens = ensemble.predict_proba(X_test)[:, 1]

# Evaluation
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix

ensemble_results = {
    "accuracy": accuracy_score(y_test, y_pred_ens),
    "f1_score": f1_score(y_test, y_pred_ens),
    "roc_auc": roc_auc_score(y_test, y_proba_ens),
    "report": classification_report(y_test, y_pred_ens),
    "conf_matrix": confusion_matrix(y_test, y_pred_ens)
}



In [None]:
import joblib

# Save trained model
joblib.dump(model, 'loan_default_model.pkl')
