In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Read the loan records from disk
# Point this at your own CSV if it is not named 'loan_data.csv'
data = pd.read_csv('loan_data.csv')

# Step 2: Separate the label from the predictors
# The label and the row identifier are both excluded from the predictors.
y = data['loan_status']  # Approved/Rejected
X = data.drop(columns=['loan_status', 'loan_id'])

# Step 3: Hold out 20% of the rows for testing, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 4: Preprocessing
# The listed categorical columns are one-hot encoded; every other column of X
# is standardized.
# NOTE(review): this assumes every non-categorical column is numeric --
# confirm against the CSV, otherwise StandardScaler will fail on it.
cat_features = ['education', 'self_employed']
num_features = [col for col in X.columns if col not in cat_features]

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    # handle_unknown='ignore': a category that was absent from the rows used
    # for fitting (e.g. missing from one CV fold) is encoded as all zeros
    # instead of raising an error at transform time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
])

# Step 5: Pipeline
# Preprocessing lives inside the pipeline so it is re-fit on the training
# portion only whenever the pipeline is fit (including inside cross-validation).
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Step 6: Fit the untuned pipeline and score it on the held-out rows
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)

print("Initial Model Performance:")
# Each metric is computed and printed in turn, in the same order as the report headings.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))

# Step 7: Grid-search the random-forest hyperparameters
# Keys use the '<step>__<param>' form so they are routed to the pipeline's
# 'classifier' step.
param_grid = dict(
    classifier__n_estimators=[50, 100],
    classifier__max_depth=[10, 20, None],
    classifier__min_samples_split=[2, 5],
)
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

# Step 8: Score the tuned model on the held-out rows
# grid_search.predict delegates to the best estimator found by the search.
y_pred_tuned = grid_search.predict(X_test)

print("Tuned Model Performance:")
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred_tuned))

# ROC-AUC
# predict_proba columns follow the fitted classifier's classes_ order (sorted
# labels), so column 1 is NOT guaranteed to be 'Approved' -- with the labels
# 'Approved'/'Rejected' it is the 'Rejected' probability, which would invert
# the AUC. Look the positive-class column up by label instead.
positive_label = 'Approved'
positive_col = list(grid_search.classes_).index(positive_label)
y_prob = grid_search.predict_proba(X_test)[:, positive_col]

# Binary ground truth: 1 where the loan was approved, 0 otherwise.
y_true_binary = (y_test == positive_label).astype(int)
roc_auc = roc_auc_score(y_true_binary, y_prob)
print("ROC-AUC:", roc_auc)

fpr, tpr, _ = roc_curve(y_true_binary, y_prob)
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # diagonal = chance-level reference
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

# Step 9: Feature importance
# Pull the fitted classifier and the fitted one-hot encoder out of the refit
# best pipeline.
best_pipeline = grid_search.best_estimator_
best_model = best_pipeline.named_steps['classifier']
fitted_encoder = best_pipeline.named_steps['preprocessor'].named_transformers_['cat']

# Feature names are assembled in the preprocessor's transformer order:
# numeric columns first, then the expanded one-hot columns.
encoded_cat_features = fitted_encoder.get_feature_names_out(cat_features)
all_features = [*num_features, *encoded_cat_features]

importances = best_model.feature_importances_
feat_imp = pd.Series(importances, index=all_features)
feat_imp = feat_imp.sort_values(ascending=False)

print("\nFeature Importances:")
print(feat_imp)

# Horizontal bar chart, most important feature at the top.
plt.figure(figsize=(10, 6))
sns.barplot(x=feat_imp.values, y=feat_imp.index)
plt.title('Feature Importance in Loan Approval Prediction')
plt.show()
