# Credit Risk Assessment - Model Development
This notebook will implement a machine learning pipeline to predict loan status (credit risk). We'll use the insights from the EDA and experiment with various machine learning models.

# Import required libraries

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Data Loading and Preprocessing

In [14]:
# Load the data
df = pd.read_csv('credit_risk_data.csv')

# Separate features and target variable
X = df.drop('loan_status', axis=1)
y = df['loan_status']

# 2. Feature Engineering

We will engineer some new features based on domain knowledge:

1. Debt to Income Ratio: Loan amount divided by income.
2. Age to Employment Ratio: Person's age divided by employment length plus one.

In [15]:
from sklearn.compose import make_column_selector as selector

# Custom transformer for feature engineering after preprocessing
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self, feature_names=None):
        self.feature_names = feature_names
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = pd.DataFrame(X, columns=self.feature_names)
        X_['debt_to_income'] = X_['loan_amnt'] / X_['person_income']
        X_['age_to_employment'] = X_['person_age'] / (X_['person_emp_length'] + 1)
        return X_.values  # Return as NumPy array for compatibility with the pipeline

# 3. Preprocessing: Handling Categorical and Numerical Data
We'll create pipelines for preprocessing numerical and categorical data.

In [16]:
# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Define preprocessing steps for numerical and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# 4. Model Development and Training
We will develop models using Gradient Boosting and XGBoost to predict loan status. We'll also use feature selection and evaluate performance with classification metrics.

In [17]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model 1: Gradient Boosting Classifier
gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_engineer', FeatureEngineer()),
    ('feature_selector', SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))),
    ('model', GradientBoostingClassifier())
])

# Train the model
gb_pipeline.fit(X_train, y_train)

# Make predictions
y_pred_gb = gb_pipeline.predict(X_test)
y_pred_gb_proba = gb_pipeline.predict_proba(X_test)[:, 1]

# Evaluation
print("Gradient Boosting Classifier Results:")
print(classification_report(y_test, y_pred_gb))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_gb))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_gb_proba))

KeyError: 'loan_amnt'

# 5. Model 2: XGBoost
We'll use XGBoost and compare it with the Gradient Boosting model.

In [10]:
# Model 2: XGBoost Classifier
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_engineer', FeatureEngineer()),
    ('model', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

# Train the model
xgb_pipeline.fit(X_train, y_train)

# Make predictions
y_pred_xgb = xgb_pipeline.predict(X_test)
y_pred_xgb_proba = xgb_pipeline.predict_proba(X_test)[:, 1]

# Evaluation
print("\nXGBoost Classifier Results:")
print(classification_report(y_test, y_pred_xgb))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_xgb_proba))

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

# 6. Model Comparison and Cross-Validation
We will compare models using cross-validation.

In [None]:
# Cross-validation for model comparison
models = [
    ('Gradient Boosting', gb_pipeline),
    ('XGBoost', xgb_pipeline)
]

for name, model in models:
    scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
    print(f"{name} - Mean ROC AUC: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# 7. Feature Importance
We'll extract and visualize feature importance from the Gradient Boosting model.

In [None]:
# Feature importance for Gradient Boosting Classifier
feature_importance = gb_pipeline.named_steps['model'].feature_importances_
feature_names = (gb_pipeline.named_steps['preprocessor'].get_feature_names_out().tolist() +
                 ['debt_to_income', 'age_to_employment'])

# Sort feature importances in descending order
indices = np.argsort(feature_importance)[::-1]

# Plot top 10 features
plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importance[indices][:10], y=np.array(feature_names)[indices][:10])
plt.title('Top 10 Feature Importances - Gradient Boosting Classifier')
plt.show()