# Credit Scoring Model using Decision Tree

## Objective

#### To predict whether an individual will experience serious delinquency in the next two years based on their past financial behavior.
#### This can help financial institutions assess creditworthiness and manage risk effectively.

In [15]:
# Importing Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

## Dataset Description
#### Source: Give me some credit-Kaggle (cs-training.csv)
#### Target Variable: SeriousDlqin2yrs (1 = serious delinquency in next 2 years, 0 = otherwise)

In [16]:
# Load the data
df = pd.read_csv('cs-training.csv', index_col=0)

### Data Cleaning & Feature Engineering

In [17]:
# Handle missing values - we'll impute them during preprocessing
# But let's check which columns have missing values
print("Missing values per column:")
print(df.isnull().sum())

Missing values per column:
SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64


#### Created new features to enhance model performance:
##### DelinquencyRatio: Total late payments per credit line
##### IncomePerDependent: Adjusted income per household member
##### TotalPastDue: Total instances of late payment
##### Replaced infinite values caused by division with NaN.

In [18]:
# Feature engineering - create new features that might be predictive
# 1. Debt-to-Income ratio (already have DebtRatio)
# 2. Payment delinquency ratio
df['DelinquencyRatio'] = (df['NumberOfTime30-59DaysPastDueNotWorse'] + 
                         df['NumberOfTimes90DaysLate'] + 
                         df['NumberOfTime60-89DaysPastDueNotWorse']) / df['NumberOfOpenCreditLinesAndLoans']
# Replace infinities with NaN (from division by zero)
df['DelinquencyRatio'] = df['DelinquencyRatio'].replace([np.inf, -np.inf], np.nan)

# 3. Credit utilization ratio (already have RevolvingUtilizationOfUnsecuredLines)
# 4. Income per dependent
df['IncomePerDependent'] = df['MonthlyIncome'] / (df['NumberOfDependents'] + 1)  # +1 to avoid division by zero
df['IncomePerDependent'] = df['IncomePerDependent'].replace([np.inf, -np.inf], np.nan)

# 5. Total past due incidents
df['TotalPastDue'] = (df['NumberOfTime30-59DaysPastDueNotWorse'] + 
                      df['NumberOfTimes90DaysLate'] + 
                      df['NumberOfTime60-89DaysPastDueNotWorse'])

In [19]:
# Prepare features and target
X = df.drop(columns=['SeriousDlqin2yrs'])
y = df['SeriousDlqin2yrs']

#### Train-Test Split
##### Data was split into training (70%) and testing (30%) sets.
##### Stratified sampling was used to preserve class distribution.

In [20]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

### Preprocessing Pipeline
#### Built a Pipeline to:
##### Impute missing values using median strategy.
##### Standardize features using StandardScaler.

In [21]:
# Create preprocessing pipeline
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values with median
    ('scaler', StandardScaler())  # Scale features
])

### Model: Decision Tree Classifier
##### Used DecisionTreeClassifier from scikit-learn.

In [22]:
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

#### Performed hyperparameter tuning with GridSearchCV:
##### Tuned: max_depth, min_samples_split, min_samples_leaf, criterion
##### Scoring metric: F1 score to handle class imbalance
##### 5-fold cross-validation

In [23]:
# Hyperparameter tuning
param_grid = {
    'classifier__max_depth': [3, 5, 7, 10, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__criterion': ['gini', 'entropy']
}

In [24]:
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [25]:
# Best model
best_model = grid_search.best_estimator_

### Model Evaluation
#### Evaluated the model on the test set:
##### Confusion Matrix
##### Classification Report: Precision, Recall, F1-score
##### Overall Accuracy

In [26]:
# Evaluate on test set
y_pred = best_model.predict(X_test)
print("\nBest Parameters:", grid_search.best_params_)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy:", accuracy_score(y_test, y_pred))


Best Parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': 3, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2}

Confusion Matrix:
[[41424   568]
 [ 2356   652]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     41992
           1       0.53      0.22      0.31      3008

    accuracy                           0.94     45000
   macro avg       0.74      0.60      0.64     45000
weighted avg       0.92      0.94      0.92     45000


Accuracy: 0.9350222222222222


### Saving the Model

In [27]:
import joblib

In [28]:
joblib.dump(best_model, 'credit_scoring_model.pkl')

['credit_scoring_model.pkl']

In [29]:
print("Numpy:", np.__version__)


Numpy: 2.2.3


NameError: name 'sklearn' is not defined