# Credit Risk Modeling Using Machine Learning – Full Python Data Science Project (Step-by-Step)

## YouTube link: https://youtu.be/lhI999ETwmA?si=lNKZTXFXd2OsCiK6

### "In this project, we’ll build a Credit Risk Modeling system using Python and Machine Learning from scratch. You’ll learn how to process real-world financial data, apply Scikit-learn for model training, and deploy an interactive dashboard with Streamlit."

## Dataset: https://www.kaggle.com/datasets/kabure/german-credit-data-with-risk

### About the dataset: "It's a classic dataset of Good and Bad Loans"
### Features -
### Primary ID key, Age (Int), Sex (Male / Female), Job (Int), Housing (Own / Rent / Free), Saving Accounts (NA / Little / Moderate / Rich / Quite Rich), Checking Account (NA / Little / Moderate / Rich / Quite Rich), Credit Amount (Int), Duration (Int), Purpose (Categ.), Target: Risk (good / bad)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn as skl
import joblib
from sklearn.preprocessing import StandardScaler


In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
sns.set_style('whitegrid')

## EDA and Feature Engineering

In [None]:
df = pd.read_csv('german_credit_data.csv')

In [None]:
df.head()

In [None]:
df["Age"].describe()

In [None]:
# Remove the "Unnamed: 0" column (presumably a row index saved into the CSV).
# errors="ignore" keeps this cell safe to re-run once the column is already gone.
df.drop(columns=["Unnamed: 0"], inplace=True, errors="ignore")

In [None]:
df.head()

In [None]:
df["Risk"].value_counts()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
# Call describe() so the summary statistics are displayed; the bare attribute
# only shows the repr of the bound method.
df.describe()

In [None]:
df["Job"].unique()

In [None]:
df.isna().sum()

In [None]:
df.duplicated().sum()

In [None]:
# Drop every row that has a missing value in any column.
# NOTE(review): the dataset notes at the top list NA as a value of the savings /
# checking account fields, so this may discard rows where NA is meaningful — confirm.
df = df.dropna()

In [None]:
df.isna().sum()

In [None]:
df.info()

In [None]:
df.columns

In [None]:
df[['Age', "Credit amount", "Duration"]].hist(bins=20, figsize=(12,6), edgecolor='black')

In [None]:
# Side-by-side boxplots of the numeric columns, one subplot per column.
numeric_cols = ['Age', 'Credit amount', 'Duration']
plt.figure(figsize=(10,6))
for position, col in enumerate(numeric_cols, start=1):
    plt.subplot(1, len(numeric_cols), position)
    sns.boxplot(y=df[col], color='lightblue')
    # Title with the column name (the loop index was shown before).
    plt.title(f'Boxplot of {col}')
plt.tight_layout()
plt.show()

In [None]:
df.query('Duration >= 60')

In [None]:
# Count plot of each categorical feature split by Risk, one figure per feature.
# The columns are listed explicitly because no earlier cell defines
# categorical_cols; the names follow the dataset description and the feature
# list used for modelling.
categorical_cols = ["Sex", "Housing", "Saving accounts", "Checking account", "Purpose"]
for col in categorical_cols:
    # One fresh figure per feature; the former plt.subplot() call only left an
    # empty axes behind in a figure that was immediately abandoned.
    plt.figure(figsize=(6,4))
    # Order the bars by overall frequency of each category.
    sns.countplot(data=df, x=col, hue='Risk', palette='Set2', order = df[col].value_counts().index)
    plt.title(f'Countplot of {col} by Risk')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
corr = df[["Age", "Job", "Credit amount", "Duration"]].corr()
corr

In [None]:
sns.heatmap(corr, annot=True, cmap='coolwarm')

In [None]:
df.groupby('Job')['Credit amount'].mean()

In [None]:
df.groupby('Sex')['Credit amount'].mean()

In [None]:
# Mean credit amount for every Housing x Purpose combination.
# The aggregation is named by string: passing the np.mean callable is
# deprecated in recent pandas releases and yields the same result.
pd.pivot_table(df, values='Credit amount', index='Housing', columns="Purpose", aggfunc="mean")

In [None]:
sns.scatterplot(data=df, x='Age', y='Credit amount', hue='Sex', palette='Set1', size = 'Duration', sizes=(20, 200))

In [None]:
sns.violinplot(data=df, x='Saving accounts', y='Credit amount', palette='Set3')

In [None]:
# Share of each Risk class, expressed as a percentage of all rows.
risk_share = df["Risk"].value_counts(normalize=True)
risk_share * 100

In [None]:
# Boxplots of each numeric column grouped by Risk, laid out in a single 1x3 figure.
numeric_cols = ["Age", "Credit amount", "Duration"]
plt.figure(figsize=(15, 5))
for position, col in enumerate(numeric_cols, start=1):
    plt.subplot(1, len(numeric_cols), position)
    sns.boxplot(data= df, x='Risk', y=col, palette='Set1')
    plt.title(f'Boxplot of {col} by Risk')
# Lay out and render once, after all subplots exist; calling plt.show() inside
# the loop rendered each subplot in a separate figure.
plt.tight_layout()
plt.show()

In [None]:
df.groupby('Risk')[['Age', 'Credit amount', 'Duration']].mean()

In [None]:

# Grid of count plots: every categorical feature split by Risk in one figure.
# The columns are listed explicitly because no earlier cell defines
# categorical_cols; the names follow the dataset description and the feature
# list used for modelling.
categorical_cols = ["Sex", "Housing", "Saving accounts", "Checking account", "Purpose"]
# Create the figure once so all subplots share the same 3x3 grid.
plt.figure(figsize=(12,10))
for position, col in enumerate(categorical_cols, start=1):
    plt.subplot(3, 3, position)
    sns.countplot(data=df, x=col, hue='Risk', palette='Set2', order = df[col].value_counts().index)
    plt.xticks(rotation=45)
# Lay out and render after the loop; doing it per iteration produced one
# mostly empty figure per feature.
plt.tight_layout()
plt.show()

In [None]:
features = ["Age", "Sex", "Job", "Saving accounts", "Checking account", "Credit amount", "Duration", "Purpose", "Housing"]

In [None]:
target = "Risk"

In [None]:
df_model = df[features + [target]].copy()

In [None]:
# Call head() so the first rows are displayed; the bare attribute only shows
# the repr of the bound method.
df_model.head()

In [None]:
# NOTE(review): this repeats the df_model construction from an earlier cell.
# Running it again rebuilds df_model from df, i.e. with the original
# (un-encoded) feature and target columns.
df_model = df[features + [target]].copy()

In [None]:
from sklearn.preprocessing import LabelEncoder


In [None]:
# Categorical feature columns: the object-dtype columns of df_model, with the
# "Risk" target removed because it is encoded separately.
cat_cols = df_model.select_dtypes(include= "object").columns.drop("Risk")

In [None]:
le_dict = {}

In [None]:
cat_cols

In [None]:
# Fit one LabelEncoder per categorical feature, overwrite the column with its
# integer codes, remember the encoder in le_dict, and save it to
# '<column>_encoder.pkl'.
for feature in cat_cols:
    encoder = LabelEncoder()
    encoder.fit(df_model[feature])
    df_model[feature] = encoder.transform(df_model[feature])
    le_dict[feature] = encoder
    joblib.dump(encoder, f'{feature}_encoder.pkl')

In [None]:
le_target = LabelEncoder()

In [None]:
# Replace the Risk labels with integer codes, fitting le_target in the process.
# NOTE(review): LabelEncoder numbers classes in sorted order, so bad/good would
# presumably map to 0/1 — verify with le_target.classes_.
df_model[target] = le_target.fit_transform(df_model[target])

In [None]:
df_model[target]

In [None]:
joblib.dump(le_target, 'target_encoder.pkl')

In [None]:
df_model.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the encoded frame into the label vector and the feature matrix.
y = df_model[target]
X = df_model.drop(columns=target)


In [None]:
X

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# of y in both splits, and random_state fixes the shuffle so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [None]:
def train_model(model, param_grid, X_train, y_train, X_test, y_test, *,
                cv=5, scoring='accuracy', n_jobs=-1):
    """Tune ``model`` with a cross-validated grid search and score it on the test split.

    Args:
        model: Unfitted scikit-learn estimator to tune.
        param_grid: Hyperparameter grid handed to ``GridSearchCV``.
        X_train: Training features used for the grid search.
        y_train: Training labels used for the grid search.
        X_test: Held-out features, used only for the final accuracy.
        y_test: Held-out labels, used only for the final accuracy.
        cv: Cross-validation setting forwarded to ``GridSearchCV`` (default 5).
        scoring: Metric the grid search optimises (default ``'accuracy'``).
        n_jobs: Parallelism forwarded to ``GridSearchCV`` (default -1).

    Returns:
        A tuple ``(best_model, accuracy, best_params)``: the best estimator
        found by the search, its accuracy on the test split, and the
        winning hyperparameters. The second element is always computed with
        ``accuracy_score``, regardless of ``scoring``.
    """
    search = GridSearchCV(model, param_grid, cv=cv, n_jobs=n_jobs, scoring=scoring)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    accuracy = accuracy_score(y_test, best_model.predict(X_test))
    return best_model, accuracy, search.best_params_


In [None]:
# Hyperparameter grid for the decision tree search, followed by the
# class-weighted base estimator it is applied to.
dt_param_grid = dict(
    max_depth=[3, 5, 7, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
dt = DecisionTreeClassifier(class_weight='balanced', random_state=1)

In [None]:
best_dt, dt_accuracy, dt_best_params = train_model(dt, dt_param_grid, X_train, y_train, X_test, y_test)

In [None]:
print("Best Decision Tree Model:", best_dt)

In [None]:
print("DT Accuracy:", dt_accuracy)

In [None]:
# Class-weighted random forest with a fixed seed.
# n_jobs=1 keeps the forest itself single-threaded — presumably because
# train_model's grid search already runs with n_jobs=-1.
rf = RandomForestClassifier(random_state=1, class_weight='balanced', n_jobs=1)


In [None]:
# Hyperparameter grid for the random forest search; the Extra Trees search
# later in the notebook is run with this same grid.
rf_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 5, 7, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [None]:
best_rf, acc_rf, rf_best_params = train_model(rf, rf_param_grid, X_train, y_train, X_test, y_test)

In [None]:
print("Best Random Forest Params:", rf_best_params)
print("RF Accuracy:", acc_rf)

In [None]:
# Class-weighted Extra Trees classifier, configured like the random forest
# above (same seed, class weighting and n_jobs=1).
et = ExtraTreesClassifier(random_state=1, class_weight='balanced', n_jobs=1)

In [None]:
# Grid-search the Extra Trees model. Note that it is tuned over rf_param_grid,
# the grid defined for the random forest.
best_et, acc_et, et_best_params = train_model(et, rf_param_grid, X_train, y_train, X_test, y_test)

In [None]:
print("Best ET params", et_best_params)
print("ET Accuracy:", acc_et)

In [None]:
# Persist the tuned Extra Trees model to disk.
# NOTE(review): this model is saved without comparing its accuracy to the
# decision tree / random forest results above — confirm that is intended.
joblib.dump(best_et, 'best_extra_trees_model.pkl')