## Exploring different techniques to process German Credit Data and predict credit risk
- To predict whether a customer will pay back a loan or credit.

## Installing and importing packages

In [1]:
! pip install catboost --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
! pip install category_encoders --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.2 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m71.7/81.2 KB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 KB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
! pip install -U scikit-learn --quiet

In [4]:
# importing packages 

import pandas as pd
import numpy as np
import seaborn as sns

from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from category_encoders import OrdinalEncoder 
from sklearn.preprocessing import OneHotEncoder
#from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import svm
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier
#from lightgbm import LGBMClassifier

from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn import metrics

from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours


In [5]:
RANDOM_SEED = 42  #defining seed value to be used so that results can be reproducible 

## Importing dataset and spliting into training and validation dataset

In [6]:
# Loading german credit dataset from uci portal

path = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
df = pd.read_csv(path, header = None, sep=' ')
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,A14,12,A32,A42,1736,A61,A74,3,A92,A101,...,A121,31,A143,A152,1,A172,1,A191,A201,1
996,A11,30,A32,A41,3857,A61,A73,4,A91,A101,...,A122,40,A143,A152,1,A174,1,A192,A201,1
997,A14,12,A32,A43,804,A61,A75,4,A93,A101,...,A123,38,A143,A152,1,A173,1,A191,A201,1
998,A11,45,A32,A43,1845,A61,A73,4,A93,A101,...,A124,23,A143,A153,1,A173,1,A192,A201,2


In [7]:
# Assign column names
column_names = ['Checking_acct_status', 'Duration_months', 'Credit_history', 'Purpose', 'Credit_amount',
           'Savings', 'Years_employed', 'Installment_rate', 'Status', 'Debtors', 'Residence', 'Property',
           'Age', 'Other_installment_plans', 'Housing', 'No_existing_credits', 'Job', 'Dependants',
           'Telephone', 'Foreign_worker', 'Target']
# Add header row
df.columns = column_names

df.head()

Unnamed: 0,Checking_acct_status,Duration_months,Credit_history,Purpose,Credit_amount,Savings,Years_employed,Installment_rate,Status,Debtors,...,Property,Age,Other_installment_plans,Housing,No_existing_credits,Job,Dependants,Telephone,Foreign_worker,Target
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [8]:
# Save the updated DataFrame to a new CSV file
#df.to_csv("german_credit_dataset.csv", index=False)

In [9]:
df.Target.value_counts()  #unbalanced dataset (1 = Good,  2 = Bad credit risk)

1    700
2    300
Name: Target, dtype: int64

In [10]:
# changing target column label to binary 0 and 1
# can also use label encoding intead

df.Target[df['Target']==1] = 0
df.Target[df['Target']==2] = 1
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Target[df['Target']==1] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Target[df['Target']==2] = 1


Unnamed: 0,Checking_acct_status,Duration_months,Credit_history,Purpose,Credit_amount,Savings,Years_employed,Installment_rate,Status,Debtors,...,Property,Age,Other_installment_plans,Housing,No_existing_credits,Job,Dependants,Telephone,Foreign_worker,Target
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,0
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,1
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,0
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,0
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,1


In [11]:
df.nunique()

Checking_acct_status         4
Duration_months             33
Credit_history               5
Purpose                     10
Credit_amount              921
Savings                      5
Years_employed               5
Installment_rate             4
Status                       4
Debtors                      3
Residence                    4
Property                     4
Age                         53
Other_installment_plans      3
Housing                      3
No_existing_credits          4
Job                          4
Dependants                   2
Telephone                    2
Foreign_worker               2
Target                       2
dtype: int64

In [12]:
# separate features and label 
# Use train and validation, replace test with validation
# also k-fold can be used in place of train-validation so that there are more scenarios done and higher accuracy

def split_data(df):
  X = df.drop(['Target','Status'], axis=1) #dropped 'Status' which refers to 'Personal status and sex' of person which likely create bias
  X.head()

  y = df.Target

  X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, 
                                                            stratify=y, 
                                                            random_state=RANDOM_SEED)
  return X_train, X_valid, y_train, y_valid

In [13]:
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)
#df.head()

In [14]:
# Select categorical columns
# I think there are 16 categorical variables (minus status) and 3 numerical variables
# 16 categorical variables = 13 nominal + 3 ordinal

cat_cols = ['Checking_acct_status', 'Credit_history', 'Purpose', 'Savings', 'Years_employed', 'Installment_rate', 'Debtors', 'Residence', 
            'Property', 'Other_installment_plans', 'Housing', 'No_existing_credits', 'Job', 'Dependants', 'Telephone', 'Foreign_worker']

nominal_cols = ['Checking_acct_status', 'Purpose', 'Years_employed', 'Installment_rate', 'Debtors', 'Residence', 
            'Property', 'Other_installment_plans', 'Housing', 'No_existing_credits', 'Dependants', 'Telephone', 'Foreign_worker']

num_cols = ['Duration_months', 'Credit_amount', 'Age']

ordinal_cols = ['Credit_history', 'Savings', 'Job'] # years_employed ('Attribute 7') can also be considered ordinal

# Ordinal Encoder
- For features that can represented by integers with a meaning to their order







In [15]:
def Ordinal_Encoder(X_train, X_valid):
  maplist = [{'col': 'Credit_history', 'mapping': {'A30': 4, 'A31': 3,'A32': 2, 'A33': 1, 'A34': 0}},
             {'col': 'Savings', 'mapping': {'A61': 1, 'A62': 2,'A63': 3, 'A64': 4, 'A65': 0}},
             {'col': 'Job', 'mapping': {'A171': 0, 'A172': 1,'A173': 2, 'A174': 3}
              }]
  oe = OrdinalEncoder(mapping=maplist)
  X_train[ordinal_cols] = oe.fit_transform(X_train[ordinal_cols])
  X_valid[ordinal_cols] = oe.transform(X_valid[ordinal_cols])
  return X_train, X_valid


# One hot encoding of categorical columns (specifically nominal_cols)

In [16]:
from sklearn.preprocessing import OneHotEncoder

def one_hot_encoding(X_train, X_valid):
  
  enc = OneHotEncoder(handle_unknown='ignore')
  
  # Fit and transform X_train
  X_train_enc = enc.fit_transform(X_train[cat_cols])
  X_train_enc = pd.DataFrame(X_train_enc.toarray(), columns=enc.get_feature_names_out(cat_cols))
  X_train = pd.concat([X_train.reset_index(drop=True), X_train_enc], axis=1)
  X_train.drop(columns=cat_cols, inplace=True)
  
  # Transform X_valid
  X_valid_enc = enc.transform(X_valid[cat_cols])
  X_valid_enc = pd.DataFrame(X_valid_enc.toarray(), columns=enc.get_feature_names_out(cat_cols))
  X_valid = pd.concat([X_valid.reset_index(drop=True), X_valid_enc], axis=1)
  X_valid.drop(columns=cat_cols, inplace=True)

  return X_train, X_valid


# Normalizing numerical columns

- Fit the MinMaxScaler on the training data, then use the scaler to transform both training and testing dataset

##Note
Data transformation with held out data  Just as it is important to test a predictor on data held-out from training, preprocessing (such as standardization, feature selection, etc.) and similar data transformations similarly should be learnt from a training set and applied to held-out data for prediction

In [17]:
def normalize(X_train, X_valid):
  scaler = MinMaxScaler()
  # apply the scaler to the dataset
  X_train[num_cols] = scaler.fit_transform(X_train[num_cols]) #fit and transform on training dataset
  #X_train[num_cols] = scaler.transform(X_train[num_cols]) #transform training dataset
  X_valid[num_cols] = scaler.transform(X_valid[num_cols]) #transform test dataset 

  # print the normalized dataset
  return X_train, X_valid

# Data balancing

- SMOTEENN for combined over and under sampling
- Also tried SMOTE but SMOTEENN performs better

In [18]:
from imblearn.combine import SMOTEENN #Combined over and under sampling

def smoteenn_oversample(X_train, y_train, random_state=42):

    # Instantiate the SMOTEENN object
    smote_enn = SMOTEENN(random_state=RANDOM_SEED, enn=EditedNearestNeighbours(sampling_strategy='majority'))

    # Fit and transform the train dataset
    X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train, y_train)

    # Return the resampled datasets
    return X_train_resampled, y_train_resampled


# Training the model

In [19]:

def train_classifier(X_train, y_train):
    #model = CatBoostClassifier(random_state=RANDOM_SEED).fit(X_train, y_train, silent=True) #cat_features=cat_cols, 
    #model = XGBClassifier(random_state=RANDOM_SEED).fit(X_train, y_train)
    #model = LogisticRegression(random_state=RANDOM_SEED).fit(X_train, y_train)
    #model = RandomForestClassifier(random_state=RANDOM_SEED).fit(X_train, y_train)
    #model = LinearDiscriminantAnalysis().fit(X_train, y_train)
    #model = svm.SVC(kernel='linear', probability=True, random_state=RANDOM_SEED).fit(X_train, y_train)
    model = GradientBoostingClassifier(random_state=RANDOM_SEED).fit(X_train, y_train)
    #model = LGBMClassifier(random_state=RANDOM_SEED).fit(X_train, y_train)
    #model = RidgeClassifier(random_state=RANDOM_SEED).fit(X_train, y_train)

    return model

# Making predictions using the trained model

In [20]:
# some models don't have predict_proba option

def make_predictions(model, X_valid):
    y_pred = model.predict(X_valid)
    return y_pred

In [21]:
def make_predictions_proba(model, X_valid, threshold):
    y_pred = (model.predict_proba(X_valid) > threshold).astype(int)[:, 1]
    return y_pred

In [22]:
def calculate_fbeta_score(y_valid, y_pred):
    return fbeta_score(y_valid,y_pred, beta=2) # also try beta = 0.5

# Evaluating the model
- We evaluate the model using repeated stratified k-fold cross-testation.
- This provides a better estimate of model performance that is not too optimistically biased compared to a single train-test split.

## K-fold validation
- StratifiedKFold - ideal when there is a large imbalance in the distribution of the target classes
- RepeatedStratifiedKFold is a class in the scikit-learn library that implements a cross-validation method for machine learning models. It is used to split a dataset into multiple train and test sets so that a model can be trained and tested on different subsets of the data.





In [23]:
# evaluating a model

X = df.drop(['Target','Status'], axis=1) #dropped 'Status' which refers to 'Personal status and sex' of person which likely create bias
y = df.Target

#model = CatBoostClassifier(random_state=RANDOM_SEED, cat_features=cat_cols, silent=True)

def evaluate_model_test(X, y, model):
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=RANDOM_SEED)
    fbeta_scorer = make_scorer(fbeta_score, beta=2)
    scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1, scoring=fbeta_scorer)
    return scores.mean()


In [24]:
#fbeta_score1 = evaluate_model_test(X, y, model)
#fbeta_score1

# 0.4412712492267319

In [25]:
from sklearn.metrics import make_scorer, fbeta_score

def evaluate_model(X, y, model):
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=RANDOM_SEED)
    fbeta_scorer = make_scorer(fbeta_score, beta=0.5)
    scores = []
    for train_idx, test_idx in cv.split(X, y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_valid, y_valid = X.iloc[test_idx], y.iloc[test_idx]
        model.fit(X_train, y_train)
        y_pred_proba = model.predict_proba(X_valid)[:, 1]
        y_pred = (y_pred_proba > 0.35).astype(int)
        score = fbeta_score(y_valid, y_pred, beta=0.5)
        scores.append(score)
    return np.mean(scores)


In [26]:
#fbeta_score2 = evaluate_model(X, y, model)
#fbeta_score2

# 0.5643859105484922

# Using the pipeline

In [27]:
def timestamp():
    return datetime.now().strftime('%H:%M:%S.%f')[:-3]

In [28]:
print(f"{timestamp()}: Starting executing code...")


X_train, X_valid, y_train, y_valid = split_data(df)

#X_train, X_valid = Ordinal_Encoder(X_train, X_valid)

X_train, X_valid = one_hot_encoding(X_train, X_valid)

X_train, X_valid = normalize(X_train, X_valid)
print(X_train.shape, y_train.value_counts())

X_train, y_train = smoteenn_oversample(X_train, y_train)
print(X_train.shape, y_train.value_counts())

model = train_classifier(X_train, y_train)


#y_pred = make_predictions(model, X_valid)
#score = calculate_score(y_valid, y_pred)
#print(score)

results = [] 
for threshold in np.arange(0.02, 0.68, 0.001):
    y_pred = make_predictions_proba(model, X_valid, threshold)
    score = calculate_fbeta_score(y_valid, y_pred)
    results.append(score)
    print(f'{timestamp()}: Threshold = {threshold:.3f}, score = {score:.4f}')

print(f' \n Best F-beta score: {max(results):.4f}') #Best F-beta score displayed at the bottom


04:03:00.499: Starting executing code...
(800, 67) 0    560
1    240
Name: Target, dtype: int64
(778, 67) 1    560
0    218
Name: Target, dtype: int64
04:03:01.293: Threshold = 0.020, score = 0.6928
04:03:01.302: Threshold = 0.021, score = 0.6928
04:03:01.309: Threshold = 0.022, score = 0.6928
04:03:01.316: Threshold = 0.023, score = 0.6944
04:03:01.324: Threshold = 0.024, score = 0.6944
04:03:01.331: Threshold = 0.025, score = 0.6944
04:03:01.338: Threshold = 0.026, score = 0.6944
04:03:01.344: Threshold = 0.027, score = 0.6944
04:03:01.351: Threshold = 0.028, score = 0.6944
04:03:01.358: Threshold = 0.029, score = 0.6944
04:03:01.365: Threshold = 0.030, score = 0.6961
04:03:01.372: Threshold = 0.031, score = 0.6961
04:03:01.380: Threshold = 0.032, score = 0.6977
04:03:01.388: Threshold = 0.033, score = 0.6977
04:03:01.394: Threshold = 0.034, score = 0.6993
04:03:01.402: Threshold = 0.035, score = 0.6993
04:03:01.409: Threshold = 0.036, score = 0.6993
04:03:01.416: Threshold = 0.037, 