In [1]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from collections import Counter
from sklearn.datasets import  fetch_openml
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from sklearn.calibration import CalibratedClassifierCV

In [2]:
# Retrieve german credit dataset from openml
X, y = fetch_openml("credit-g", version=1, as_frame=True, parser='auto', return_X_y=True)

one_hot_encoder = make_column_transformer(
        (OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
        make_column_selector(dtype_include='category')),
        remainder='passthrough')

X = one_hot_encoder.fit_transform(X)
scaler = StandardScaler()
data = scaler.fit_transform(X)

# Get the class distribution for the target variable y
class_distribution = y.value_counts().sort_index()

print(class_distribution)

# Define cost matrix
cost_m = [[0, 1], 
            [5, 0]]

# Split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3, random_state=0, stratify=y)

names = ['random forest', 'linear SVM', 'Naive Bayes']
classifiers = [RandomForestClassifier(n_estimators=100, random_state=0), 
                SVC(kernel='linear',  probability=True),
                GaussianNB()]

bad     300
good    700
Name: class, dtype: int64


### COST MINIMIZATION

In [16]:
for name, clf in zip(names, classifiers):
    print(name)
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    print("\ncost minimization without probability calibration")
    # Encode labels to match train and test data
    model = clf.fit(X_train, y_train)
    y_pred_prob = model.predict_proba(X_test)
    y_pred = np.argmin(np.matmul(y_pred_prob, np.array(cost_m).T), axis=1)
    print(classification_report(y_test, y_pred))
    conf_m = confusion_matrix(y_test, y_pred).T # transpose to align with slides
    print(conf_m) 
    print("the cost is:")
    print(np.sum(conf_m * cost_m))

    print("\ncost minimization with sigmoid calibration")
    cc = CalibratedClassifierCV(clf, method="sigmoid", cv=3)
    model = cc.fit(X_train, y_train)
    y_pred_prob = model.predict_proba(X_test)
    y_pred = np.argmin(np.matmul(y_pred_prob, np.array(cost_m).T), axis=1)
    print(classification_report(y_test, y_pred))
    conf_m = confusion_matrix(y_test, y_pred).T # transpose to align with slides
    print(conf_m) 
    print("the cost is:")
    print(np.sum(conf_m * cost_m))

    print("\ncost minimization with isotonic calibration")
    cc = CalibratedClassifierCV(clf, method="isotonic", cv=3)
    model = cc.fit(X_train, y_train)
    y_pred_prob = model.predict_proba(X_test)
    y_pred = np.argmin(np.matmul(y_pred_prob, np.array(cost_m).T), axis=1)
    print(classification_report(y_test, y_pred))
    conf_m = confusion_matrix(y_test, y_pred).T # transpose to align with slides
    print(conf_m) 
    print("the cost is:")
    print(np.sum(conf_m * cost_m))

random forest

cost minimization without probability calibration
              precision    recall  f1-score   support

           0       0.41      0.98      0.58        90
           1       0.98      0.40      0.56       210

    accuracy                           0.57       300
   macro avg       0.69      0.69      0.57       300
weighted avg       0.81      0.57      0.57       300

[[ 88 127]
 [  2  83]]
the cost is:
137

cost minimization with sigmoid calibration
              precision    recall  f1-score   support

           0       0.42      0.98      0.58        90
           1       0.98      0.41      0.58       210

    accuracy                           0.58       300
   macro avg       0.70      0.70      0.58       300
weighted avg       0.81      0.58      0.58       300

[[ 88 123]
 [  2  87]]
the cost is:
133

cost minimization with isotonic calibration
              precision    recall  f1-score   support

           0       0.42      0.96      0.59        90
   

It appears that in random forest and Linear SVM cost minimization without calibration performs better compared to the other two methods. However, when using the Naive Bayes classifier, the isotonic calibration approach outperforms the other two methods. Isotonic calibration minimize the cost of Naive Bayes classifier, as the classifier may not generate well-calibrated probability estimates due to its assumption of feature independence being overly simplistic.

### SAMPLING RANDOM FOREST

In [20]:
name = 'random forest'
print(name)
clf = RandomForestClassifier(n_estimators=100, random_state=0)
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

print("with undersampling")
#change the examples of class 1 to see what happens
sampler = RandomUnderSampler(sampling_strategy={0: 150, 1: 294}, random_state=1) 
X_rs, y_rs = sampler.fit_resample(X_train, y_train)
print(Counter(y_rs))

model = clf.fit(X_rs, y_rs)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
conf_m = confusion_matrix(y_test, y_pred).T # transpose to align with slides
print(conf_m)
loss = np.sum(conf_m * cost_m)
print("the cost is:")
print("%d\n" %loss)

print("with oversampling")
sampler = RandomOverSampler(sampling_strategy={0: 252, 1: 588}, random_state=1) 
X_rs, y_rs = sampler.fit_resample(X_train, y_train)
print(Counter(y_rs))

model = clf.fit(X_rs, y_rs)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
conf_m = confusion_matrix(y_test, y_pred).T # transpose to align with slides
print(conf_m)
loss = np.sum(conf_m * cost_m)
print("the cost is:")
print("%d\n" %loss)

print("with combination")
sampler = RandomUnderSampler(sampling_strategy={0: 150, 1: 294}, random_state=1)
X_rs, y_rs = sampler.fit_resample(X_train, y_train)
sampler = RandomOverSampler(sampling_strategy={0: 252, 1: 588}, random_state=1)
X_rs, y_rs = sampler.fit_resample(X_rs, y_rs)
print(Counter(y_rs))

model = clf.fit(X_rs, y_rs)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
conf_m = confusion_matrix(y_test, y_pred).T # transpose to align with slides
print(conf_m)
loss = np.sum(conf_m * cost_m)
print("the cost is:")
print("%d\n" %loss)

random forest
with undersampling
Counter({1: 294, 0: 150})
              precision    recall  f1-score   support

           0       0.63      0.46      0.53        90
           1       0.79      0.89      0.84       210

    accuracy                           0.76       300
   macro avg       0.71      0.67      0.68       300
weighted avg       0.74      0.76      0.74       300

[[ 41  24]
 [ 49 186]]
the cost is:
269

with oversampling
Counter({1: 588, 0: 252})




              precision    recall  f1-score   support

           0       0.67      0.38      0.48        90
           1       0.78      0.92      0.84       210

    accuracy                           0.76       300
   macro avg       0.72      0.65      0.66       300
weighted avg       0.74      0.76      0.73       300

[[ 34  17]
 [ 56 193]]
the cost is:
297

with combination
Counter({1: 588, 0: 252})




              precision    recall  f1-score   support

           0       0.66      0.42      0.51        90
           1       0.79      0.90      0.84       210

    accuracy                           0.76       300
   macro avg       0.72      0.66      0.68       300
weighted avg       0.75      0.76      0.74       300

[[ 38  20]
 [ 52 190]]
the cost is:
280



### SAMPLING linear SVM

In [21]:
name = 'linear SVM'
print(name)
clf = SVC(kernel='linear',  probability=True)           
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

print("with undersampling")
#change the examples of class 1 to see what happens
sampler = RandomUnderSampler(sampling_strategy={0: 150, 1: 294}, random_state=1) 
X_rs, y_rs = sampler.fit_resample(X_train, y_train)
print(Counter(y_rs))

model = clf.fit(X_rs, y_rs)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
conf_m = confusion_matrix(y_test, y_pred).T # transpose to align with slides
print(conf_m)
loss = np.sum(conf_m * cost_m)
print("the cost is:")
print("%d\n" %loss)

print("with oversampling")
sampler = RandomOverSampler(sampling_strategy={0: 252, 1: 588}, random_state=1) 
X_rs, y_rs = sampler.fit_resample(X_train, y_train)
print(Counter(y_rs))

model = clf.fit(X_rs, y_rs)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
conf_m = confusion_matrix(y_test, y_pred).T # transpose to align with slides
print(conf_m)
loss = np.sum(conf_m * cost_m)
print("the cost is:")
print("%d\n" %loss)

print("with combination")
sampler = RandomUnderSampler(sampling_strategy={0: 150, 1: 294}, random_state=1)
X_rs, y_rs = sampler.fit_resample(X_train, y_train)
sampler = RandomOverSampler(sampling_strategy={0: 252, 1: 588}, random_state=1)
X_rs, y_rs = sampler.fit_resample(X_rs, y_rs)
print(Counter(y_rs))

model = clf.fit(X_rs, y_rs)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
conf_m = confusion_matrix(y_test, y_pred).T # transpose to align with slides
print(conf_m)
loss = np.sum(conf_m * cost_m)
print("the cost is:")
print("%d\n" %loss)

linear SVM
with undersampling
Counter({1: 294, 0: 150})
              precision    recall  f1-score   support

           0       0.54      0.51      0.53        90
           1       0.80      0.81      0.80       210

    accuracy                           0.72       300
   macro avg       0.67      0.66      0.67       300
weighted avg       0.72      0.72      0.72       300

[[ 46  39]
 [ 44 171]]
the cost is:
259

with oversampling
Counter({1: 588, 0: 252})




              precision    recall  f1-score   support

           0       0.68      0.46      0.55        90
           1       0.80      0.91      0.85       210

    accuracy                           0.77       300
   macro avg       0.74      0.68      0.70       300
weighted avg       0.76      0.77      0.76       300

[[ 41  19]
 [ 49 191]]
the cost is:
264

with combination
Counter({1: 588, 0: 252})




              precision    recall  f1-score   support

           0       0.55      0.44      0.49        90
           1       0.78      0.84      0.81       210

    accuracy                           0.72       300
   macro avg       0.66      0.64      0.65       300
weighted avg       0.71      0.72      0.71       300

[[ 40  33]
 [ 50 177]]
the cost is:
283



### SAMPLING Naive Bayes

In [3]:
name = 'Naive Bayes'
print(name)
clf = GaussianNB()
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

print("with undersampling")
#change the examples of class 1 to see what happens
sampler = RandomUnderSampler(sampling_strategy={0: 150, 1: 294}, random_state=1) 
X_rs, y_rs = sampler.fit_resample(X_train, y_train)
print(Counter(y_rs))

model = clf.fit(X_rs, y_rs)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
conf_m = confusion_matrix(y_test, y_pred).T # transpose to align with slides
print(conf_m)
loss = np.sum(conf_m * cost_m)
print("the cost is:")
print("%d\n" %loss)

print("with oversampling")
sampler = RandomOverSampler(sampling_strategy={0: 252, 1: 588}, random_state=1) 
X_rs, y_rs = sampler.fit_resample(X_train, y_train)
print(Counter(y_rs))

model = clf.fit(X_rs, y_rs)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
conf_m = confusion_matrix(y_test, y_pred).T # transpose to align with slides
print(conf_m)
loss = np.sum(conf_m * cost_m)
print("the cost is:")
print("%d\n" %loss)

print("with combination")
sampler = RandomUnderSampler(sampling_strategy={0: 150, 1: 294}, random_state=1)
X_rs, y_rs = sampler.fit_resample(X_train, y_train)
sampler = RandomOverSampler(sampling_strategy={0: 252, 1: 588}, random_state=1)
X_rs, y_rs = sampler.fit_resample(X_rs, y_rs)
print(Counter(y_rs))

model = clf.fit(X_rs, y_rs)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
conf_m = confusion_matrix(y_test, y_pred).T # transpose to align with slides
print(conf_m)
loss = np.sum(conf_m * cost_m)
print("the cost is:")
print("%d\n" %loss)

Naive Bayes
with undersampling
Counter({1: 294, 0: 150})
              precision    recall  f1-score   support

           0       0.30      0.93      0.45        90
           1       0.68      0.06      0.11       210

    accuracy                           0.32       300
   macro avg       0.49      0.50      0.28       300
weighted avg       0.57      0.32      0.22       300

[[ 84 197]
 [  6  13]]
the cost is:
227

with oversampling
Counter({1: 588, 0: 252})
              precision    recall  f1-score   support

           0       0.33      0.91      0.49        90
           1       0.85      0.22      0.35       210

    accuracy                           0.43       300
   macro avg       0.59      0.57      0.42       300
weighted avg       0.70      0.43      0.39       300

[[ 82 164]
 [  8  46]]
the cost is:
204

with combination
Counter({1: 588, 0: 252})
              precision    recall  f1-score   support

           0       0.31      0.90      0.46        90
           



Based on the results obtained, it seems that in random forest and Linear SVM classifiers, the undersampling method demonstrates better results regarding the cost minimization. This tecnique removes instances from the majority class. As a result, it can rebalance the class distribution and prevent the classifier from being biased towards the majority class. However, the Naive Bayes classifier shows improved performance when using the oversampling technique, which involves duplicating instances from the minority class. This approach can improve the classifier's ability to capture characteristics of the minority class, leading to better results.

In [6]:
names = ['random forest', 'linear SVM', 'Naive Bayes']
for name, clf in zip(names, classifiers):
    print(name)
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    print("\nwith weights")
    # now create the sample weights according to y
    weights = np.zeros(y_train.shape[0])
    weights[np.where(y_train == 1)] = 1;
    weights[np.where(y_train == 0)] = 4;
    model = clf.fit(X_train, y_train, weights)
    pred_test = clf.predict(X_test)
    print(classification_report(y_test, pred_test))
    conf_m = confusion_matrix(y_test, pred_test).T # transpose to align with slides
    print(conf_m)
    loss = np.sum(conf_m * cost_m)
    print("the cost is:")
    print("%d\n" %loss)

random forest

with weights
              precision    recall  f1-score   support

           0       0.66      0.32      0.43        90
           1       0.76      0.93      0.84       210

    accuracy                           0.75       300
   macro avg       0.71      0.63      0.63       300
weighted avg       0.73      0.75      0.72       300

[[ 29  15]
 [ 61 195]]
the cost is:
320

linear SVM

with weights
              precision    recall  f1-score   support

           0       0.35      0.96      0.51        90
           1       0.93      0.24      0.38       210

    accuracy                           0.46       300
   macro avg       0.64      0.60      0.45       300
weighted avg       0.75      0.46      0.42       300

[[ 86 159]
 [  4  51]]
the cost is:
179

Naive Bayes

with weights
              precision    recall  f1-score   support

           0       0.31      0.89      0.46        90
           1       0.76      0.15      0.25       210

    accuracy         

It appears that the linear SVM classifier with weights demonstrates better performance compared to the random forest and Naive Bayes. In the case of linear SVM, which is a margin-based algorithm, applying weights can shift the decision boundary towards the minority class. This leads to better identification of the "bad" customers minimizing the cost associated with misclassification.