# Undersampling

Implement the cluster centroids and random undersampling techniques with the credit card default data. Then estimate a logistic regression model and report the classification evaluation metrics from both sampling methods. 

Data dictionary: `ln_balance_limit` is the log of the maximum balance the cardholder can have on the card; sex is coded 1 = female, 0 = male; education is coded 1 = graduate school, 2 = university, 3 = high school, 4 = others; marriage is coded 1 = married, 0 = single; `default_next_month` indicates whether the person defaults in the following month (1 = yes, 0 = no).

In [1]:
import os
from collections import Counter

import pandas as pd
from path import Path

In [2]:
# Location of the credit-card default CSV.
# The original absolute path is kept as the fallback, so behaviour is unchanged
# on the authoring machine; on any other machine set the CC_DEFAULT_CSV
# environment variable to the file's location instead of editing this cell.
data = Path(os.environ.get(
    'CC_DEFAULT_CSV',
    r'C:\Users\TribThapa\Desktop\Thapa\ResearchFellow\Courses\FinTech_Bootcamp_MonashUni2021\monu-mel-virt-fin-pt-05-2021-u-c\Activities\Week 11\3\05-Stu_Do_Undersampling\Resources\cc_default.csv',
))

# Read the whole file into a DataFrame; later cells split it into X and y.
df = pd.read_csv(data)

In [3]:
# Predictors are every column except the row identifier ('ID') and the
# target ('default_next_month'); column order from the file is preserved.
x_cols = df.columns[~df.columns.isin(['ID', 'default_next_month'])].tolist()

# Feature matrix and target vector used by every model below.
X = df.loc[:, x_cols]
y = df.loc[:, 'default_next_month']

In [4]:
from sklearn.model_selection import train_test_split

# Plain (non-stratified) hold-out split, seeded for repeatability.
# test_size=0.25 is scikit-learn's default when neither size is given; it is
# spelled out here only so the 75/25 split is visible to the reader.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

## Random Undersampling

In [5]:
from imblearn.under_sampling import RandomUnderSampler

# Resample the training split only (the test split is left untouched) with a
# seeded `RandomUnderSampler`, so the same rows are kept on every run.
rus = RandomUnderSampler(random_state=1)
X_undersampled, y_undersampled = rus.fit_resample(X=X_train, y=y_train)

# Class counts after resampling -- both classes should now be the same size.
Counter(y_undersampled)

Counter({0: 4968, 1: 4968})

In [6]:
from sklearn.linear_model import LogisticRegression

# Logistic regression trained on the randomly undersampled training data.
# Every hyper-parameter is pinned explicitly; they are grouped by purpose.
model = LogisticRegression(
    # --- regularisation ---
    penalty='l2',
    C=1.0,
    l1_ratio=None,
    dual=False,
    # --- intercept ---
    fit_intercept=True,
    intercept_scaling=1,
    # --- optimiser ---
    solver='lbfgs',
    max_iter=100,
    tol=0.0001,
    warm_start=False,
    # --- miscellaneous ---
    class_weight=None,
    random_state=1,
    n_jobs=None,
    verbose=0,
)

# fit() returns the estimator, so leaving it last displays the fitted model.
model.fit(X_undersampled, y_undersampled)

LogisticRegression(random_state=1)

In [7]:
from sklearn.metrics import confusion_matrix

# Score the held-out test set with the model fitted on the randomly
# undersampled data, then tabulate the results
# (rows = actual class, columns = predicted class).
predictions = model.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[3732, 2100],
       [ 740,  928]], dtype=int64)

In [8]:
from sklearn.metrics import balanced_accuracy_score

# Balanced accuracy (average of the per-class recalls) for the model trained
# on randomly undersampled data, evaluated on the untouched test split.
balanced_accuracy_score(y_true=y_test, y_pred=predictions)

0.5981363057701987

In [9]:
from imblearn.metrics import classification_report_imbalanced

# Per-class precision, recall, specificity, F1, geometric mean, index balanced
# accuracy and support for the random-undersampling model.
# print() is required: the report is a multi-line string, and a bare
# expression would show its repr with literal "\n" characters.
print(
    classification_report_imbalanced(
        y_true=y_test,
        y_pred=predictions,
    )
)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.64      0.56      0.72      0.60      0.36      5832
          1       0.31      0.56      0.64      0.40      0.60      0.35      1668

avg / total       0.72      0.62      0.57      0.65      0.60      0.36      7500



## Cluster Centroid Undersampling

In [15]:
from imblearn.under_sampling import ClusterCentroids

# Resample the training split only with a seeded `ClusterCentroids` sampler
# (undersampling based on cluster centroids rather than a random draw).
cc = ClusterCentroids(random_state=1)
X_resample_cluster, y_resample_cluster = cc.fit_resample(X=X_train, y=y_train)

# Class counts after resampling -- both classes should now be the same size.
Counter(y_resample_cluster)

Counter({0: 4968, 1: 4968})

In [17]:
# Logistic regression trained on the cluster-centroid resampled training data.
# Hyper-parameters are identical to the random-undersampling model so that the
# two sampling strategies are compared like for like.
model2 = LogisticRegression(
    # --- regularisation ---
    penalty='l2',
    C=1.0,
    l1_ratio=None,
    dual=False,
    # --- intercept ---
    fit_intercept=True,
    intercept_scaling=1,
    # --- optimiser ---
    solver='lbfgs',
    max_iter=100,
    tol=0.0001,
    warm_start=False,
    # --- miscellaneous ---
    class_weight=None,
    random_state=1,
    n_jobs=None,
    verbose=0,
)

# fit() returns the estimator, so leaving it last displays the fitted model.
model2.fit(X_resample_cluster, y_resample_cluster)

LogisticRegression(random_state=1)

In [18]:
from sklearn.metrics import confusion_matrix

# Score the held-out test set with the model fitted on the cluster-centroid
# resampled data, then tabulate the results
# (rows = actual class, columns = predicted class).
predictions2 = model2.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=predictions2)

array([[2842, 2990],
       [ 610, 1058]], dtype=int64)

In [19]:
from sklearn.metrics import balanced_accuracy_score

# Balanced accuracy (average of the per-class recalls) for the model trained
# on cluster-centroid resampled data, evaluated on the untouched test split.
balanced_accuracy_score(y_true=y_test, y_pred=predictions2)

0.560801975703388

In [20]:
from imblearn.metrics import classification_report_imbalanced

# Per-class precision, recall, specificity, F1, geometric mean, index balanced
# accuracy and support for the cluster-centroid model.
# print() is required: the report is a multi-line string, and a bare
# expression would show its repr with literal "\n" characters.
print(
    classification_report_imbalanced(
        y_true=y_test,
        y_pred=predictions2,
    )
)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.82      0.49      0.63      0.61      0.56      0.30      5832
          1       0.26      0.63      0.49      0.37      0.56      0.31      1668

avg / total       0.70      0.52      0.60      0.56      0.56      0.31      7500

