In [1]:
# Dependencies and data.
import pandas as pd
from collections import Counter

df = pd.read_csv('Resources/cc_default.csv')
df.head()

Unnamed: 0,ID,ln_balance_limit,sex,education,marriage,age,default_next_month
0,1,9.903488,1,2,0,24,1
1,2,11.695247,1,2,1,26,1
2,3,11.407565,1,2,1,34,0
3,4,10.819778,1,2,0,37,0
4,5,10.819778,0,2,0,57,0


In [2]:
# Split dataset.
y = df['default_next_month']
X = df.drop(columns=['ID', 'default_next_month'], axis=1)

In [3]:
# Split into training and testing sets.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [4]:
# Import RandomUnderSampler.
from imblearn.under_sampling import RandomUnderSampler

# Initiate sampler, then train with training data before resampling.
ros = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Compare before and after resampling.
print(Counter(y_train))
print(Counter(y_resampled))

Counter({0: 17532, 1: 4968})
Counter({0: 4968, 1: 4968})


In [5]:
# Use a LogisticRegression model.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [6]:
# Make predictions and generate metrics.
from sklearn.metrics import confusion_matrix, balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

y_pred = model.predict(X_test)

print(balanced_accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report_imbalanced(y_test, y_pred))

0.5981363057701987
[[3732 2100]
 [ 740  928]]
                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.64      0.56      0.72      0.60      0.36      5832
          1       0.31      0.56      0.64      0.40      0.60      0.35      1668

avg / total       0.72      0.62      0.57      0.65      0.60      0.36      7500



In [7]:
# Compare with Cluster Centroid Undersampling.
from imblearn.under_sampling import ClusterCentroids

# Create instance and resampled data.
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

# Compare class sizes.
print(Counter(y_train))
print(Counter(y_resampled))

Counter({0: 17532, 1: 4968})
Counter({0: 4968, 1: 4968})


In [8]:
# Fit new model with resampled data.
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [9]:
# Predict and print out metrics.
y_pred = model.predict(X_test)

print(balanced_accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report_imbalanced(y_test, y_pred))

0.5589565384729254
[[2796 3036]
 [ 603 1065]]
                   pre       rec       spe        f1       geo       iba       sup

          0       0.82      0.48      0.64      0.61      0.55      0.30      5832
          1       0.26      0.64      0.48      0.37      0.55      0.31      1668

avg / total       0.70      0.51      0.60      0.55      0.55      0.30      7500

