## More Loans

In this activity you will practice using random and SMOTE oversampling in combination with logistic regression to predict whether or not someone is likely to default on their credit card loans in a given month given demographic information. 

ln_balance_limit is the log of the maximum balance they can have on the card; 1 is female, 0 male for sex; the education is denoted: 1 = graduate school; 2 = university; 3 = high school; 4 = others; 1 is married and 0 single for marriage; default_next_month is whether the person defaults in the following month (1 yes, 0 no).

In [1]:
import pandas as pd
from path import Path
import pandas as pd
from collections import Counter

In [2]:
data = Path(r'C:\Users\TribThapa\Desktop\Thapa\ResearchFellow\Courses\FinTech_Bootcamp_MonashUni2021\monu-mel-virt-fin-pt-05-2021-u-c\Activities\Week 11\3\04-Stu_Do_More_Loans\Resources\cc_default.csv')
df = pd.read_csv(data)

In [3]:
x_cols = [i for i in df.columns if i not in ("ID", "default_next_month")]
X = df[x_cols]
y = df["default_next_month"]

In [4]:
# train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [5]:
Counter(y_train)

Counter({0: 17532, 1: 4968})

### Random Oversampling

In [8]:
# Fit the RandomOverSampler to the data and check the count of each class
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)

X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

print(f"{Counter(X_resampled)}")

print(f"{Counter(y_resampled)}")

Counter({'ln_balance_limit': 1, 'sex': 1, 'education': 1, 'marriage': 1, 'age': 1})
Counter({0: 17532, 1: 17532})


In [15]:
# Fit a logistic regression model using random oversampled data
from sklearn.linear_model import LogisticRegression


#model = LogisticRegression(solver='lbfgs', random_state=1)

model = LogisticRegression(C=1.0,
                           class_weight=None, 
                           dual=False, 
                           fit_intercept=True,
                           intercept_scaling=1, 
                           l1_ratio=None,
                           max_iter=100,
                           n_jobs=None,
                           penalty='l2',
                           random_state=1,                           
                           solver='lbfgs', 
                           tol=0.0001,
                           verbose=0,
                           warm_start=False)

model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [16]:
# Display the confusion matrix
predictions = model.predict(X_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predictions)

cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predictions 0", "Predictions 1"])

cm_df.head()

Unnamed: 0,Predictions 0,Predictions 1
Actual 0,3744,2088
Actual 1,745,923


In [17]:
# Calculate the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_acc_score = balanced_accuracy_score(y_test, predictions)

balanced_acc_score

0.5976663113953282

In [18]:
# Print the classification report
from sklearn.metrics import classification_report

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.83      0.64      0.73      5832
           1       0.31      0.55      0.39      1668

    accuracy                           0.62      7500
   macro avg       0.57      0.60      0.56      7500
weighted avg       0.72      0.62      0.65      7500



In [19]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.64      0.55      0.73      0.60      0.36      5832
          1       0.31      0.55      0.64      0.39      0.60      0.35      1668

avg / total       0.72      0.62      0.57      0.65      0.60      0.36      7500



### SMOTE Oversampling

In [28]:
# Fit the SMOTE model to the data and check the count of each class
from imblearn.over_sampling import SMOTE

X_resampled2, y_resampled2 = SMOTE(random_state=1, sampling_strategy=1.0).fit_resample(X_train, y_train)

print(f"X_resampled: {Counter(X_resampled2)}")
print(f"y_resampled: {Counter(y_resampled2)}")

X_resampled: Counter({'ln_balance_limit': 1, 'sex': 1, 'education': 1, 'marriage': 1, 'age': 1})
y_resampled: Counter({0: 17532, 1: 17532})


In [29]:
# Fit a logistic regression model using the SMOTE resampled data
model = LogisticRegression(C=1.0,
                           class_weight=None, 
                           dual=False,
                           fit_intercept=True,
                           intercept_scaling=1,
                           l1_ratio=None, 
                           max_iter=100,
                           n_jobs=None, 
                           penalty='l2',
                           random_state=1,
                           solver='lbfgs',
                           tol=0.0001, 
                           verbose=0,
                           warm_start=False)

model.fit(X_resampled2, y_resampled2)

LogisticRegression(random_state=1)

In [30]:
# Display the confusion matrix
predictions2 = model.predict(X_test)

cm2 = confusion_matrix(y_test, predictions2)

cm_df2 = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Prediction 0", "Prediction 1"])

cm_df2

Unnamed: 0,Prediction 0,Prediction 1
Actual 0,3744,2088
Actual 1,745,923


In [32]:
# Calculate the balanced accuracy score
balanced_acc_score2 = balanced_accuracy_score(y_test, predictions2)

balanced_acc_score2

0.5975337014339146

In [33]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, predictions2))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.63      0.56      0.72      0.60      0.36      5832
          1       0.30      0.56      0.63      0.40      0.60      0.35      1668

avg / total       0.72      0.62      0.58      0.65      0.60      0.36      7500

