# Credit Risk Resampling Techniques

In [1]:
import warnings


warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from collections import Counter
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import ClusterCentroids
import pickle





# Read the CSV into DataFrame

In [3]:
# Load the data
file_path = Path('forlex.csv')
df = pd.read_csv(file_path)
df.tail()

Unnamed: 0,ID,LIMIT_BAL,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,...,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_pay,Total_Missed_Pymts,Payment_Ratio,Total_Bill_AMT,Total_Payment,%_Amount_Paid,Risk_Level
29995,29996,220000.0,1,39,0,0,0,0,0,0,...,3047.0,5000.0,1000.0,0,0,0.0,725349.0,42550.0,5.866142,
29996,29997,150000.0,2,43,0,0,0,0,0,0,...,129.0,0.0,0.0,0,0,0.0,21182.0,14490.0,68.407138,
29997,29998,30000.0,2,37,1,1,1,0,0,0,...,4200.0,2000.0,3100.0,1,3,50.0,70496.0,31300.0,44.399682,Moderate
29998,29999,80000.0,1,41,1,0,0,0,0,0,...,1926.0,52964.0,1804.0,1,1,16.666667,266611.0,147181.0,55.204399,Low
29999,30000,50000.0,1,46,0,0,0,0,0,0,...,1000.0,1000.0,1000.0,1,0,0.0,230874.0,8308.0,3.5985,


# ***Important***

Notice that the columns "Total Missed Payments" and "Payment Ratio" are now the highest co-related to default. We have successfully created two good features for our data frame!! :) 

In [5]:
df.corr()['default_pay']



ID                   -0.013952
LIMIT_BAL            -0.153520
MARRIAGE             -0.024339
AGE                   0.013890
PAY_0                 0.368117
PAY_2                 0.338022
PAY_3                 0.293583
PAY_4                 0.275495
PAY_5                 0.266959
PAY_6                 0.246089
BILL_AMT1            -0.019644
BILL_AMT2            -0.014193
BILL_AMT3            -0.014076
BILL_AMT4            -0.010156
BILL_AMT5            -0.006760
BILL_AMT6            -0.005372
PAY_AMT1             -0.072929
PAY_AMT2             -0.058579
PAY_AMT3             -0.056250
PAY_AMT4             -0.056827
PAY_AMT5             -0.055124
PAY_AMT6             -0.053183
default_pay           1.000000
Total_Missed_Pymts    0.398394
Payment_Ratio         0.398394
Total_Bill_AMT       -0.012691
Total_Payment        -0.102354
%_Amount_Paid        -0.011391
Name: default_pay, dtype: float64

In [6]:
X=df[['MARRIAGE', 'AGE','PAY_0','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6','Total_Missed_Pymts', 'Payment_Ratio','Total_Payment']]


In [7]:
X

Unnamed: 0,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,Total_Missed_Pymts,Payment_Ratio,Total_Payment
0,1,24,1,1,0,0,0,0,2,33.333333,689.0
1,2,26,0,1,0,0,0,1,2,33.333333,5000.0
2,2,34,0,0,0,0,0,0,0,0.000000,11018.0
3,1,37,0,0,0,0,0,0,0,0.000000,8388.0
4,1,57,0,0,0,0,0,0,0,0.000000,59049.0
...,...,...,...,...,...,...,...,...,...,...,...
29995,1,39,0,0,0,0,0,0,0,0.000000,42550.0
29996,2,43,0,0,0,0,0,0,0,0.000000,14490.0
29997,2,37,1,1,1,0,0,0,3,50.000000,31300.0
29998,1,41,1,0,0,0,0,0,1,16.666667,147181.0


In [8]:
y=df[['default_pay']]

In [9]:
y.head()

Unnamed: 0,default_pay
0,1
1,1
2,0
3,0
4,0


# Split the Data into Training and Testing

In [10]:
X.describe()

Unnamed: 0,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,Total_Missed_Pymts,Payment_Ratio,Total_Payment
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,1.551867,35.4855,0.227267,0.147933,0.140433,0.117,0.098933,0.102633,0.8342,13.903333,31651.39
std,0.52197,9.217904,0.419073,0.35504,0.347442,0.321426,0.298577,0.303484,1.554303,25.905049,60827.68
min,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6679.75
50%,2.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14383.0
75%,2.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,16.666667,33503.5
max,3.0,79.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,100.0,3764066.0


In [11]:
# Check the balance of our target values

y['default_pay'].value_counts()

0    23364
1     6636
Name: default_pay, dtype: int64

In [12]:
# Create X_train, X_test, y_train, y_test
# YOUR CODE HERE
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [13]:
print(f'''% Positive class in Train ={np.round(y_train.value_counts(normalize=True)[1] *100, 2)}''')

% Positive class in Train =default_pay
1              22.08
dtype: float64


## Data Pre-Processing

Scale the training and testing data using the `StandardScaler` from `sklearn`. Remember that when scaling the data, you only scale the features data (`X_train` and `X_testing`).

In [14]:
# Create the StandardScaler instance
from sklearn.preprocessing import StandardScaler
# YOUR CODE HERE
scaler = StandardScaler()



In [15]:
 #Fit the Standard Scaler with the training data
 #When fitting scaling functions, only train on the training dataset
#YOUR CODE HERE
X_scaler = scaler.fit(X_train)


In [16]:
# Scale the training and testing data
# YOUR CODE HERE
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


# Simple Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train_scaled, y_train)

LogisticRegression(random_state=1)

In [18]:
pickle.dump(model,open('simplelogreg.pkl', 'wb'))


In [19]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
lr_y_pred = model.predict(X_test_scaled)
balanced_accuracy_score(y_test, lr_y_pred)

0.6170113292082383

In [20]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, lr_y_pred)

array([[5564,  268],
       [1201,  467]], dtype=int64)

In [21]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.8071111111111111
Testing Data Score: 0.8041333333333334


In [22]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, lr_y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.82      0.95      0.28      0.88      0.52      0.29      5832
          1       0.64      0.28      0.95      0.39      0.52      0.25      1668

avg / total       0.78      0.80      0.43      0.77      0.52      0.28      7500

