In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from imblearn.under_sampling import TomekLinks

In [3]:
# loading in the dataframe
pdf = pd.read_csv('combined_dataframes.csv')
pdf = pdf.drop(['Unnamed: 0'], axis=1)

# Retaining only rows with 0 or 1 for defaut status
ndf = pdf.loc[pdf['default_status'].isin(['0','1'])]
ndf.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,loan_id,mon_rep_dt,UPB_x,default_status,loan_age,credit_score,1st_pay_dt,1st_home,maturity_dt,MI%,...,channel,prod_type,state,home_type,zip_code,purpose,loan_term,no_borrowers,seller,servicer
0,F115Q1000001,201610,0.0,0,18,796,201505,9,203004,0,...,R,FRM,IA,SF,51000,C,180,1,Other sellers,Other servicers
1,F115Q1000002,201709,100534.98,0,30,805,201504,9,204503,0,...,B,FRM,NE,SF,68500,N,360,1,Other sellers,Other servicers
2,F115Q1000003,201709,320369.23,0,31,730,201503,9,203002,0,...,R,FRM,KY,SF,40400,N,180,2,Other sellers,NATIONSTARMTGELLCDBA
3,F115Q1000004,201709,281120.77,0,29,762,201505,9,204504,0,...,R,FRM,CO,SF,81200,N,360,2,Other sellers,USBANKNA
4,F115Q1000005,201709,183054.42,0,29,777,201505,9,204504,0,...,R,FRM,IL,SF,61700,N,360,1,Other sellers,Other servicers


## Pipelining

In [4]:
# dropping two categorical variables that are not required
# ndf.drop(['seller', 'servicer'], axis=1)
del ndf['seller']
del ndf['servicer']
ndf.head()

Unnamed: 0,loan_id,mon_rep_dt,UPB_x,default_status,loan_age,credit_score,1st_pay_dt,1st_home,maturity_dt,MI%,...,LTV,interest,channel,prod_type,state,home_type,zip_code,purpose,loan_term,no_borrowers
0,F115Q1000001,201610,0.0,0,18,796,201505,9,203004,0,...,72,2.625,R,FRM,IA,SF,51000,C,180,1
1,F115Q1000002,201709,100534.98,0,30,805,201504,9,204503,0,...,79,3.875,B,FRM,NE,SF,68500,N,360,1
2,F115Q1000003,201709,320369.23,0,31,730,201503,9,203002,0,...,73,2.875,R,FRM,KY,SF,40400,N,180,2
3,F115Q1000004,201709,281120.77,0,29,762,201505,9,204504,0,...,70,3.375,R,FRM,CO,SF,81200,N,360,2
4,F115Q1000005,201709,183054.42,0,29,777,201505,9,204504,0,...,80,3.625,R,FRM,IL,SF,61700,N,360,1


In [5]:
# keeping only CALIFORNIA observations in data - to model using a subset of original data 
state_df = ndf.loc[ndf['state'] == 'CA']

# resetting index to start from 0
state_df = state_df.reset_index(drop=True)
state_df.head()

Unnamed: 0,loan_id,mon_rep_dt,UPB_x,default_status,loan_age,credit_score,1st_pay_dt,1st_home,maturity_dt,MI%,...,LTV,interest,channel,prod_type,state,home_type,zip_code,purpose,loan_term,no_borrowers
0,F115Q1000053,201512,0.0,0,10,778,201503,9,204502,25,...,90,4.25,R,FRM,CA,SF,96000,N,360,2
1,F115Q1000086,201709,67989.83,0,30,745,201504,9,204503,0,...,52,4.25,R,FRM,CA,SF,95900,C,360,2
2,F115Q1000131,201709,72405.01,0,30,774,201504,9,204503,0,...,73,4.25,R,FRM,CA,SF,93600,N,360,2
3,F115Q1000174,201709,152368.91,0,31,804,201503,9,204502,0,...,45,4.25,R,FRM,CA,SF,95900,N,360,1
4,F115Q1000202,201709,129933.06,0,31,820,201503,9,204502,0,...,20,3.875,R,FRM,CA,SF,91700,N,360,1


In [6]:
# creating dummy variables for categorical features in data
ddf = pd.get_dummies(state_df, prefix=['1st_home', 'occupancy', 'channel', 'prod_type', 'home_type', 'purpose'],
                     columns=['1st_home', 'occupancy', 'channel', 'prod_type', 'home_type', 'purpose'])
ddf.head()

Unnamed: 0,loan_id,mon_rep_dt,UPB_x,default_status,loan_age,credit_score,1st_pay_dt,maturity_dt,MI%,num_units,...,channel_R,prod_type_FRM,home_type_CO,home_type_CP,home_type_MH,home_type_PU,home_type_SF,purpose_C,purpose_N,purpose_P
0,F115Q1000053,201512,0.0,0,10,778,201503,204502,25,1,...,1,1,0,0,0,0,1,0,1,0
1,F115Q1000086,201709,67989.83,0,30,745,201504,204503,0,1,...,1,1,0,0,0,0,1,1,0,0
2,F115Q1000131,201709,72405.01,0,30,774,201504,204503,0,1,...,1,1,0,0,0,0,1,0,1,0
3,F115Q1000174,201709,152368.91,0,31,804,201503,204502,0,1,...,1,1,0,0,0,0,1,0,1,0
4,F115Q1000202,201709,129933.06,0,31,820,201503,204502,0,1,...,1,1,0,0,0,0,1,0,1,0


In [7]:
# dropping date features - since not useful
ddf.drop(['mon_rep_dt', '1st_pay_dt', 'maturity_dt'], axis=1).head()

Unnamed: 0,loan_id,UPB_x,default_status,loan_age,credit_score,MI%,num_units,CLTV,DTI,UPB_y,...,channel_R,prod_type_FRM,home_type_CO,home_type_CP,home_type_MH,home_type_PU,home_type_SF,purpose_C,purpose_N,purpose_P
0,F115Q1000053,0.0,0,10,778,25,1,90,49,260000,...,1,1,0,0,0,0,1,0,1,0
1,F115Q1000086,67989.83,0,30,745,0,1,52,38,84000,...,1,1,0,0,0,0,1,1,0,0
2,F115Q1000131,72405.01,0,30,774,0,1,73,46,76000,...,1,1,0,0,0,0,1,0,1,0
3,F115Q1000174,152368.91,0,31,804,0,1,45,26,160000,...,1,1,0,0,0,0,1,0,1,0
4,F115Q1000202,129933.06,0,31,820,0,1,20,14,137000,...,1,1,0,0,0,0,1,0,1,0


In [8]:
# Pre-processing
X = ddf.drop(['loan_id', 'default_status', 'state'], axis=1)  
y = ddf['default_status']

In [9]:
# Test-Train split
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

## 1. Logistic Regression

In [10]:
# Training
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)

# Prediction
predictions = logmodel.predict(X_test)
predictions_training = logmodel.predict(X_train)



In [11]:
print(classification_report(y_train,predictions_training))

NameError: name 'classification_report' is not defined

In [12]:
# Evaluation - 1
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     97746
           1       0.00      0.00      0.00       427

   micro avg       1.00      1.00      1.00     98173
   macro avg       0.50      0.50      0.50     98173
weighted avg       0.99      1.00      0.99     98173



In [13]:
# Feature scaling
from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()  
scaler.fit(X_train)

X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test)

  return self.partial_fit(X, y)
  
  import sys


In [14]:
# Training
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)

# Prediction
predictions = logmodel.predict(X_test)

# Evaluation
print(classification_report(y_test,predictions))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     97746
           1       0.00      0.00      0.00       427

   micro avg       1.00      1.00      1.00     98173
   macro avg       0.50      0.50      0.50     98173
weighted avg       0.99      1.00      0.99     98173



In [15]:
# Accuracy
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logmodel.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 1.00


In [16]:
from collections import Counter

## 2. KNN

In [None]:
# Training
# from sklearn.neighbors import KNeighborsClassifier  
# classifier = KNeighborsClassifier(n_neighbors=5)  
# classifier.fit(X_train, y_train)

# # Prdiction
# y_pred = classifier.predict(X_test)

In [None]:
# Evaluation
# from sklearn.metrics import classification_report, confusion_matrix  
# print(confusion_matrix(y_test, y_pred))  
# print(classification_report(y_test, y_pred))

# Conclusion

# Next

1. choose resampling method - 3/4 under and oversampling each
2. choose class algo - RF , XGBoost

In [None]:
1.Train-test split
2. For given resampling method and classification algo do the following:
    a. Resample training set to create new training set
    b. Train classificaton algo with training set from a, possibly using hyper param tuning
3. Get classification report for model trained in 2 using test set
4. Repeat for eery combo of resampling method and classificaton algo
5. Create a matrix/table containng all classification report so methods can be compared against each other

In [None]:
US:Random undersampling
    
OS:
SMOTE
adasyn

## 1. Undersampling

### Random Under Sampling

In [18]:
from imblearn.under_sampling import RandomUnderSampler

In [21]:
# RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_sample(X, y)

In [22]:
print('Random undersampling {}'.format(Counter(y_resampled)))

Random undersampling Counter({'0': 1439, '1': 1439})


### Logistic Regrssion

In [23]:
# Training
logmodel = LogisticRegression()
logmodel.fit(X_resampled,y_resampled)

# Prediction
predictions = logmodel.predict(X_test)

# Evaluation
print(classification_report(y_test,predictions))



              precision    recall  f1-score   support

           0       1.00      0.51      0.68     97746
           1       0.01      0.83      0.01       427

   micro avg       0.51      0.51      0.51     98173
   macro avg       0.50      0.67      0.35     98173
weighted avg       0.99      0.51      0.68     98173



### Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Training
clf=RandomForestClassifier()
# n_estimators=100
clf.fit(X_resampled,y_resampled)

# Prediction
predictions=clf.predict(X_test)

# Evaluation
print(classification_report(y_test,predictions))



              precision    recall  f1-score   support

           0       1.00      0.92      0.96     97746
           1       0.00      0.07      0.01       427

   micro avg       0.92      0.92      0.92     98173
   macro avg       0.50      0.49      0.48     98173
weighted avg       0.99      0.92      0.95     98173



In [26]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, predictions))

Accuracy: 0.9169527263096778


## 2. Oversampling

### (i) SMOTE

In [27]:
from imblearn.over_sampling import SMOTE

In [28]:
sm = SMOTE(random_state=42)

X_res, y_res = sm.fit_resample(X, y)

print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({'0': 325804, '1': 325804})


### Logistic Regression

In [29]:
# Training
logmodel = LogisticRegression()
logmodel.fit(X_res,y_res)

# Prediction
predictions = logmodel.predict(X_test)

# Evaluation
print(classification_report(y_test,predictions))



              precision    recall  f1-score   support

           0       1.00      0.49      0.66     97746
           1       0.01      0.82      0.01       427

   micro avg       0.49      0.49      0.49     98173
   macro avg       0.50      0.65      0.34     98173
weighted avg       0.99      0.49      0.65     98173



### Random Forest

In [30]:
# Training
clf=RandomForestClassifier()
# n_estimators=100
clf.fit(X_res,y_res)

# Prediction
predictions=clf.predict(X_test)

# Evaluation
print(classification_report(y_test,predictions))



              precision    recall  f1-score   support

           0       1.00      0.44      0.61     97746
           1       0.00      0.59      0.01       427

   micro avg       0.44      0.44      0.44     98173
   macro avg       0.50      0.51      0.31     98173
weighted avg       0.99      0.44      0.61     98173



In [31]:
#Import scikit-learn metrics module for accuracy calculation
# from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, predictions))

Accuracy: 0.4423619528791012


### (ii) ADASYN

In [None]:
from imblearn.over_sampling import ADASYN

In [34]:
X_resampled, y_resampled = ADASYN().fit_resample(X, y)

# print('Resampled dataset shape %s' % Counter(y_resampled))

# print(sorted(Counter(y_resampled).items())

NameError: name 'ADASYN' is not defined

--------------------------------------------------------------------------------------------------------------------------

In [None]:
# dropping the following 3 categorical variables with too many categories (alpha and alphanumeric)
# ddf = ndf.drop('state', axis=1)
# del ndf['state']
# ndf.head()

In [None]:
# with above 3 columns removed
# ndf.head()

In [None]:
# print(ndf.isnull().values.sum())

In [None]:
# checking the list of columns with missing values
# ndf.columns[ndf.isnull().any()]

No column has missing values in dataframe ndf

Now, we'll create a new dataframe with all the categorical variables from ndf converted to dummy variables. We'll call it ddf

In [None]:
# ndf.columns

In [None]:
# checking categorical variables (type = object)
# ndf.dtypes

In [None]:
# ddf = pd.get_dummies(ndf, prefix=['1st_home', 'occupancy', 'channel', 'prod_type', 'home_type', 'purpose'],
#                      columns=['1st_home', 'occupancy', 'channel', 'prod_type', 'home_type', 'purpose'])
# ddf.head()

In [None]:
# ddf.drop(['mon_rep_dt', '1st_pay_dt', 'maturity_dt'], axis=1).head()

### Pre-processing

In [None]:
# X = ddf.drop(['loan_id', 'default_status'], axis=1)  
# y = ddf['default_status']

### Train-Test Split

In [None]:
# from sklearn.model_selection import train_test_split  
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)