In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow import keras
from keras.callbacks import EarlyStopping

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import skew

In [4]:
df = pd.read_csv('loan_level_500k.csv')
df.head()

Unnamed: 0,CREDIT_SCORE,FIRST_PAYMENT_DATE,FIRST_TIME_HOMEBUYER_FLAG,MATURITY_DATE,METROPOLITAN_STATISTICAL_AREA,MORTGAGE_INSURANCE_PERCENTAGE,NUMBER_OF_UNITS,OCCUPANCY_STATUS,ORIGINAL_COMBINED_LOAN_TO_VALUE,ORIGINAL_DEBT_TO_INCOME_RATIO,...,PROPERTY_TYPE,POSTAL_CODE,LOAN_SEQUENCE_NUMBER,LOAN_PURPOSE,ORIGINAL_LOAN_TERM,NUMBER_OF_BORROWERS,SELLER_NAME,SERVICER_NAME,PREPAID,DELINQUENT
0,669.0,200206,N,202901,,0.0,1.0,O,80.0,33.0,...,SF,26100.0,F199Q1000004,P,320,2.0,Other sellers,Other servicers,True,False
1,732.0,199904,N,202903,17140.0,0.0,1.0,O,25.0,10.0,...,SF,45200.0,F199Q1000005,N,360,1.0,Other sellers,Other servicers,True,False
2,679.0,200208,N,202902,15940.0,30.0,1.0,O,91.0,48.0,...,SF,44700.0,F199Q1000007,P,319,1.0,Other sellers,Other servicers,True,False
3,721.0,200209,N,202902,38060.0,0.0,1.0,O,39.0,13.0,...,SF,85200.0,F199Q1000013,N,318,2.0,Other sellers,Other servicers,True,False
4,618.0,200210,N,202902,10420.0,25.0,1.0,O,85.0,24.0,...,SF,44200.0,F199Q1000015,N,317,2.0,Other sellers,Other servicers,True,False


In [5]:
df.drop('LOAN_SEQUENCE_NUMBER', inplace=True, axis =1)

In [6]:
df.shape

(500137, 26)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500137 entries, 0 to 500136
Data columns (total 26 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   CREDIT_SCORE                      497426 non-null  float64
 1   FIRST_PAYMENT_DATE                500137 non-null  int64  
 2   FIRST_TIME_HOMEBUYER_FLAG         369578 non-null  object 
 3   MATURITY_DATE                     500137 non-null  int64  
 4   METROPOLITAN_STATISTICAL_AREA     429988 non-null  float64
 5   MORTGAGE_INSURANCE_PERCENTAGE     449089 non-null  float64
 6   NUMBER_OF_UNITS                   500134 non-null  float64
 7   OCCUPANCY_STATUS                  500137 non-null  object 
 8   ORIGINAL_COMBINED_LOAN_TO_VALUE   500124 non-null  float64
 9   ORIGINAL_DEBT_TO_INCOME_RATIO     485208 non-null  float64
 10  ORIGINAL_UPB                      500137 non-null  int64  
 11  ORIGINAL_LOAN_TO_VALUE            500128 non-null  f

In [8]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CREDIT_SCORE,497426.0,712.536212,54.791262,300.0,676.0,719.0,756.0,839.0
FIRST_PAYMENT_DATE,500137.0,200025.430952,109.815541,199901.0,199904.0,200005.0,200105.0,201103.0
MATURITY_DATE,500137.0,203023.195872,110.384189,202402.0,202903.0,203004.0,203104.0,204101.0
METROPOLITAN_STATISTICAL_AREA,429988.0,30777.824739,11333.401144,10180.0,19740.0,33340.0,40420.0,49740.0
MORTGAGE_INSURANCE_PERCENTAGE,449089.0,7.744532,12.046546,0.0,0.0,0.0,18.0,55.0
NUMBER_OF_UNITS,500134.0,1.02889,0.218391,1.0,1.0,1.0,1.0,4.0
ORIGINAL_COMBINED_LOAN_TO_VALUE,500124.0,76.053571,15.139986,6.0,70.0,80.0,88.0,180.0
ORIGINAL_DEBT_TO_INCOME_RATIO,485208.0,32.917541,11.1118,1.0,25.0,33.0,41.0,65.0
ORIGINAL_UPB,500137.0,136493.484785,60968.743066,8000.0,89000.0,126000.0,176000.0,578000.0
ORIGINAL_LOAN_TO_VALUE,500128.0,75.710714,14.937717,6.0,70.0,80.0,85.0,100.0


Dropping irrelevant columns (Not Available during prediction)

In [9]:
df.drop(["FIRST_PAYMENT_DATE", "MATURITY_DATE", "MORTGAGE_INSURANCE_PERCENTAGE", "ORIGINAL_UPB", "ORIGINAL_INTEREST_RATE", "PREPAYMENT_PENALTY_MORTGAGE_FLAG"], inplace=True, axis=1)

In [10]:
print(df.isnull().sum().sort_values())

PRODUCT_TYPE                            0
SERVICER_NAME                           0
SELLER_NAME                             0
ORIGINAL_LOAN_TERM                      0
LOAN_PURPOSE                            0
PROPERTY_STATE                          0
PREPAID                                 0
CHANNEL                                 0
DELINQUENT                              0
OCCUPANCY_STATUS                        0
NUMBER_OF_UNITS                         3
ORIGINAL_LOAN_TO_VALUE                  9
ORIGINAL_COMBINED_LOAN_TO_VALUE        13
POSTAL_CODE                            31
PROPERTY_TYPE                          95
NUMBER_OF_BORROWERS                   247
CREDIT_SCORE                         2711
ORIGINAL_DEBT_TO_INCOME_RATIO       14929
METROPOLITAN_STATISTICAL_AREA       70149
FIRST_TIME_HOMEBUYER_FLAG          130559
dtype: int64


In [11]:
def missing_percentage(df):
    missing = pd.DataFrame(columns=['Category', 'Percentage'])
    for col in df.columns:
        if df[col].isna().values.any():
            percentage = 100*df[col].isna().sum()/df.shape[0]
            missing = missing.append({'Category':col, 'Percentage':percentage}, ignore_index = True)
    return missing

In [12]:
missingdata = missing_percentage(df)
missingdata.sort_values('Percentage', ascending=False)

Unnamed: 0,Category,Percentage
1,FIRST_TIME_HOMEBUYER_FLAG,26.104647
2,METROPOLITAN_STATISTICAL_AREA,14.025957
5,ORIGINAL_DEBT_TO_INCOME_RATIO,2.984982
0,CREDIT_SCORE,0.542051
9,NUMBER_OF_BORROWERS,0.049386
7,PROPERTY_TYPE,0.018995
8,POSTAL_CODE,0.006198
4,ORIGINAL_COMBINED_LOAN_TO_VALUE,0.002599
6,ORIGINAL_LOAN_TO_VALUE,0.0018
3,NUMBER_OF_UNITS,0.0006


In [13]:
# plt.figure(figsize=(20,20))
# sns.heatmap(df.isnull(), cmap='viridis')

In [14]:
df.FIRST_TIME_HOMEBUYER_FLAG.value_counts()

N    320418
Y     49160
Name: FIRST_TIME_HOMEBUYER_FLAG, dtype: int64

In [15]:
# plt.figure(figsize=(70,100))
#
# for i,col in enumerate(df):
#     plt.subplot(10,3,i+1)
#     sns.countplot(data=df, x=col, hue='DELINQUENT')

In [16]:
# plt.figure(figsize=(15,10))
# sns.heatmap(df.corr(), annot = True)

In [17]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['DELINQUENT']= label_encoder.fit_transform(df['DELINQUENT'])
df['PREPAID']= label_encoder.fit_transform(df['PREPAID'])
df['POSTAL_CODE'] = label_encoder.fit_transform(df['POSTAL_CODE'])
df['FIRST_TIME_HOMEBUYER_FLAG'] = label_encoder.fit_transform(df['FIRST_TIME_HOMEBUYER_FLAG'])

In [18]:
from category_encoders import TargetEncoder

In [19]:
mylist = ['OCCUPANCY_STATUS', 'CHANNEL', 'PRODUCT_TYPE', 'PROPERTY_STATE',
          'PROPERTY_TYPE', 'LOAN_PURPOSE', 'SELLER_NAME', 'SERVICER_NAME']

In [20]:
te = TargetEncoder()
def trgenc(df, col):
    df_fit = te.fit_transform(df[col], df['DELINQUENT'])
    return df_fit.join(df.drop(columns=col, axis=1))

for i in range(0,8):
    df = trgenc(df, mylist[i])


In [21]:
col = df.columns
col

Index(['SERVICER_NAME', 'SELLER_NAME', 'LOAN_PURPOSE', 'PROPERTY_TYPE',
       'PROPERTY_STATE', 'PRODUCT_TYPE', 'CHANNEL', 'OCCUPANCY_STATUS',
       'CREDIT_SCORE', 'FIRST_TIME_HOMEBUYER_FLAG',
       'METROPOLITAN_STATISTICAL_AREA', 'NUMBER_OF_UNITS',
       'ORIGINAL_COMBINED_LOAN_TO_VALUE', 'ORIGINAL_DEBT_TO_INCOME_RATIO',
       'ORIGINAL_LOAN_TO_VALUE', 'POSTAL_CODE', 'ORIGINAL_LOAN_TERM',
       'NUMBER_OF_BORROWERS', 'PREPAID', 'DELINQUENT'],
      dtype='object')

In [22]:
SI = SimpleImputer(strategy='most_frequent')
df = SI.fit_transform(df)
df = pd.DataFrame(df, columns=col)

In [23]:
df.isnull().sum()

SERVICER_NAME                      0
SELLER_NAME                        0
LOAN_PURPOSE                       0
PROPERTY_TYPE                      0
PROPERTY_STATE                     0
PRODUCT_TYPE                       0
CHANNEL                            0
OCCUPANCY_STATUS                   0
CREDIT_SCORE                       0
FIRST_TIME_HOMEBUYER_FLAG          0
METROPOLITAN_STATISTICAL_AREA      0
NUMBER_OF_UNITS                    0
ORIGINAL_COMBINED_LOAN_TO_VALUE    0
ORIGINAL_DEBT_TO_INCOME_RATIO      0
ORIGINAL_LOAN_TO_VALUE             0
POSTAL_CODE                        0
ORIGINAL_LOAN_TERM                 0
NUMBER_OF_BORROWERS                0
PREPAID                            0
DELINQUENT                         0
dtype: int64

In [24]:
X = df.iloc[:,:-1]
y = df['DELINQUENT']

In [25]:
y.value_counts().to_frame().T

Unnamed: 0,0.0,1.0
DELINQUENT,482146,17991


In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [27]:
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)

In [28]:
y_resampled.value_counts().to_frame().T

Unnamed: 0,0.0,1.0
DELINQUENT,337470,337470


In [29]:
from sklearn.feature_selection import VarianceThreshold

var_thres = VarianceThreshold(threshold=0)
var_thres.fit(X_resampled)

In [30]:
sum(var_thres.get_support())

18

In [31]:
constant_col = [column for column in X_train.columns
                if column not in X_train.columns[var_thres.get_support()]]

In [32]:
X_train.drop(constant_col, axis=1, inplace=True)

In [33]:
from sklearn.preprocessing import MinMaxScaler
scaler  = MinMaxScaler()
X_resampled = scaler.fit_transform(X_resampled)

In [34]:
# model = keras.Sequential(
#     [
#
#     ]
# )
#
# initial_weights = model.get_weights()

In [35]:
# model.summary()

In [36]:
# learning_rate = 0.001
# model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
#               loss = "binary_crossentropy",
#               metrics=keras.metrics.AUC()
#               )

In [37]:
# history = model.fit(X_resampled, y_resampled,
#                     epochs=20,
#                     batch_size=1000,
#                     verbose=0)

In [38]:
# logs = pd.DataFrame(history.history)
#
# plt.figure(figsize=(14,4))
# plt.subplot(1,2,1)
# plt.plot(logs.loc[5:,"loss"], lw=2, label='Training loss')
# plt.xlabel("Epoch")
# plt.ylabel("Loss")
# plt.subplot(1,2,2)
# plt.plot(logs.loc[5:,"auc"], lw=2, label='Training Roc AUC score')
# plt.xlabel("Epoch")
# plt.ylabel("Loss")
# plt.show()

In [39]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [40]:
def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

In [42]:
des = DecisionTreeClassifier()
des.fit(X_resampled, y_resampled)


(DecisionTreeClassifier(),
 array([[0.94853517, 0.18217662, 0.        , ..., 0.96721311, 0.        ,
         0.        ],
        [0.05576487, 0.18217662, 0.79885321, ..., 0.96721311, 1.        ,
         1.        ],
        [0.20822432, 0.31673739, 0.79885321, ..., 0.96721311, 1.        ,
         1.        ],
        ...,
        [0.41295701, 0.26058122, 0.        , ..., 0.96721311, 0.        ,
         0.        ],
        [0.53029879, 0.2611337 , 0.79935461, ..., 0.96721311, 1.        ,
         0.20064539],
        [0.41496257, 0.29849819, 0.09758785, ..., 0.96721311, 0.        ,
         0.12215992]]),
         SERVICER_NAME  SELLER_NAME  LOAN_PURPOSE  PROPERTY_TYPE  \
 449016       0.043020     0.060503      0.032525       0.038516   
 205439       0.029206     0.025874      0.032525       0.021925   
 246451       0.030501     0.034089      0.039414       0.038516   
 374550       0.081589     0.034089      0.032525       0.038516   
 220131       0.011764     0.027771      0