In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow import keras
from keras.callbacks import EarlyStopping

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import skew

In [None]:
# Load the loan-level data set from the working directory and preview it.
df = pd.read_csv(filepath_or_buffer='loan_level_500k.csv')
df.head(n=5)

In [None]:
# Remove the LOAN_SEQUENCE_NUMBER column (in place).
df.drop(columns='LOAN_SEQUENCE_NUMBER', inplace=True)

In [None]:
# (rows, columns) of the frame after LOAN_SEQUENCE_NUMBER was dropped
df.shape

In [None]:
# Per-column dtype and non-null count overview
df.info()

In [None]:
# Summary statistics, transposed so that each described column is one row
df.describe().T

Dropping irrelevant columns (not available at prediction time)

In [None]:
# Remove, in place, the columns that the note above marks as irrelevant.
df.drop(
    columns=[
        "FIRST_PAYMENT_DATE",
        "MATURITY_DATE",
        "MORTGAGE_INSURANCE_PERCENTAGE",
        "ORIGINAL_UPB",
        "ORIGINAL_INTEREST_RATE",
        "PREPAYMENT_PENALTY_MORTGAGE_FLAG",
    ],
    inplace=True,
)

In [None]:
# Missing-value count per column, smallest first
print(df.isna().sum().sort_values(ascending=True))

In [None]:
def missing_percentage(df):
    """Summarise the share of missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that contains at least one missing
        value, in column order. ``Category`` holds the column name and
        ``Percentage`` the share of missing rows on a 0-100 scale. Columns
        without missing values are left out; when nothing is missing the
        result is empty but still carries both columns.
    """
    n_rows = df.shape[0]
    # Collect the rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    records = [
        {'Category': col, 'Percentage': 100 * df[col].isna().sum() / n_rows}
        for col in df.columns
        if df[col].isna().values.any()
    ]
    return pd.DataFrame(records, columns=['Category', 'Percentage'])

In [None]:
# Columns with missing values, worst-affected first
missingdata = missing_percentage(df)
missingdata.sort_values(by='Percentage', ascending=False)

In [None]:
# plt.figure(figsize=(20,20))
# sns.heatmap(df.isnull(), cmap='viridis')

In [None]:
# Frequency of each FIRST_TIME_HOMEBUYER_FLAG value
df['FIRST_TIME_HOMEBUYER_FLAG'].value_counts()

In [None]:
# plt.figure(figsize=(70,100))
#
# for i,col in enumerate(df):
#     plt.subplot(10,3,i+1)
#     sns.countplot(data=df, x=col, hue='DELINQUENT')

In [None]:
# plt.figure(figsize=(15,10))
# sns.heatmap(df.corr(), annot = True)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the values of four columns with integer codes. A single encoder is
# refitted for each column in turn, so after this cell it only remembers the
# classes of the last column processed.
# NOTE(review): this runs before the SimpleImputer cell, so missing values in
# these columns are handed to LabelEncoder - confirm they are not turned into
# an ordinary class.
label_encoder = LabelEncoder()
for column_name in ('DELINQUENT', 'PREPAID', 'POSTAL_CODE', 'FIRST_TIME_HOMEBUYER_FLAG'):
    df[column_name] = label_encoder.fit_transform(df[column_name])

In [None]:
from category_encoders import TargetEncoder

In [None]:
# Columns that are passed to the target encoder, in processing order
mylist = [
    'OCCUPANCY_STATUS',
    'CHANNEL',
    'PRODUCT_TYPE',
    'PROPERTY_STATE',
    'PROPERTY_TYPE',
    'LOAN_PURPOSE',
    'SELLER_NAME',
    'SERVICER_NAME',
]

In [None]:
te = TargetEncoder()


def trgenc(df, col):
    """Target-encode one column of ``df`` against ``DELINQUENT``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col`` and the ``DELINQUENT`` column.
    col : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded ``col`` as its first column, followed by
        the remaining columns of ``df`` in their existing order.

    Notes
    -----
    The shared module-level encoder ``te`` is refitted on every call.
    NOTE(review): the encoder is fitted on every row together with its
    target, before the train/test split further down - consider fitting on
    the training split only to avoid target leakage.
    """
    df_fit = te.fit_transform(df[col], df['DELINQUENT'])
    return df_fit.join(df.drop(columns=col))


# Iterate over the list itself so the loop cannot get out of sync with its
# length (the previous version hard-coded eight iterations).
for col_name in mylist:
    df = trgenc(df, col_name)


In [None]:
# Remember the column labels; the imputation cell below rebuilds the frame
# from the imputer's output and re-attaches them.
col = df.columns
col

In [None]:
# Fill every remaining gap with the column's most frequent value, then wrap
# the imputer's output back into a DataFrame with the saved column labels.
# The imputer is fitted on all rows and all columns, including DELINQUENT.
SI = SimpleImputer(strategy='most_frequent')
df = pd.DataFrame(SI.fit_transform(df), columns=col)

In [None]:
# Remaining missing values per column after imputation
df.isna().sum()

In [None]:
# Split into features and target by name. Slicing off "the last column"
# only works while DELINQUENT happens to be last; if the column order ever
# differs, the target would leak into X and a real feature would be lost.
# NOTE(review): PREPAID stays in X - if it is an outcome recorded after
# origination it leaks information about the target; confirm.
X = df.drop(columns='DELINQUENT')
y = df['DELINQUENT']

In [None]:
# Class counts of the target, shown as a single row
y.value_counts().to_frame().transpose()

In [None]:
# 70/30 train/test split with a fixed seed (no stratification is requested)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample using the training split only. The labels must be y_train:
# X_train holds 70% of the rows while y holds all of them, so pairing
# X_train with y gives inputs of different lengths.
# NOTE(review): the cells below still select features, scale and fit on
# X_train / y_train, so the resampled data is not used for training yet.
X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)

In [None]:
# Class counts after oversampling, shown as a single row
y_resampled.value_counts().to_frame().transpose()

In [None]:
from sklearn.feature_selection import VarianceThreshold

# A threshold of 0 flags features that are constant on the training split
var_thres = VarianceThreshold(threshold=0)
var_thres.fit(X=X_train)

In [None]:
# Number of features that pass the variance filter
var_thres.get_support().sum()

In [None]:
# Names of the columns the variance filter rejects, in column order:
# invert the boolean support mask and index the column labels with it.
constant_col = X_train.columns[~var_thres.get_support()].tolist()

In [None]:
# Remove the rejected columns from the training features (in place).
# Only X_train is changed here; X_test keeps all of its columns.
X_train.drop(columns=constant_col, inplace=True)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max on the training split and rescale it.
# X_train is rebound to the scaler's output; X_test is not transformed here.
scaler = MinMaxScaler()
X_train = scaler.fit(X_train).transform(X_train)

In [None]:
# Small feed-forward binary classifier: two 9-unit ReLU layers with dropout
# after the first one, and a single sigmoid output unit. The input width is
# taken from the last axis of X_train.
model = keras.Sequential()
model.add(keras.layers.Dense(units=9, activation="relu", input_shape=(X_train.shape[-1],)))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(units=9, activation="relu"))
model.add(keras.layers.Dense(units=1, activation="sigmoid"))

# Snapshot of the freshly initialised weights
initial_weights = model.get_weights()

In [None]:
# Print the layer-by-layer overview of the network defined above
model.summary()

In [None]:
learning_rate = 0.001
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
    loss="binary_crossentropy",
    # `metrics` is documented as a list; newer Keras releases reject a bare
    # metric object. The name is pinned so the history key is always "auc",
    # which is the key the plotting cell reads - without it, re-creating the
    # metric in the same session can yield an auto-numbered name instead.
    metrics=[keras.metrics.AUC(name="auc")],
)

In [239]:
# Stop once the training loss has stopped improving instead of always running
# all 500 epochs with no output. No validation data is passed to fit(), so the
# training loss is the only quantity available to monitor. The callback is
# taken from the same `keras` namespace that built the model.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="loss",
    patience=20,
    restore_best_weights=True,
)

history = model.fit(X_train, y_train,
                    epochs=500,
                    batch_size=1000,
                    callbacks=[early_stopping],
                    verbose=0)


KeyboardInterrupt



In [None]:
# Per-epoch metrics recorded by model.fit (the attribute is `history`).
logs = pd.DataFrame(history.history)

# Look the AUC column up by prefix: Keras may auto-number the metric name
# (e.g. "auc_1") when the metric was created without an explicit name.
auc_col = next(name for name in logs.columns if name.startswith("auc"))

# Both panels skip the first five epochs (rows 0-4).
plt.figure(figsize=(14,4))
plt.subplot(1,2,1)
plt.plot(logs.loc[5:,"loss"], lw=2, label='training loss')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.subplot(1,2,2)
plt.plot(logs.loc[5:,auc_col], lw=2, label='Training Roc AUC score')
plt.xlabel("Epoch")
plt.ylabel("ROC AUC")
plt.legend()
plt.show()