# Model Development and Validation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import ensemble
from sklearn import preprocessing
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

In [None]:
%matplotlib inline

## Load Data

In [None]:
# Load the integrated loan-level data, indexed by LOAN_ID.  The path is a raw
# string so the backslashes are taken literally instead of relying on
# unrecognised escape sequences, which newer Python versions warn about.
data = pd.read_csv(r"D:\Dataset\FreddieMac\intermediate_data\integrated_data.csv", index_col=['LOAN_ID'])

# Relative rise of HPI_MAX over HPI_ORIG, floored at zero.
data['HPI_UP_CHG'] = np.fmax(0, data.HPI_MAX / data.HPI_ORIG - 1)
# Relative fall of HPI_MIN below HPI_ORIG, floored at zero.
data['HPI_DOWN_CHG'] = np.fmax(0, -data.HPI_MIN / data.HPI_ORIG + 1)
# Original CLTV scaled by (1 + HPI_UP_CHG).
data['CLTV_HIGHEST'] = data.ORGN_CLTV * (data.HPI_UP_CHG + 1)
data['LOG_LOAN_SIZE'] = np.log(data['LOAN_SIZE'])

# Exclude loans whose CHANNEL is 'T'.
data = data[data.CHANNEL != 'T']

## Data Analysis

In [None]:
def categorical_variable_analysis(df, variable_name):
    """Draw two charts for a categorical column.

    The first is a bar chart of how often each level of ``variable_name``
    occurs in ``df``; the second plots the mean of ``IND_DEFAULT_2`` within
    each level.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``variable_name`` and ``IND_DEFAULT_2``.
    variable_name : str
        Name of the column to analyse.
    """
    level_counts = df[variable_name].value_counts()
    level_counts.plot(kind='bar')
    plt.title('{} distribution'.format(variable_name))

    mean_by_level = df[['IND_DEFAULT_2']].groupby(df[variable_name]).mean()
    mean_by_level.plot()
    plt.title('{} single factor analysis'.format(variable_name))

In [None]:
def continuous_variable_analysis(df, variable_name, threshold=15):
    """Draw a distribution chart and a single-factor chart for a numeric column.

    When the column has fewer than ``threshold`` distinct values it is treated
    as discrete: a bar chart of value counts is drawn and ``IND_DEFAULT_2`` is
    averaged per distinct value.  Otherwise a histogram with ``threshold`` bins
    is drawn and the rows are grouped into (up to) ``threshold`` quantile bins;
    the mean of ``IND_DEFAULT_2`` is plotted against the mean of the column
    within each bin.

    The quantile bins are kept in a local Series; ``df`` itself is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``variable_name`` and ``IND_DEFAULT_2``.
    variable_name : str
        Name of the column to analyse.
    threshold : int, optional
        Distinct-value cut-off between the discrete and continuous treatment;
        also the number of histogram / quantile bins.
    """
    values = df[variable_name]

    if len(values.unique()) < threshold:
        values.value_counts().sort_index().plot(kind='bar')
        plt.title('{} distribution'.format(variable_name))
        grouping = values
    else:
        values.hist(bins=threshold)
        plt.title('{} distribution'.format(variable_name))
        # Quantile bins live only in this function instead of being written
        # back to the caller's frame as a '<name>_BIN' column.
        grouping = pd.qcut(values, threshold, duplicates='drop').rename('{}_BIN'.format(variable_name))

    df[[variable_name, 'IND_DEFAULT_2']].groupby(grouping).mean().plot(x=variable_name, y='IND_DEFAULT_2')
    plt.title('{} single factor analysis'.format(variable_name))

In [None]:
# Level counts and per-level mean of IND_DEFAULT_2 for FIRST_TIME_HOME_BUYER_FLAG.
categorical_variable_analysis(data, 'FIRST_TIME_HOME_BUYER_FLAG')

In [None]:
# Level counts and per-level mean of IND_DEFAULT_2 for PROP_TYPE.
categorical_variable_analysis(data, 'PROP_TYPE')

In [None]:
# Level counts and per-level mean of IND_DEFAULT_2 for LOAN_PURPOSE.
categorical_variable_analysis(data, 'LOAN_PURPOSE')

In [None]:
# Level counts and per-level mean of IND_DEFAULT_2 for OCCUPANCY_STATUS.
categorical_variable_analysis(data, 'OCCUPANCY_STATUS')

In [None]:
# Level counts and per-level mean of IND_DEFAULT_2 for CHANNEL.
categorical_variable_analysis(data, 'CHANNEL')

In [None]:
# Level counts and per-level mean of IND_DEFAULT_2 for SUPER_CONFORMING_FLAG.
categorical_variable_analysis(data, 'SUPER_CONFORMING_FLAG')

In [None]:
# Distribution and single-factor chart for FICO (default threshold of 15).
continuous_variable_analysis(data, 'FICO')

In [None]:
# Distribution and single-factor chart for MORTGAGE_INSURANCE_PCT.
continuous_variable_analysis(data, 'MORTGAGE_INSURANCE_PCT')

In [None]:
# Distribution and single-factor chart for NUM_OF_UNITS.
continuous_variable_analysis(data, 'NUM_OF_UNITS')

In [None]:
# Distribution and single-factor chart for ORGN_CLTV.
continuous_variable_analysis(data, 'ORGN_CLTV')

In [None]:
# Distribution and single-factor chart for ORGN_LTV.
continuous_variable_analysis(data, 'ORGN_LTV')

In [None]:
# Distribution and single-factor chart for LOG_LOAN_SIZE.
continuous_variable_analysis(data, 'LOG_LOAN_SIZE')

In [None]:
# Distribution and single-factor chart for HPI_DOWN_CHG.
continuous_variable_analysis(data, 'HPI_DOWN_CHG')

In [None]:
# Distribution and single-factor chart for CLTV_HIGHEST.
continuous_variable_analysis(data, 'CLTV_HIGHEST')

In [None]:
# Distribution and single-factor chart for NUM_OF_BORROWERS.
continuous_variable_analysis(data, 'NUM_OF_BORROWERS')

In [None]:
# Pairwise correlations of the numeric factors: computed once, saved to disk,
# and left as the cell's last expression so the notebook displays the table.
continuous_variables = [
    'FICO',
    'MORTGAGE_INSURANCE_PCT',
    'NUM_OF_UNITS',
    'ORGN_CLTV',
    'ORGN_LTV',
    'LOG_LOAN_SIZE',
    'HPI_DOWN_CHG',
    'NUM_OF_BORROWERS',
]
correlation_matrix = data[continuous_variables].corr()
correlation_matrix.to_csv("correlation_matrix.csv")
correlation_matrix

## Data Modification and Input Generation

In [None]:
# Cap the unit count at 2 and store it in a new column.
data['NUM_OF_UNITS_SHRINK'] = np.fmin(data.NUM_OF_UNITS, 2)
# Clamp FICO to the range [600, 820]; this overwrites the original column.
data['FICO'] = np.clip(data.FICO, 600, 820)

In [None]:
# Candidate model inputs: numeric columns are used as they are, categorical
# columns are one-hot encoded in the next cell.
continuous_variables = ['FICO','MORTGAGE_INSURANCE_PCT','NUM_OF_UNITS_SHRINK','ORGN_CLTV','LOG_LOAN_SIZE','HPI_DOWN_CHG','NUM_OF_BORROWERS']
categorical_variables = ['FIRST_TIME_HOME_BUYER_FLAG','PROP_TYPE','LOAN_PURPOSE', 'OCCUPANCY_STATUS', 'CHANNEL', 'SUPER_CONFORMING_FLAG']

In [None]:
# One-hot encode every categorical factor (first level dropped) and place the
# dummy columns next to the numeric factors.
dummies_list = [
    pd.get_dummies(data[name], prefix=name, drop_first=True)
    for name in categorical_variables
]
input_data = pd.concat([data[continuous_variables]] + dummies_list, axis=1)

factor_names = input_data.columns
X = input_data.values
Y = data.IND_DEFAULT_2.values

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Standardise with statistics learned from the training rows only.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Test Function

### KS-test

In [None]:
def ks_test(mdl_prob, act_ind):
    """Plot cumulative capture curves and print the Kolmogorov-Smirnov statistic.

    Observations are ordered by ascending model score.  The cumulative share of
    defaults (``act_ind == 1``) and of non-defaults reached so far is plotted,
    and the largest absolute gap between the two curves is printed.

    Parameters
    ----------
    mdl_prob : numpy.ndarray
        Model scores, one per observation.
    act_ind : numpy.ndarray
        0/1 outcome indicators aligned with ``mdl_prob``.

    Notes
    -----
    The gap is measured only at the last observation of each distinct score.
    Inside a group of tied scores the ordering is arbitrary, so intermediate
    positions are not valid points of the empirical distributions and would
    overstate the statistic.  When all scores are distinct the result equals
    the maximum gap over every position.

    If ``act_ind`` holds a single class, one of the two denominators is zero
    and the corresponding rate is undefined.
    """
    order = np.argsort(mdl_prob)
    mdl_prob = mdl_prob[order]
    act_ind = act_ind[order]

    cum_default_rate = np.cumsum(act_ind) / np.sum(act_ind)
    cum_nondefault_rate = np.cumsum(1 - act_ind) / np.sum(1 - act_ind)

    # True at the last position of every run of equal scores.
    is_score_boundary = np.append(mdl_prob[1:] != mdl_prob[:-1], True)
    gap = np.abs(cum_default_rate - cum_nondefault_rate)
    max_diff = np.max(gap[is_score_boundary])

    pd.DataFrame(np.vstack((cum_default_rate, cum_nondefault_rate)).T, columns=['default','non-default']).plot()
    print('KS stat is {}'.format(max_diff))

### Default rate Emp vs Act test

In [None]:
def emp_vs_act_test(mdl_prob, act_ind, n_points=1001):
    """Plot mean predicted vs. mean actual default rate above a sliding score threshold.

    ``n_points`` evenly spaced thresholds between 0 and the highest score are
    evaluated.  For each threshold the observations whose score is at or above
    it are selected, and the mean of their scores ('mdl'), the mean of their
    outcome indicators ('act') and their number ('loan_count', drawn on a
    secondary axis) are plotted against the threshold.

    Parameters
    ----------
    mdl_prob : numpy.ndarray
        Model scores, one per observation.
    act_ind : numpy.ndarray
        0/1 outcome indicators aligned with ``mdl_prob``.
    n_points : int, optional
        Number of thresholds on the grid (default 1001).

    Notes
    -----
    The per-threshold sums are read from suffix sums of the sorted arrays, so
    memory use is proportional to ``len(mdl_prob) + n_points`` rather than to
    their product.
    """
    order = np.argsort(mdl_prob)
    mdl_prob = mdl_prob[order]
    act_ind = act_ind[order]
    max_prob = mdl_prob[-1]
    prob_range = np.linspace(0, max_prob, n_points)

    # Scores are ascending, so the observations at or above a threshold form a
    # suffix of the array; find the index where each suffix starts.
    first_idx = np.searchsorted(mdl_prob, prob_range, side='left')
    loan_count = mdl_prob.shape[0] - first_idx

    # suffix_sum[i] == x[i:].sum(); the trailing 0 stands for an empty suffix.
    mdl_suffix_sum = np.append(np.cumsum(mdl_prob[::-1])[::-1], 0)
    act_suffix_sum = np.append(np.cumsum(act_ind[::-1])[::-1], 0)
    mdl_default_rate = mdl_suffix_sum[first_idx] / loan_count
    act_default_rate = act_suffix_sum[first_idx] / loan_count

    cmp = pd.DataFrame(np.vstack((mdl_default_rate, act_default_rate, loan_count)).T, columns=['mdl','act','loan_count'], index=prob_range)
    cmp.plot(secondary_y='loan_count')

## Variable Selection by GBM

#### Depth 1

In [None]:
# Gradient boosting with depth-1 trees.  'loss' is left at the estimator
# default (the binomial deviance / log-loss): the explicit 'deviance' spelling
# is rejected by scikit-learn >= 1.3, while the default works on every version.
params = {'n_estimators': 500, 'max_depth': 1, 'min_samples_split': 500,
          'learning_rate': 0.01}
clf = ensemble.GradientBoostingClassifier(**params)

clf.fit(X_train, y_train)

In [None]:
# Rescale importances so that the most important factor equals 100.
feature_importance = clf.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())
# Ascending order, so the most important factor ends up at the top of the chart.
sorted_idx = np.argsort(feature_importance)
fig, axe = plt.subplots(1,1,figsize=(8,10))
# Bar centres at 0.5, 1.5, ...
pos = np.arange(sorted_idx.shape[0]) + .5
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, factor_names[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')

In [None]:
# Class-1 probability on the test set and the corresponding ROC points.
test_prob_gbm = clf.predict_proba(X_test)[:,1]
fpr_test_gbm, tpr_test_gbm, threshold_test_gbm = roc_curve(y_test, test_prob_gbm)

# Dashed diagonal as the reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_test_gbm, tpr_test_gbm)
plt.title('Gradient Boosting ROC Curve')
print('AUC is {}'.format(roc_auc_score(y_test, test_prob_gbm)))

In [None]:
# Cumulative default / non-default curves and printed KS statistic on the test set.
ks_test(test_prob_gbm, y_test)
plt.title('Gradient Boosting KS test');

#### Depth 2

In [None]:
# Gradient boosting with depth-2 trees.  'loss' is left at the estimator
# default (the binomial deviance / log-loss): the explicit 'deviance' spelling
# is rejected by scikit-learn >= 1.3, while the default works on every version.
params = {'n_estimators': 500, 'max_depth': 2, 'min_samples_split': 500,
          'learning_rate': 0.01}
clf = ensemble.GradientBoostingClassifier(**params)

clf.fit(X_train, y_train)

In [None]:
# Rescale importances so that the most important factor equals 100.
feature_importance = clf.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())
# Ascending order, so the most important factor ends up at the top of the chart.
sorted_idx = np.argsort(feature_importance)
fig, axe = plt.subplots(1,1,figsize=(8,10))
# Bar centres at 0.5, 1.5, ...
pos = np.arange(sorted_idx.shape[0]) + .5
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, factor_names[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')

In [None]:
# Class-1 probability on the test set and the corresponding ROC points.
test_prob_gbm = clf.predict_proba(X_test)[:,1]
fpr_test_gbm, tpr_test_gbm, threshold_test_gbm = roc_curve(y_test, test_prob_gbm)

# Dashed diagonal as the reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_test_gbm, tpr_test_gbm)
plt.title('Gradient Boosting Roc Curve')
print('AUC is {}'.format(roc_auc_score(y_test, test_prob_gbm)))

In [None]:
# Cumulative default / non-default curves and printed KS statistic on the test set.
ks_test(test_prob_gbm, y_test)
plt.title('Gradient Boosting KS test');

In [None]:
# Mean predicted vs. mean actual default rate above each score threshold.
emp_vs_act_test(test_prob_gbm, y_test)
plt.title('Gradient Boosting Act vs Mdl test')

#### Depth 3

In [None]:
# Gradient boosting with depth-3 trees.  'loss' is left at the estimator
# default (the binomial deviance / log-loss): the explicit 'deviance' spelling
# is rejected by scikit-learn >= 1.3, while the default works on every version.
params = {'n_estimators': 500, 'max_depth': 3, 'min_samples_split': 500,
          'learning_rate': 0.01}
clf = ensemble.GradientBoostingClassifier(**params)

clf.fit(X_train, y_train)

In [None]:
# Rescale importances so that the most important factor equals 100.
feature_importance = clf.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())
# Ascending order, so the most important factor ends up at the top of the chart.
sorted_idx = np.argsort(feature_importance)
fig, axe = plt.subplots(1,1,figsize=(8,10))
# Bar centres at 0.5, 1.5, ...
pos = np.arange(sorted_idx.shape[0]) + .5
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, factor_names[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')

In [None]:
# Class-1 probability on the test set and the corresponding ROC points.
test_prob_gbm = clf.predict_proba(X_test)[:,1]
fpr_test_gbm, tpr_test_gbm, threshold_test_gbm = roc_curve(y_test, test_prob_gbm)

# Dashed diagonal as the reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_test_gbm, tpr_test_gbm)
plt.title('Gradient Boosting Roc Curve')
print('AUC is {}'.format(roc_auc_score(y_test, test_prob_gbm)))

In [None]:
# Cumulative default / non-default curves and printed KS statistic on the test set.
ks_test(test_prob_gbm, y_test)
plt.title('Gradient Boosting KS test');

In [None]:
# Mean predicted vs. mean actual default rate above each score threshold.
emp_vs_act_test(test_prob_gbm, y_test)
plt.title('Gradient Boosting Act vs Mdl test')

## Input Regeneration

In [None]:
# Collapse PROP_TYPE to two levels: 'PU' is kept, every other value becomes 'OTH'.
data.loc[data.PROP_TYPE != 'PU', 'PROP_TYPE'] = 'OTH'

In [None]:
# Reduced factor set used to regenerate the model inputs in the next cell.
continuous_variables = ['FICO','MORTGAGE_INSURANCE_PCT','ORGN_CLTV','LOG_LOAN_SIZE','HPI_DOWN_CHG','NUM_OF_BORROWERS']
categorical_variables = ['PROP_TYPE','LOAN_PURPOSE']

In [None]:
# One-hot encode every categorical factor (first level dropped) and place the
# dummy columns next to the numeric factors.
dummies_list = [
    pd.get_dummies(data[name], prefix=name, drop_first=True)
    for name in categorical_variables
]
input_data = pd.concat([data[continuous_variables]] + dummies_list, axis=1)
factor_names = input_data.columns

# The split is done on the DataFrames rather than on bare arrays so that the
# row labels stay available for back-testing.
train_df, test_df, train_y, test_y = train_test_split(
    input_data, data.IND_DEFAULT_2, test_size=0.2, random_state=0
)
X_train, X_test = train_df.values, test_df.values
y_train, y_test = train_y.values, test_y.values

# Standardise with statistics learned from the training rows only.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Model Selection

### Logistic Regression

#### Vanilla Model

In [None]:
# L1-penalised logistic regression with a very weak penalty (C=1e5).  The
# solver is named explicitly because the default solver of scikit-learn >= 0.22
# ('lbfgs') does not support the L1 penalty; 'liblinear' does.
logistic_model = linear_model.LogisticRegression(penalty='l1', C=1e5, solver='liblinear')

In [None]:
# Fit on the full (standardised) training set.
logistic_model.fit(X_train, y_train)

In [None]:
# Table of fitted coefficients, with the intercept appended as the last row.
coef = logistic_model.coef_[0].tolist()
coef.append(logistic_model.intercept_[0])
pd.DataFrame(coef, index=factor_names.tolist()+['INTERCEPT'], columns=['Parameter'])

In [None]:
# Class-1 probabilities on the training and test sets.
train_prob_lm = logistic_model.predict_proba(X_train)[:,1]
test_prob_lm = logistic_model.predict_proba(X_test)[:,1]

# ROC points for both sets; only the test curve is drawn below.
fpr_train_lm, tpr_train_lm, threshold_train_lm = roc_curve(y_train, train_prob_lm)
fpr_test_lm, tpr_test_lm, threshold_test_lm = roc_curve(y_test, test_prob_lm)

# Dashed diagonal as the reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_test_lm, tpr_test_lm)
plt.title('Logistic Regression Roc Curve')
print('AUC is {}'.format(roc_auc_score(y_test, test_prob_lm)))

In [None]:
# Cumulative default / non-default curves and printed KS statistic on the test set.
ks_test(test_prob_lm, y_test)
plt.title('Logistic Regression KS test');

In [None]:
# Mean predicted vs. mean actual default rate above each score threshold.
emp_vs_act_test(test_prob_lm, y_test)
plt.title('Logistic Regression Act vs Mdl test')

##### Visualization data generation

In [None]:
# Export the test rows for visualisation: the numeric factors, the original
# (un-encoded) categorical columns, the observed outcome and the model score.
# The dummy columns are replaced by the raw categorical columns from `data`.
visualization_data = test_df.drop(['PROP_TYPE_PU','LOAN_PURPOSE_N','LOAN_PURPOSE_P'], axis=1)
# 'LOAN_PURPOSE' is listed once; listing it twice wrote a duplicated column.
visualization_data = visualization_data.join(data[['FIRST_TIME_HOME_BUYER_FLAG','PROP_TYPE','LOAN_PURPOSE','CHANNEL','NUM_OF_UNITS','OCCUPANCY_STATUS','PROP_STATE','SUPER_CONFORMING_FLAG']],how='left')
visualization_data['ACT'] = test_y
# Assigned by position: assumes the join kept test_df's row order (unique LOAN_ID) - TODO confirm.
visualization_data['MDL'] = test_prob_lm
visualization_data.to_csv('LM_1_VISUAL.csv')

#### Rare Event Model

In [None]:
# Rare-event training sample: every default plus 0.5% of the non-defaults.
default_mask = (y_train == 1)
rare_X_train_part_1 = X_train[default_mask]
rare_y_train_part_1 = y_train[default_mask]
# The seed is fixed so that the sub-sample, and therefore the fitted model and
# the intercept correction below, can be reproduced between runs.
dump1, rare_X_train_part_2, dump2, rare_y_train_part_2 = train_test_split(
    X_train[~default_mask], y_train[~default_mask], test_size=0.005, random_state=0)
rare_X_train = np.vstack((rare_X_train_part_1, rare_X_train_part_2))
rare_y_train = np.concatenate((rare_y_train_part_1, rare_y_train_part_2))

# Default rate in the full training set (y_bar) and in the sub-sample (tau).
y_bar = y_train.mean()
tau = rare_y_train.mean()
# Log ratio of full-sample odds to sub-sample odds; it is added to the fitted
# intercept to undo the over-representation of defaults in the sub-sample.
intercept_bias_adjustment = np.log((1-tau)/tau*y_bar/(1-y_bar))

In [None]:
# Refit the same estimator on the rare-event sub-sample.
logistic_model.fit(rare_X_train, rare_y_train)

In [None]:
# Coefficient table; the intercept shown is the fitted one plus the bias adjustment.
coef = logistic_model.coef_[0].tolist()
intercept = logistic_model.intercept_[0]+intercept_bias_adjustment
coef.append(intercept)
pd.DataFrame(coef, index=factor_names.tolist()+['INTERCEPT'], columns=['Parameter'])

In [None]:
# Score by hand so that the adjusted intercept is used instead of the fitted
# one: linear predictor first, then the logistic transform.
lp = (X_test * logistic_model.coef_).sum(axis=1) + intercept
test_prob_lm = 1/(1+np.exp(-lp))
fpr_test_lm, tpr_test_lm, threshold_test_lm = roc_curve(y_test, test_prob_lm)

# Dashed diagonal as the reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_test_lm, tpr_test_lm)
plt.title('Logistic Regression Roc Curve')
print('AUC is {}'.format(roc_auc_score(y_test, test_prob_lm)))

In [None]:
# Cumulative default / non-default curves and printed KS statistic on the test set.
ks_test(test_prob_lm, y_test)
plt.title('Logistic Regression KS test');

In [None]:
# Mean predicted vs. mean actual default rate above each score threshold.
emp_vs_act_test(test_prob_lm, y_test)
plt.title('Logistic Regression Act vs Mdl test')

### Gradient Boosting

#### Depth 2; Estimator 1000; Learning rate 0.01; Min sample 200

In [None]:
# 1000 depth-2 trees, learning rate 0.01, min_samples_split 200.  'loss' is
# left at the estimator default (the binomial deviance / log-loss): the explicit
# 'deviance' spelling is rejected by scikit-learn >= 1.3.
params = {'n_estimators': 1000, 'max_depth': 2, 'min_samples_split': 200,
          'learning_rate': 0.01}
clf = ensemble.GradientBoostingClassifier(**params)

clf.fit(X_train, y_train)

In [None]:
# Class-1 probability on the test set and the corresponding ROC points.
test_prob_gbm = clf.predict_proba(X_test)[:,1]
fpr_test_gbm, tpr_test_gbm, threshold_test_gbm = roc_curve(y_test, test_prob_gbm)

# Dashed diagonal as the reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_test_gbm, tpr_test_gbm)
plt.title('Gradient Boosting Roc Curve')
print('AUC is {}'.format(roc_auc_score(y_test, test_prob_gbm)))

In [None]:
# Cumulative default / non-default curves and printed KS statistic on the test set.
ks_test(test_prob_gbm, y_test)
plt.title('Gradient Boosting KS test');

In [None]:
# Mean predicted vs. mean actual default rate above each score threshold.
emp_vs_act_test(test_prob_gbm, y_test)
plt.title('Gradient Boosting Act vs Mdl test');

##### Visualization data generation

In [None]:
# Export the test rows for visualisation: the numeric factors, the original
# (un-encoded) categorical columns, the observed outcome and the model score.
# The dummy columns are replaced by the raw categorical columns from `data`.
visualization_data = test_df.drop(['PROP_TYPE_PU','LOAN_PURPOSE_N','LOAN_PURPOSE_P'], axis=1)
# 'LOAN_PURPOSE' is listed once; listing it twice wrote a duplicated column.
visualization_data = visualization_data.join(data[['FIRST_TIME_HOME_BUYER_FLAG','PROP_TYPE','LOAN_PURPOSE','CHANNEL','NUM_OF_UNITS','OCCUPANCY_STATUS','PROP_STATE','SUPER_CONFORMING_FLAG']],how='left')
visualization_data['ACT'] = test_y
# Assigned by position: assumes the join kept test_df's row order (unique LOAN_ID) - TODO confirm.
visualization_data['MDL'] = test_prob_gbm
visualization_data.to_csv('GBM_1_VISUAL.csv')

#### Depth 2; Estimator 1000; Learning rate 0.05; Min sample 200

In [None]:
# 1000 depth-2 trees, learning rate 0.05, min_samples_split 200.  'loss' is
# left at the estimator default (the binomial deviance / log-loss): the explicit
# 'deviance' spelling is rejected by scikit-learn >= 1.3.
params = {'n_estimators': 1000, 'max_depth': 2, 'min_samples_split': 200,
          'learning_rate': 0.05}
clf = ensemble.GradientBoostingClassifier(**params)

clf.fit(X_train, y_train)

In [None]:
# Class-1 probability on the test set and the corresponding ROC points.
test_prob_gbm = clf.predict_proba(X_test)[:,1]
fpr_test_gbm, tpr_test_gbm, threshold_test_gbm = roc_curve(y_test, test_prob_gbm)

# Dashed diagonal as the reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_test_gbm, tpr_test_gbm)
plt.title('Gradient Boosting Roc Curve')
print('AUC is {}'.format(roc_auc_score(y_test, test_prob_gbm)))

In [None]:
# Cumulative default / non-default curves and printed KS statistic on the test set.
ks_test(test_prob_gbm, y_test)
plt.title('Gradient Boosting KS test');

In [None]:
# Mean predicted vs. mean actual default rate above each score threshold.
emp_vs_act_test(test_prob_gbm, y_test)
plt.title('Gradient Boosting Act vs Mdl test');

##### Visualization data generation

In [None]:
# Export the test rows for visualisation: the numeric factors, the original
# (un-encoded) categorical columns, the observed outcome and the model score.
# The dummy columns are replaced by the raw categorical columns from `data`.
visualization_data = test_df.drop(['PROP_TYPE_PU','LOAN_PURPOSE_N','LOAN_PURPOSE_P'], axis=1)
# 'LOAN_PURPOSE' is listed once; listing it twice wrote a duplicated column.
visualization_data = visualization_data.join(data[['FIRST_TIME_HOME_BUYER_FLAG','PROP_TYPE','LOAN_PURPOSE','CHANNEL','NUM_OF_UNITS','OCCUPANCY_STATUS','PROP_STATE','SUPER_CONFORMING_FLAG']],how='left')
visualization_data['ACT'] = test_y
# This file describes the gradient boosting model, so it must carry the
# gradient boosting scores (it previously stored the logistic regression ones).
# Assigned by position: assumes the join kept test_df's row order (unique LOAN_ID) - TODO confirm.
visualization_data['MDL'] = test_prob_gbm
visualization_data.to_csv('GBM_2_VISUAL.csv')

#### Depth 2; Estimator 1000; Learning rate 0.01; Min sample 100

In [None]:
# 1000 depth-2 trees, learning rate 0.01, min_samples_split 100.  'loss' is
# left at the estimator default (the binomial deviance / log-loss): the explicit
# 'deviance' spelling is rejected by scikit-learn >= 1.3.
params = {'n_estimators': 1000, 'max_depth': 2, 'min_samples_split': 100,
          'learning_rate': 0.01}
clf = ensemble.GradientBoostingClassifier(**params)

clf.fit(X_train, y_train)

In [None]:
# Class-1 probability on the test set and the corresponding ROC points.
test_prob_gbm = clf.predict_proba(X_test)[:,1]
fpr_test_gbm, tpr_test_gbm, threshold_test_gbm = roc_curve(y_test, test_prob_gbm)

# Dashed diagonal as the reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_test_gbm, tpr_test_gbm)
plt.title('Gradient Boosting Roc Curve')
print('AUC is {}'.format(roc_auc_score(y_test, test_prob_gbm)))

In [None]:
# Cumulative default / non-default curves and printed KS statistic on the test set.
ks_test(test_prob_gbm, y_test)
plt.title('Gradient Boosting KS test');

In [None]:
# Mean predicted vs. mean actual default rate above each score threshold.
emp_vs_act_test(test_prob_gbm, y_test)
plt.title('Gradient Boosting Act vs Mdl test');

##### Visualization data generation

In [None]:
# Export the test rows for visualisation: the numeric factors, the original
# (un-encoded) categorical columns, the observed outcome and the model score.
# The dummy columns are replaced by the raw categorical columns from `data`.
visualization_data = test_df.drop(['PROP_TYPE_PU','LOAN_PURPOSE_N','LOAN_PURPOSE_P'], axis=1)
# 'LOAN_PURPOSE' is listed once; listing it twice wrote a duplicated column.
visualization_data = visualization_data.join(data[['FIRST_TIME_HOME_BUYER_FLAG','PROP_TYPE','LOAN_PURPOSE','CHANNEL','NUM_OF_UNITS','OCCUPANCY_STATUS','PROP_STATE','SUPER_CONFORMING_FLAG']],how='left')
visualization_data['ACT'] = test_y
# This file describes the gradient boosting model, so it must carry the
# gradient boosting scores (it previously stored the logistic regression ones).
# Assigned by position: assumes the join kept test_df's row order (unique LOAN_ID) - TODO confirm.
visualization_data['MDL'] = test_prob_gbm
visualization_data.to_csv('GBM_3_VISUAL.csv')

### Neural network

In [None]:
from sklearn.neural_network import MLPClassifier

#### Layer 4 (8,6,4,2)

In [None]:
# Four hidden layers of 8, 6, 4 and 2 units; seeded for reproducibility.
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(8,6,4,2), random_state=1)
clf.fit(X_train, y_train)

In [None]:
# Class-1 probability on the test set and the corresponding ROC points.
test_prob_nn = clf.predict_proba(X_test)[:,1]
fpr_test_nn, tpr_test_nn, threshold_test_nn = roc_curve(y_test, test_prob_nn)

# Dashed diagonal as the reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_test_nn, tpr_test_nn)
plt.title('Neural Networks Roc Curve')
print('AUC is {}'.format(roc_auc_score(y_test, test_prob_nn)))

In [None]:
# Cumulative default / non-default curves and printed KS statistic on the test set.
ks_test(test_prob_nn, y_test)
plt.title('Neural Networks KS test');

In [None]:
# Mean predicted vs. mean actual default rate above each score threshold.
emp_vs_act_test(test_prob_nn, y_test)
plt.title('Neural Networks Act vs Mdl test');

#### Layer 3 (7,5,3)

In [None]:
# Three hidden layers of 7, 5 and 3 units; seeded for reproducibility.
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(7, 5, 3), random_state=1)
clf.fit(X_train, y_train)

In [None]:
# Class-1 probability on the test set and the corresponding ROC points.
test_prob_nn = clf.predict_proba(X_test)[:,1]
fpr_test_nn, tpr_test_nn, threshold_test_nn = roc_curve(y_test, test_prob_nn)

# Dashed diagonal as the reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_test_nn, tpr_test_nn)
plt.title('Neural Networks Roc Curve')
print('AUC is {}'.format(roc_auc_score(y_test, test_prob_nn)))

In [None]:
# Cumulative default / non-default curves and printed KS statistic on the test set.
ks_test(test_prob_nn, y_test)
plt.title('Neural Networks KS test');

In [None]:
# Mean predicted vs. mean actual default rate above each score threshold.
emp_vs_act_test(test_prob_nn, y_test)
plt.title('Neural Networks Act vs Mdl test');

#### Layer 3 (8,6,4)

In [None]:
# Three hidden layers of 8, 6 and 4 units; seeded for reproducibility.
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(8,6,4),random_state=1)
clf.fit(X_train, y_train)

In [None]:
# Class-1 probability on the test set and the corresponding ROC points.
test_prob_nn = clf.predict_proba(X_test)[:,1]
fpr_test_nn, tpr_test_nn, threshold_test_nn = roc_curve(y_test, test_prob_nn)

# Dashed diagonal as the reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_test_nn, tpr_test_nn)
plt.title('Neural Networks Roc Curve')
print('AUC is {}'.format(roc_auc_score(y_test, test_prob_nn)))

In [None]:
# Cumulative default / non-default curves and printed KS statistic on the test set.
ks_test(test_prob_nn, y_test)
plt.title('Neural Networks KS test');

In [None]:
# Mean predicted vs. mean actual default rate above each score threshold.
emp_vs_act_test(test_prob_nn, y_test)
plt.title('Neural Networks Act vs Mdl test');

#### Layer 3 (6, 4, 2)

In [None]:
# Three hidden layers of 6, 4 and 2 units.  The seed is fixed, as in the other
# seeded network cells, so that the fit and the metrics below are repeatable.
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(6,4,2), random_state=1)
clf.fit(X_train, y_train)

In [None]:
# Class-1 probability on the test set and the corresponding ROC points.
test_prob_nn = clf.predict_proba(X_test)[:,1]
fpr_test_nn, tpr_test_nn, threshold_test_nn = roc_curve(y_test, test_prob_nn)

# Dashed diagonal as the reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_test_nn, tpr_test_nn)
plt.title('Neural Networks Roc Curve')
print('AUC is {}'.format(roc_auc_score(y_test, test_prob_nn)))

In [None]:
# Cumulative default / non-default curves and printed KS statistic on the test set.
ks_test(test_prob_nn, y_test)
plt.title('Neural Networks KS test');

In [None]:
# Mean predicted vs. mean actual default rate above each score threshold.
emp_vs_act_test(test_prob_nn, y_test)
plt.title('Neural Networks Act vs Mdl test');

#### Layer 2 (6,3)

In [None]:
# Two hidden layers of 6 and 3 units; seeded for reproducibility.
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(6,3), random_state=1)
clf.fit(X_train, y_train)

In [None]:
# Class-1 probability on the test set and the corresponding ROC points.
test_prob_nn = clf.predict_proba(X_test)[:,1]
fpr_test_nn, tpr_test_nn, threshold_test_nn = roc_curve(y_test, test_prob_nn)

# Dashed diagonal as the reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_test_nn, tpr_test_nn)
plt.title('Neural Networks Roc Curve')
print('AUC is {}'.format(roc_auc_score(y_test, test_prob_nn)))

In [None]:
# Cumulative default / non-default curves and printed KS statistic on the test set.
ks_test(test_prob_nn, y_test)
plt.title('Neural Networks KS test');

In [None]:
# Mean predicted vs. mean actual default rate above each score threshold.
emp_vs_act_test(test_prob_nn, y_test)
plt.title('Neural Networks Act vs Mdl test');

#### Layer 2 (7, 4)

In [None]:
# Two hidden layers of 7 and 4 units.  The seed is fixed, as in the other
# seeded network cells, so that the fit, the metrics and the exported scores
# are repeatable.
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(7, 4), random_state=1)
clf.fit(X_train, y_train)

In [None]:
# Class-1 probability on the test set and the corresponding ROC points.
test_prob_nn = clf.predict_proba(X_test)[:,1]
fpr_test_nn, tpr_test_nn, threshold_test_nn = roc_curve(y_test, test_prob_nn)

# Dashed diagonal as the reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_test_nn, tpr_test_nn)
plt.title('Neural Networks Roc Curve')
print('AUC is {}'.format(roc_auc_score(y_test, test_prob_nn)))

In [None]:
# Cumulative default / non-default curves and printed KS statistic on the test set.
ks_test(test_prob_nn, y_test)
plt.title('Neural Networks KS test');

In [None]:
# Mean predicted vs. mean actual default rate above each score threshold.
emp_vs_act_test(test_prob_nn, y_test)
plt.title('Neural Networks Act vs Mdl test');

##### Visualization data generation

In [None]:
# Re-score the test set with the current `clf` before exporting, and redraw its ROC curve.
test_prob_nn = clf.predict_proba(X_test)[:,1]
fpr_test_nn, tpr_test_nn, threshold_test_nn = roc_curve(y_test, test_prob_nn)

# Dashed diagonal as the reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_test_nn, tpr_test_nn)
plt.title('Neural Networks Roc Curve')
print('AUC is {}'.format(roc_auc_score(y_test, test_prob_nn)))

In [None]:
# Export the test rows for visualisation: the numeric factors, the original
# (un-encoded) categorical columns, the observed outcome and the model score.
# The dummy columns are replaced by the raw categorical columns from `data`.
visualization_data = test_df.drop(['PROP_TYPE_PU','LOAN_PURPOSE_N','LOAN_PURPOSE_P'], axis=1)
# 'LOAN_PURPOSE' is listed once; listing it twice wrote a duplicated column.
visualization_data = visualization_data.join(data[['FIRST_TIME_HOME_BUYER_FLAG','PROP_TYPE','LOAN_PURPOSE','CHANNEL','NUM_OF_UNITS','OCCUPANCY_STATUS','PROP_STATE','SUPER_CONFORMING_FLAG']],how='left')
visualization_data['ACT'] = test_y
# Assigned by position: assumes the join kept test_df's row order (unique LOAN_ID) - TODO confirm.
visualization_data['MDL'] = test_prob_nn
visualization_data.to_csv('NN_1_VISUAL.csv')

#### Layer 2 (7, 3)

In [None]:
# Two hidden layers of 7 and 3 units.  The seed is fixed, as in the other
# seeded network cells, so that the fit and the metrics below are repeatable.
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(7, 3), random_state=1)
clf.fit(X_train, y_train)

In [None]:
# Class-1 probability on the test set and the corresponding ROC points.
test_prob_nn = clf.predict_proba(X_test)[:,1]
fpr_test_nn, tpr_test_nn, threshold_test_nn = roc_curve(y_test, test_prob_nn)

# Dashed diagonal as the reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_test_nn, tpr_test_nn)
plt.title('Neural Networks Roc Curve')
print('AUC is {}'.format(roc_auc_score(y_test, test_prob_nn)))

In [None]:
# Cumulative default / non-default curves and printed KS statistic on the test set.
ks_test(test_prob_nn, y_test)
plt.title('Neural Networks KS test');

In [None]:
# Mean predicted vs. mean actual default rate above each score threshold.
emp_vs_act_test(test_prob_nn, y_test)
plt.title('Neural Networks Act vs Mdl test');

#### Layer 2 (5, 3)

In [None]:
# Two hidden layers of 5 and 3 units.  The seed is fixed, as in the other
# seeded network cells, so that the fit and the metrics below are repeatable.
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1)
clf.fit(X_train, y_train)

In [None]:
# Class-1 probability on the test set and the corresponding ROC points.
test_prob_nn = clf.predict_proba(X_test)[:,1]
fpr_test_nn, tpr_test_nn, threshold_test_nn = roc_curve(y_test, test_prob_nn)

# Dashed diagonal as the reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_test_nn, tpr_test_nn)
plt.title('Neural Networks Roc Curve')
print('AUC is {}'.format(roc_auc_score(y_test, test_prob_nn)))

In [None]:
# Cumulative default / non-default curves and printed KS statistic on the test set.
ks_test(test_prob_nn, y_test)
plt.title('Neural Networks KS test');

In [None]:
# Mean predicted vs. mean actual default rate above each score threshold.
emp_vs_act_test(test_prob_nn, y_test)
plt.title('Neural Networks Act vs Mdl test');