In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

# Load the HELOC data set; the first CSV row is the header.
x = pd.read_csv('heloc_dataset_v1.csv', header=0)

# Encode the label held in the first column: 'Good' -> 1, anything else -> 0.
# A single vectorized comparison replaces the per-row .iloc loop (one Python-level
# write per row). Assigning through the column name replaces the whole column, so
# the result has an integer dtype instead of an object column holding ints.
x[x.columns[0]] = (x.iloc[:, 0] == 'Good').astype(int)


In [3]:
# Drop every row whose ExternalRiskEstimate is -9, then turn the remaining -7 / -8
# entries (in any column) into NaN so they can be imputed later.
# NOTE(review): -9 / -8 / -7 are presumably the data set's special-value codes;
# confirm against the data dictionary.
# DataFrame.replace returns a new frame, so nothing is written into the filtered
# slice (no SettingWithCopy hazard) and the O(rows x cols) per-cell loop is gone.
x = x[x.ExternalRiskEstimate != -9].replace([-7, -8], np.nan)
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9861 entries, 0 to 10458
Data columns (total 24 columns):
RiskPerformance                       9861 non-null int64
ExternalRiskEstimate                  9861 non-null int64
MSinceOldestTradeOpen                 9622 non-null float64
MSinceMostRecentTradeOpen             9861 non-null int64
AverageMInFile                        9861 non-null int64
NumSatisfactoryTrades                 9861 non-null int64
NumTrades60Ever2DerogPubRec           9861 non-null int64
NumTrades90Ever2DerogPubRec           9861 non-null int64
PercentTradesNeverDelq                9861 non-null int64
MSinceMostRecentDelq                  5027 non-null float64
MaxDelq2PublicRecLast12M              9861 non-null int64
MaxDelqEver                           9861 non-null int64
NumTotalTrades                        9861 non-null int64
NumTradesOpeninLast12M                9861 non-null int64
PercentInstallTrades                  9861 non-null int64
MSinceMostRecentIn

In [4]:
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed in
# 0.22; sklearn.impute.SimpleImputer is its replacement.
from sklearn.impute import SimpleImputer

# Replace each NaN with the mean of its column. fit_transform returns a plain
# array, so the frame is rebuilt with the original column names (the index is
# reset to 0..n-1 as a side effect).
# NOTE(review): the means are computed over all rows, i.e. before evaluate_model
# makes its train/test split, so held-out rows contribute to the imputed values.
imputer = SimpleImputer(strategy="mean")
x = pd.DataFrame(imputer.fit_transform(x), columns=x.columns)
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9861 entries, 0 to 9860
Data columns (total 24 columns):
RiskPerformance                       9861 non-null float64
ExternalRiskEstimate                  9861 non-null float64
MSinceOldestTradeOpen                 9861 non-null float64
MSinceMostRecentTradeOpen             9861 non-null float64
AverageMInFile                        9861 non-null float64
NumSatisfactoryTrades                 9861 non-null float64
NumTrades60Ever2DerogPubRec           9861 non-null float64
NumTrades90Ever2DerogPubRec           9861 non-null float64
PercentTradesNeverDelq                9861 non-null float64
MSinceMostRecentDelq                  9861 non-null float64
MaxDelq2PublicRecLast12M              9861 non-null float64
MaxDelqEver                           9861 non-null float64
NumTotalTrades                        9861 non-null float64
NumTradesOpeninLast12M                9861 non-null float64
PercentInstallTrades                  9861 non-null f



In [5]:
from sklearn.neural_network import MLPClassifier

# Column 0 of the imputed frame is the target; columns 1..23 are the features.
Y = x.iloc[:, 0]
X = x.iloc[:, 1:24]

# No train/test split or feature scaling happens in this cell: evaluate_model
# performs both itself each time it is called.

In [6]:
def evaluate_model(X, Y, model, model_name, params, cv=10, seed=1, n_jobs=None):
    """Grid-search ``model`` on a training split and score the winner on a test split.

    Parameters
    ----------
    X, Y : features and target, passed straight to ``train_test_split``.
    model : unfitted estimator handed to ``GridSearchCV``.
    model_name : label stored under ``'Classifier'`` in the result.
    params : parameter grid (dict or list of dicts) for ``GridSearchCV``.
    cv : cross-validation setting for the grid search (default 10, as before).
    seed : value used to seed NumPy's global RNG before splitting (default 1, as before).
    n_jobs : forwarded to ``GridSearchCV`` to run the fits in parallel (default None).

    Returns
    -------
    dict with keys ``'Classifier'``, ``'params'`` (the refitted best estimator
    object, not a parameter dict), ``'Test Score'`` (score on the held-out
    split), ``'CV Score'`` (mean cross-validated score of the best candidate)
    and ``'Time'`` (mean fit time of the best candidate).
    """
    # train_test_split is called without random_state, so it draws from NumPy's
    # global RNG; seeding it here makes the split repeatable. This deliberately
    # keeps the original side effect of resetting the global RNG on every call.
    np.random.seed(seed)

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

    # Standardize using statistics from the training split only.
    # NOTE(review): the scaler is fitted on the whole training split before the
    # grid search, so each CV fold's validation part has influenced the scaling.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Hyper-parameter tuning over the supplied grid.
    clf = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs)
    clf.fit(X_train, Y_train)

    cv_results = clf.cv_results_
    best = clf.best_index_

    return {
        'Classifier': model_name,
        'params': clf.best_estimator_,
        'Test Score': clf.score(X_test, Y_test),
        'CV Score': cv_results['mean_test_score'][best],
        'Time': cv_results['mean_fit_time'][best],
    }

In [7]:
def init_classifiers():
    """Build the list of (estimator, display name, parameter grid) triples.

    Display names are taken by position from the module-level ``model_names``
    list. Position 5 ('QDA') is skipped, so the returned list has eight entries
    and no QDA classifier.
    """
    candidates = [
        (SVC(), 0, param_grid_svc),
        (LogisticRegression(), 1, param_grid_logistic),
        (KNeighborsClassifier(), 2, param_grid_knn),
        (GaussianNB(), 3, param_grid_nb),
        (DecisionTreeClassifier(), 4, param_grid_tree),
        (RandomForestClassifier(), 6, param_grid_rf),
        (AdaBoostClassifier(), 7, param_grid_boost),
        (MLPClassifier(), 8, param_grid_MLP),
    ]
    return [(estimator, model_names[position], grid)
            for estimator, position, grid in candidates]

# Display names for the classifiers; init_classifiers picks them by position.
model_names = ['SVM', 'LR', 'KNN', 'NB', 'Tree', 'QDA', 'RF', 'Boosting', 'MLP']

# Hyper-parameter grids searched for each model. An empty dict means
# "the estimator's defaults" as one candidate.
param_grid_svc = [{
    'C': [0.1, 1],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
    'max_iter': [-1],
    'random_state': [1],
}]

# The solver is pinned to liblinear because the grid includes the l1 penalty.
# liblinear was the default up to scikit-learn 0.21 (so earlier results are
# unchanged); the newer default, lbfgs, rejects penalty='l1' and would make
# half of these candidates fail to fit.
param_grid_logistic = [{
    'C': [0.1, 1],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
    'random_state': [1],
}]

param_grid_knn = [{}, {'n_neighbors': list(range(1, 31))}]

param_grid_nb = [{}]

param_grid_tree = [
    {'random_state': [1]},
    {
        'criterion': ['gini'],
        'max_depth': list(range(2, 10)),
        'min_samples_split': [3, 5],
        'random_state': [1],
    },
]

param_grid_rf = [
    {'random_state': [1]},
    {
        'n_estimators': [10, 20, 30],
        'max_features': [0.2, 0.3],
        'bootstrap': [True],
        'random_state': [1],
    },
]

param_grid_boost = [
    {'random_state': [1]},
    {
        'n_estimators': [10, 20, 30],
        'learning_rate': [0.1, 1],
        'random_state': [1],
    },
]

param_grid_MLP = [{
    'solver': ['lbfgs', 'sgd', 'adam'],
    'random_state': [1],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'alpha': [0.1, 1],
}]

In [8]:
# Evaluate every configured classifier and collect one result dict per model.
classifiers = init_classifiers()
res_list = []
for estimator, label, grid in classifiers:
    res_list.append(evaluate_model(X, Y, estimator, label, grid))

# One row per classifier, ordered alphabetically by name with a fresh 0..n-1 index.
df_model_comparison = (
    pd.DataFrame(res_list)
    .sort_values(['Classifier'])
    .reset_index(drop=True)
)
df_model_comparison

















Unnamed: 0,CV Score,Classifier,Test Score,Time,params
0,0.726437,Boosting,0.71249,0.283058,"(DecisionTreeClassifier(class_weight=None, cri..."
1,0.731034,KNN,0.70438,0.013717,"KNeighborsClassifier(algorithm='auto', leaf_si..."
2,0.736038,LR,0.715734,0.034771,"LogisticRegression(C=0.1, class_weight=None, d..."
3,0.740771,MLP,0.714923,3.846543,"MLPClassifier(activation='relu', alpha=1, batc..."
4,0.711562,NB,0.697486,0.006743,"GaussianNB(priors=None, var_smoothing=1e-09)"
5,0.726842,RF,0.71249,0.385273,"(DecisionTreeClassifier(class_weight=None, cri..."
6,0.73712,SVM,0.710868,1.878212,"SVC(C=1, cache_size=200, class_weight=None, co..."
7,0.714943,Tree,0.702758,0.014868,"DecisionTreeClassifier(class_weight=None, crit..."


In [10]:
# Show the tuned MLP estimator. The row is selected by classifier name rather than
# by the hard-coded position 3, which only pointed at the MLP because of the
# alphabetical order of the current set of classifiers.
df_model_comparison.loc[df_model_comparison['Classifier'] == 'MLP', 'params'].iloc[0]

MLPClassifier(activation='relu', alpha=1, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='sgd', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [None]:
## interface design
import streamlit as st
import pickle
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# ---- Data preparation (same steps as the analysis cells above) ----
x = pd.read_csv('heloc_dataset_v1.csv', header=0)

# Encode the label in the first column: 'Good' -> 1, anything else -> 0.
# Assigning through the column name replaces the column, giving it an integer dtype.
x[x.columns[0]] = (x.iloc[:, 0] == 'Good').astype(int)

# Drop rows whose ExternalRiskEstimate is -9 and turn the remaining -7 / -8 entries
# into NaN. The vectorized replace also removes the old `x.loc[0]` lookup, which
# was a *label* lookup and would raise KeyError if row 0 had been filtered out.
x = x[x.ExternalRiskEstimate != -9].replace([-7, -8], np.nan)

# Mean-impute the NaNs. sklearn.preprocessing.Imputer was removed in scikit-learn
# 0.22; SimpleImputer is its replacement. The instance gets its own lowercase name
# instead of shadowing the class it was created from.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
x = pd.DataFrame(imputer.fit_transform(x))

# Seed NumPy's global RNG so the unseeded train_test_split below is repeatable.
np.random.seed(1)

# Column 0 is the target, columns 1..23 are the features.
X = x.iloc[:, 1:24]
Y = x.iloc[:, 0]

# Train/test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

# Feature scaling: fit on the training split only, then apply to both splits.
# `scaler` is kept at module level so user input can be transformed the same way.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Page title and section headers
st.title('Credit Risk Assessment')
st.header('Introduction Sample')
st.header('Information Collection')

# One slider per model feature. Each slider starts at 0 and is capped at the given
# maximum; the chosen value is stored under the feature's name for the prediction step.
ExternalRiskEstimate = st.slider("External Risk Estimate", min_value=0, max_value=100)
MSinceOldestTradeOpen = st.slider("Months Since Oldest Trade Open", min_value=0, max_value=810)
MSinceMostRecentTradeOpen = st.slider("Months Since Most Recent Trade Open", min_value=0, max_value=400)
AverageMInFile = st.slider("Average Months in File", min_value=0, max_value=400)
NumSatisfactoryTrades = st.slider("Satisfactory Trades Number", min_value=0, max_value=80)
NumTrades60Ever2DerogPubRec = st.slider("Trades 60+ Ever", min_value=0, max_value=20)
NumTrades90Ever2DerogPubRec = st.slider("Trades 90+ Ever", min_value=0, max_value=20)
NumTotalTrades = st.slider("Total Number of Credit Accounts", min_value=0, max_value=110)
PercentInstallTrades = st.slider('Percent Installment Trades', min_value=0, max_value=110)
NumTradesOpeninLast12M = st.slider("Number of Trades Open in Last 12 Months", min_value=0, max_value=20)
PercentTradesNeverDelq = st.slider("Percent Trades Never Delinquent", min_value=0, max_value=100)
MSinceMostRecentDelq = st.slider("Months Since Most Recent Delinquent", min_value=0, max_value=90)
MaxDelq2PublicRecLast12M = st.slider("Max Delq/Public Records Last 12 Months", min_value=0, max_value=7)
MaxDelqEver = st.slider("Max Delinquency Ever", min_value=0, max_value=8)
MSinceMostRecentInqexcl7days = st.slider("Months Since Most Recent Inquiry excl 7days", min_value=0, max_value=30)
NumInqLast6M = st.slider("Number of Inquiry Last 6 Months", min_value=0, max_value=70)
NumInqLast6Mexcl7days = st.slider("Number of Inquiry Last 6 Months excl 7days", min_value=0, max_value=70)
NetFractionRevolvingBurden = st.slider("Net Fraction Revolving Burden", min_value=0, max_value=240)
NetFractionInstallBurden = st.slider('Net Fraction Install Burden', min_value=0, max_value=480)
NumRevolvingTradesWBalance = st.slider("Revolving Trades with Balance", min_value=0, max_value=40)
NumInstallTradesWBalance = st.slider('Installment Trades with Balance Number', min_value=0, max_value=25)
NumBank2NatlTradesWHighUtilization = st.slider("Bank/National Trades with high utilization ratio Number", min_value=0, max_value=20)
PercentTradesWBalance = st.slider("Percent Trades with Balance", min_value=0, max_value=100)

# Fit the MLP configuration selected by the grid search on the scaled training data,
# then classify the applicant described by the sliders.
MLP = MLPClassifier(activation='relu', alpha=1, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='sgd', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)
MLP.fit(X_train, Y_train)

# Slider values arranged in the order of the training feature columns. The previous
# vector passed MSinceMostRecentTradeOpen twice, never passed AverageMInFile, and
# placed NumTotalTrades / PercentInstallTrades / NumTradesOpeninLast12M ahead of
# the delinquency columns, so most values were fed to the wrong input.
# The first 14 positions follow the column listing printed by x.info() earlier in
# the notebook. NOTE(review): that listing is truncated, so the last nine positions
# keep their previous relative order -- confirm against the CSV header.
applicant = np.array([
    ExternalRiskEstimate,
    MSinceOldestTradeOpen,
    MSinceMostRecentTradeOpen,
    AverageMInFile,
    NumSatisfactoryTrades,
    NumTrades60Ever2DerogPubRec,
    NumTrades90Ever2DerogPubRec,
    PercentTradesNeverDelq,
    MSinceMostRecentDelq,
    MaxDelq2PublicRecLast12M,
    MaxDelqEver,
    NumTotalTrades,
    NumTradesOpeninLast12M,
    PercentInstallTrades,
    MSinceMostRecentInqexcl7days,
    NumInqLast6M,
    NumInqLast6Mexcl7days,
    NetFractionRevolvingBurden,
    NetFractionInstallBurden,
    NumRevolvingTradesWBalance,
    NumInstallTradesWBalance,
    NumBank2NatlTradesWHighUtilization,
    PercentTradesWBalance,
]).reshape(1, -1)

# The network was trained on standardized features, so the raw slider values must
# go through the same fitted scaler before being passed to predict.
res = MLP.predict(scaler.transform(applicant))

# predict returns a one-element array; class 0 is shown as a rejection.
if res[0] == 0:
    st.write('Prediction: ', 'Reject')
else:
    st.write('Prediction: ', 'Accept')