# **Automating the credit approval process using Machine Learning**
## RSM316
## Professor Kan
## 07 April 2020
##### Ramis Najam


### Import the relevant packages 

In [None]:
%matplotlib inline
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer

  import pandas.util.testing as tm


### Read the original data and split it into two sets, 70% of the data is for training, 30% of the data is for validation (you can have a different split)

In [None]:
# Suppress library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Load the labelled data; the first CSV column is used as the row index.
original_data = pd.read_csv('train_data.csv', index_col=0)

# Shuffle and split the rows 70/30 with a fixed seed: train_data is used for
# fitting, valid_data is held out for model comparison.
train_data, valid_data = train_test_split(
    original_data,
    test_size=0.3,
    random_state=0,
    shuffle=True,
)

# Renumber the training rows 0..n-1 (the shuffle leaves the index out of order).
train_data = train_data.reset_index(drop=True)

## Naive feature engineering

In [None]:
# Naive feature engineering: one-hot encode the categorical columns, then
# standardise selected columns using statistics from the training rows only.

# Stack the validation rows underneath the training rows so both parts are
# encoded together; the last len(valid_data) rows are the validation rows.
train_data = pd.concat([train_data, valid_data], axis=0)
train_data.index = range(len(train_data))
# Collapse OCCUPATION to a binary flag: every value other than 1 becomes 0.
train_data.loc[train_data['OCCUPATION']!=1,'OCCUPATION'] = 0
cat_vars = ['MARRIAGE', 'EDUCATION']
# One encoder per categorical variable, fitted on the combined (train + validation) rows.
encoders = [OneHotEncoder(categories='auto') for _ in range(len(cat_vars))] 
encoded_tr = [encoders[i].fit_transform(train_data[[cat_var]]).todense() for i,cat_var in enumerate(cat_vars)]
# Features = every column except the last one, minus the raw categorical
# columns, plus the one-hot columns (which arrive with integer labels 0..k-1).
X = pd.concat([train_data.iloc[:,:-1].drop(cat_vars, axis=1), 
                     pd.DataFrame(np.concatenate(encoded_tr, axis=1))], axis=1)
# The last column of the frame is used as the target.
y = train_data.iloc[:,-1] 
# NOTE(review): this mapping assumes MARRIAGE encodes to 3 columns and
# EDUCATION to 7 (10 in total) -- confirm against the data.
X = X.rename(columns={0:'Marriage 1',1:'Marriage 2',2:'Marriage 3',3:'Edu 1',4:'Edu 2',5:'Edu 3',
                                  6:'Edu 4',7:'Edu 5',8:'Edu 6',9:'Edu 7'})
# Drop one dummy column per categorical variable.
X = X.drop(['Marriage 3','Edu 7'], axis=1)

# Split the combined frame back apart: the tail is the validation set and is
# then removed in place from X and y, leaving only the training rows.
X_valid = pd.DataFrame(X.iloc[-len(valid_data):])
y_valid = train_data.iloc[-len(valid_data):,-1] 
X.drop(X.tail(len(valid_data)).index, inplace=True)
y.drop(y.tail(len(valid_data)).index, inplace=True)

# Standardise the columns at these positions. The mean and standard deviation
# come from the training rows and are reused for the validation rows.
# NOTE(review): the positions are hard-coded; presumably they are the
# continuous features -- verify against the column order of train_data.csv.
for i in [0,1,2,3,4,5,8]:
    X1 = X.iloc[:,i]
    mean = X1.mean()
    std = X1.std()
    X.iloc[:,i] = (X.iloc[:,i]-mean)/std
    X_valid.iloc[:,i] = (X_valid.iloc[:,i]-mean)/std

### Create a custom score function for cross-validation

In [None]:
def custom_loss(ground_truth, predictions):
    """Return the balanced accuracy: the mean of the TPR and the TNR.

    Despite its name this is a score (higher is better), not a loss.

    Args:
        ground_truth: 1-D array-like of true binary labels.
        predictions: 1-D array-like of predicted labels, same length.

    Returns:
        (TPR + TNR) / 2 as a float. The larger of the two labels is treated
        as the positive class, matching the sorted-label convention of
        sklearn's confusion_matrix. If one class never occurs in
        ``ground_truth`` its rate is undefined and only the other rate is
        returned, instead of NaN or an unpacking error.

    Raises:
        ValueError: if the inputs are empty, differ in length, or contain
            more than two distinct labels.
    """
    truth = np.asarray(ground_truth).ravel()
    preds = np.asarray(predictions).ravel()
    if truth.shape != preds.shape:
        raise ValueError("ground_truth and predictions must have the same length")
    if truth.size == 0:
        raise ValueError("custom_loss needs at least one sample")

    labels = np.unique(np.concatenate([truth, preds]))
    if labels.size > 2:
        raise ValueError("custom_loss only supports binary targets")
    if labels.size == 1:
        # A single label across both inputs means every prediction is correct.
        return 1.0

    positive_label = labels[1]
    actual_pos = truth == positive_label
    predicted_pos = preds == positive_label

    # Collect only the rates that are defined (non-zero denominator).
    rates = []
    n_pos = np.count_nonzero(actual_pos)
    if n_pos:
        rates.append(np.count_nonzero(actual_pos & predicted_pos) / n_pos)
    n_neg = truth.size - n_pos
    if n_neg:
        rates.append(np.count_nonzero(~actual_pos & ~predicted_pos) / n_neg)
    return sum(rates) / len(rates)

# Wrap custom_loss as a scikit-learn scorer for use in GridSearchCV.
# greater_is_better=True marks it as a score to maximise, despite the "loss" name.
my_custom_loss = make_scorer(custom_loss,greater_is_better=True)

### Import a number of classifiers from sklearn and <a href="https://xgboost.readthedocs.io/en/latest/index.html" target="_blank">XGBoost</a>

In [None]:
# Starter Code Imports
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
# Our Extra Imports
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import BaggingClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Import for combining classifiers
from sklearn.ensemble import VotingClassifier


### Fitting various models using the data.   Here I use the default options for each classifier and fit the model using the training data and verify its performance on the validation data. 

In [None]:
# These are the 16 models I test, each with its default hyper-parameters.
# Every estimator that uses randomness is seeded with random_state=0 so the
# scores below are reproducible from run to run.
default_models = [
    # 8 models given in the starter code:
    ('Default LDA', LinearDiscriminantAnalysis()),
    ('Default LR', LogisticRegression()),
    ('Default SVM', LinearSVC(random_state=0)),
    ('Default KNN', KNeighborsClassifier()),
    ('Default DecisionTree', DecisionTreeClassifier(random_state=0)),
    ('Default RandomForest', RandomForestClassifier(random_state=0)),
    ('Default GradientBoost', GradientBoostingClassifier(random_state=0)),
    ('Default XGBoost', XGBClassifier(random_state=0)),
    # 8 new models I added:
    ('Default Ridge', RidgeClassifier()),
    ('Default BaggingClassifier', BaggingClassifier(random_state=0)),
    ('Default Gaussian Naive Bayes', GaussianNB()),
    ('Default QDA', QuadraticDiscriminantAnalysis()),
    ('Default Multi-layerPerceptron', MLPClassifier(random_state=0)),
    ('Default ExtraTreesClassifier', ExtraTreesClassifier(random_state=0)),
    ('Default AdaBoostClassifier', AdaBoostClassifier(random_state=0)),
    ('Default GaussianProcessClassifier', GaussianProcessClassifier()),
]

# Test the performance of the default classifiers on the validation data.
# Only the headline metric, (TPR + TNR) / 2, is reported; it comes from
# custom_loss so this cell and the cross-validation scorer share one definition.
# Accuracy and the individual rates were found redundant and are not computed;
# use accuracy_score / confusion_matrix on y_valid if they are needed.
for classifier, model in default_models:
    model.fit(X, y)
    score = custom_loss(y_valid, model.predict(X_valid))
    print("Classifier: {}".format(classifier))
    print("Average of True Positive and True Negative Rates = {:.4f}".format(score))
    print("\n")

Classifier: Default LDA
Average of True Positive and True Negative Rates = 0.8804


Classifier: Default LR
Average of True Positive and True Negative Rates = 0.8712


Classifier: Default SVM
Average of True Positive and True Negative Rates = 0.8712


Classifier: Default KNN
Average of True Positive and True Negative Rates = 0.8340


Classifier: Default DecisionTree
Average of True Positive and True Negative Rates = 0.7862


Classifier: Default RandomForest
Average of True Positive and True Negative Rates = 0.8526


Classifier: Default GradientBoost
Average of True Positive and True Negative Rates = 0.8827


Classifier: Default XGBoost
Average of True Positive and True Negative Rates = 0.8723


Classifier: Default Ridge
Average of True Positive and True Negative Rates = 0.8804


Classifier: Default BaggingClassifier
Average of True Positive and True Negative Rates = 0.8517


Classifier: Default Gaussian Naive Bayes
Average of True Positive and True Negative Rates = 0.8132


Classifier: 

### Here, I tune each of the classifiers. The parameters selected for tuning are shown for each Classifier. Next, Grid Search is used to find the best parameters using the options provided in the grids

In [None]:
"""
NOTE: This cell takes ~10 minutes to execute. This cell can be skipped during execution and the rest of the code will work as intended.
Here, I tune all the models that can be tuned. If a classifier cannot be tuned, I mention it below.
- 3 models saw no change after parameter tuning (LDA, LR, R)
- 6 models saw improvements after parameter tuning (KNN, SVM, DT, XGB, QDA, ABC)
- 4 models saw a worsening in performance after tuning (RF, GB, BC, ETC)
- 2 models did not have tuning capabilities/could not be meaningfully tuned (GNB, GP)
- 1 model was not tuned due to prohibitively long execution time (MLP)

Grid Search was used to tune all the parameters for the classifiers. The parameters to tune
were taken from the sklearn documentation. Unless otherwise stated, it served as the reference
for all parameter tuning.
https://scikit-learn.org/stable/supervised_learning.html#supervised-learning
For XGBoost, I used:
https://xgboost.readthedocs.io/en/latest/parameter.html
Other online sources were browsed to compare which parameters were tuned.
There was a significant trade-off between the number of parameters I tuned and the time it took.
For this reason, I believe that the 4 models saw worse performance because of the limited parameter grids provided.
Since Grid Search takes very long to run with large inputs, I saved the computationally intensive tuning
for the classifiers that naturally performed better.
"""

# Each entry is (display name, estimator, parameter grid).
# Estimators that use randomness are seeded with random_state=0, like their
# default counterparts, so the selected parameters are reproducible and the
# tuned-versus-default comparison is not distorted by run-to-run noise.
# Some grids contain combinations the estimator rejects (e.g. LDA 'svd' with
# shrinkage, LR 'l1' with 'lbfgs'); the search loop scores those as 0 through
# error_score=0, so they can never be selected.
tuned_models = [
    ('Tuned LDA', LinearDiscriminantAnalysis(), {
        'solver': ['lsqr', 'svd', 'eigen'],   # The different ways of solving the LDA
        'shrinkage': ['auto', None],          # Whether or not to use shrinkage
    }),
    ('Tuned LR', LogisticRegression(random_state=0), {
        'penalty': ['l1', 'l2', 'none'],      # Penalty methods in fitting the LR
        'C': [0.1, 1, 10, 100],               # Inverse regularization strength
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],  # The solving algorithm used
    }),
    # Generalized to SVC: it covers the linear kernel of the starter code's
    # LinearSVC as well as poly, rbf and sigmoid.
    ('Tuned SVM', SVC(), {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernel of the decision function
        'C': [100, 50, 10, 5, 1.0, 0.1, 0.01],           # Regularization parameter
        'gamma': ['scale'],                              # Defines influence per training example
    }),
    ('Tuned KNN', KNeighborsClassifier(), {
        'n_neighbors': range(1, 21),          # The number of neighbours used in the queries
        'weights': ['uniform', 'distance'],   # How the neighbours' votes are weighted
        # 'minkowski' with the default p=2 is the same distance as 'euclidean'.
        'metric': ['euclidean', 'manhattan', 'minkowski'],  # How distance is measured
    }),
    ('Tuned DecisionTree', DecisionTreeClassifier(random_state=0), {
        'criterion': ['gini', 'entropy'],     # Measure of impurity calculation
        'max_depth': [4, 6, 8, 12],           # Maximum depth of the tree, to prevent overfitting
    }),
    ('Tuned RandomForest', RandomForestClassifier(random_state=0), {
        'n_estimators': [10, 100],            # Number of trees in the forest
        'max_features': ['sqrt', 'log2'],     # Number of features to consider when deciding to split
    }),
    ('Tuned GradientBoost', GradientBoostingClassifier(random_state=0), {
        'learning_rate': [0.001, 0.01, 0.1],  # Shrink factor for the contribution per tree
        'n_estimators': [10, 100],            # Number of boosting stages to perform
        'subsample': [0.5, 0.7, 1.0],         # Fraction of the samples used to fit each tree
        'max_depth': [3, 7, 9],               # Maximum depth of each tree
    }),
    ('Tuned XGBoost', XGBClassifier(random_state=0), {
        'learning_rate': [0.001, 0.01, 0.1],  # Shrink factor for the contribution per tree
        'n_estimators': [10, 100],            # Number of boosting stages to perform
        'subsample': [0.5, 0.7, 1.0],         # Fraction of the samples used to fit each tree
        'max_depth': [3, 7, 9],               # Maximum depth of each tree
    }),
    ('Tuned Ridge', RidgeClassifier(), {
        'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],  # Regularization strength
    }),
    ('Tuned BaggingClassifier', BaggingClassifier(random_state=0), {
        'n_estimators': [10, 100],            # Number of base estimators
    }),
    # Gaussian Naive Bayes: there is no parameter tuning for this classifier.
    ('Tuned QDA', QuadraticDiscriminantAnalysis(), {
        'reg_param': [0.1, 0.2, 0.3, 0.4, 0.5],  # Regularization strength
    }),
    ('Tuned ExtraTreesClassifier', ExtraTreesClassifier(random_state=0), {
        'n_estimators': [1, 10, 100],         # Number of trees in the forest
        'criterion': ['gini', 'entropy'],     # Measure of impurity calculation
    }),
    ('Tuned AdaBoostClassifier', AdaBoostClassifier(random_state=0), {
        'algorithm': ['SAMME', 'SAMME.R'],    # Algorithm used to calculate the boosting
        'n_estimators': [1, 10, 50, 100],     # Number of boosting stages to perform
    }),
    # GaussianProcessClassifier: the parameters provided do not allow for
    # meaningful tuning of this classifier.
]

# Multi-layer Perceptron: tuning this neural network was prohibitively
# time-consuming (many hours), so it is disabled by default. The grid is kept
# to show the search I intended to run; set TUNE_MLP = True to include it.
TUNE_MLP = False
mlp_grid = {
    'activation': ['identity', 'logistic', 'tanh', 'relu'],      # Activation function for the hidden layer
    'solver': ['lbfgs', 'sgd', 'adam'],                          # Solver for weight optimization
    'alpha': [0.0001, 0.05],                                     # L2 penalty (regularization parameter)
    'learning_rate': ['constant', 'adaptive', 'invscaling'],     # Learning rate schedule for weight updates
}
if TUNE_MLP:
    tuned_models.append(('Tuned MLP', MLPClassifier(random_state=0), mlp_grid))

print("Tuned Algorithms!")

# One splitter is shared by every search. With an integer random_state it
# yields the same 10-fold x 3-repeat splits each time, so all classifiers are
# compared on identical folds.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

for classifier, model, grid in tuned_models:
    # error_score=0 scores parameter combinations the estimator rejects as 0
    # instead of aborting the whole search.
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                               scoring=my_custom_loss, error_score=0)
    grid_search.fit(X, y)

    # Evaluate the refitted best estimator on the held-out validation data.
    # Only the headline metric is reported; accuracy and the individual rates
    # were found redundant and are not computed.
    score = custom_loss(y_valid, grid_search.predict(X_valid))
    print("Classifier: {}".format(classifier))
    print("Average of True Positive and True Negative Rates = {:.4f}".format(score))

    # Print the best parameter values found by the search.
    print("Best output using {}".format(grid_search.best_params_))
    print("\n")

Tuned Algorithms!
Classifier: Tuned LDA
Average of True Positive and True Negative Rates = 0.8804
Best output using {'shrinkage': 'auto', 'solver': 'lsqr'}


Classifier: Tuned LR
Average of True Positive and True Negative Rates = 0.8712
Best output using {'C': 1, 'penalty': 'l2', 'solver': 'newton-cg'}


Classifier: Tuned SVM
Average of True Positive and True Negative Rates = 0.8804
Best output using {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


Classifier: Tuned KNN
Average of True Positive and True Negative Rates = 0.8175
Best output using {'metric': 'manhattan', 'n_neighbors': 13, 'weights': 'distance'}


Classifier: Tuned DecisionTree
Average of True Positive and True Negative Rates = 0.8496
Best output using {'criterion': 'entropy', 'max_depth': 4}


Classifier: Tuned RandomForest
Average of True Positive and True Negative Rates = 0.8630
Best output using {'max_features': 'sqrt', 'n_estimators': 100}


Classifier: Tuned GradientBoost
Average of True Positive and True Negative

# In this step, I find the optimal threshold for each of the default and tuned classifiers. As a Bonus, I also use the VotingClassifier class to combine the classifiers. Grid Search is then used to find the optimal weighting to assign to this combination classifier.

In [None]:
# Build every classifier whose decision threshold is tuned below: the
# (1) default and (2) tuned estimators. The (3) combination classifier is
# appended further down.
# Only estimators exposing predict_proba can take part, so LinearSVC, SVC and
# RidgeClassifier are instantiated for reference but left out of the list.

# ---- Default classifiers ----
dlda = LinearDiscriminantAnalysis()
dlr = LogisticRegression()
dsvm = LinearSVC()          # excluded: cannot give probabilities
dknn = KNeighborsClassifier()
ddt = DecisionTreeClassifier(random_state=0)
drf = RandomForestClassifier(random_state=0)
dgb = GradientBoostingClassifier(random_state=0)
dxgb = XGBClassifier(random_state=0)
dr = RidgeClassifier()      # excluded: cannot give probabilities
dbg = BaggingClassifier()
dgnb = GaussianNB()
dqda = QuadraticDiscriminantAnalysis()
dmlp = MLPClassifier()
detc = ExtraTreesClassifier()
dabc = AdaBoostClassifier()
dgpc = GaussianProcessClassifier()

# ---- Tuned classifiers ----
# Parameters are the best ones reported by Grid Search in the tuning cell.
tlda = LinearDiscriminantAnalysis(shrinkage='auto', solver='lsqr')
tlr = LogisticRegression(C=1, penalty='l2', solver='newton-cg')
tsvm = SVC(C=0.1, gamma='scale', kernel='linear')   # excluded: no predict_proba as configured
tknn = KNeighborsClassifier(metric='manhattan', n_neighbors=13, weights='distance')
tdt = DecisionTreeClassifier(criterion='entropy', max_depth=4)
# NOTE(review): the recorded Grid Search output reports max_features='sqrt'
# for the random forest, not 'log2' -- confirm which run this came from.
trf = RandomForestClassifier(max_features='log2', n_estimators=100)
# Originally flagged by the author as "OUR BEST classifier".
# NOTE(review): the conclusion further down names XGBoost instead -- confirm.
tgb = GradientBoostingClassifier(learning_rate=0.1, max_depth=9, n_estimators=100, subsample=0.5)
txgb = XGBClassifier(learning_rate=0.01, max_depth=7, n_estimators=100, subsample=0.5)
tr = RidgeClassifier(alpha=0.1)                     # excluded: cannot give probabilities
tbg = BaggingClassifier(n_estimators=100)
tqda = QuadraticDiscriminantAnalysis(reg_param=0.1)
tetc = ExtraTreesClassifier(criterion='entropy', n_estimators=100)
tabc = AdaBoostClassifier(algorithm='SAMME', n_estimators=10)

# List of classifiers for the threshold search: defaults first, then tuned.
check_thresholds = [
    dlda, dlr, dknn, ddt, drf, dgb, dxgb,
    dbg, dgnb, dqda, dmlp, detc, dabc, dgpc,
]
check_thresholds += [
    tlda, tlr, tknn, tdt, trf, tgb, txgb,
    tbg, tqda, tetc, tabc,
]


# --------------------------------------------Combination classifier!-----------------------------------------------------------
# Soft-voting ensemble of 14 classifiers (Ridge and SVM do not support the
# 'predict_proba' method, hence they cannot be used). For each classifier the
# default or the tuned version is used, as chosen by the author.
# All members are weighted equally. The ensemble was previously wrapped in
# GridSearchCV with an empty parameter grid, which tuned nothing: it only ran
# 30 extra cross-validation fits before refitting this same ensemble on the
# full training data. The wrapper is dropped; to actually search voting
# weights, wrap it again with param_grid={'weights': [...]} and
# scoring=my_custom_loss.
# Reference: https://scikit-learn.org/stable/modules/ensemble.html
combo = VotingClassifier(
    estimators=[
        ('TunedKNNeighbours', tknn),
        ('TunedLinearDiscrimantAnalysis', tlda),
        ('TunedLogisticRegression', tlr),
        ('TunedDecisionTree', tdt),
        ('DefaultRandomForest', drf),
        ('DefaultGradientBoosting', dgb),
        ('TunedXGBoost', txgb),
        ('DefaultBaggingClassifier', dbg),
        ('TunedQuadraticDiscriminantAnalysis', tqda),
        ('DefaultExtraTrees', detc),
        ('TunedAdaBoost', tabc),
        ('DefaultGaussianNB', dgnb),
        ('DefaultMultilayerPerceptron', dmlp),
        ('DefaultGaussianProcess', dgpc),
    ],
    voting='soft',
)
check_thresholds.append(combo)

# Fit all the classifiers on the training data.
for c in check_thresholds:
    c.fit(X, y)

# Candidate decision thresholds applied to the predicted probability of class 1.
THRESHOLD = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
# Set to True to also print the full per-threshold table for each classifier.
SHOW_THRESHOLD_TABLE = False
RESULT_COLUMNS = ["THRESHOLD", "accuracy", "true positive rate", "true negative rate", "(TPR+TNR)/2"]

# Find the optimal threshold for each classifier (prints the classifiers in order).
for c in check_thresholds:
    Q = c.predict_proba(X_valid)[:, 1]
    rows = []
    best_threshold, best_score = 0, 0
    for threshold in THRESHOLD:
        # If prob > threshold, predict 1.
        preds = np.where(Q > threshold, 1, 0)
        valid_accuracy = accuracy_score(y_valid, preds)
        # labels=[0, 1] keeps the matrix 2x2 even when a threshold makes every
        # prediction fall into one class (the targets are coded 0/1, as the
        # np.where call above assumes).
        TN, FP, FN, TP = confusion_matrix(y_valid, preds, labels=[0, 1]).ravel()
        TPR = TP / (TP + FN)
        TNR = TN / (TN + FP)
        score = (TPR + TNR) / 2
        rows.append([threshold, valid_accuracy, TPR, TNR, score])
        # Strict '>' keeps the lowest threshold when several tie for the best score.
        if score > best_score:
            best_score = score
            best_threshold = threshold
    if SHOW_THRESHOLD_TABLE:
        results = pd.DataFrame(rows, columns=RESULT_COLUMNS)
        print(results.T.to_string(header=False))
    # The first 13 characters of the estimator's repr serve as a short label.
    print(str(c)[0:13] + ", Threshold: " + str(best_threshold) + ", max " + str(best_score))
    print('\n')

LinearDiscrim, Threshold: 0.2, max 0.8804347826086957


LogisticRegre, Threshold: 0.4, max 0.885643115942029


KNeighborsCla, Threshold: 0.4, max 0.8340126811594203


DecisionTreeC, Threshold: 0.1, max 0.786231884057971


RandomForestC, Threshold: 0.3, max 0.870018115942029


GradientBoost, Threshold: 0.5, max 0.8826992753623188


XGBClassifier, Threshold: 0.6, max 0.880661231884058


BaggingClassi, Threshold: 0.3, max 0.8731884057971014


GaussianNB(pr, Threshold: 0.1, max 0.8534873188405797


QuadraticDisc, Threshold: 0.3, max 0.7753623188405797


MLPClassifier, Threshold: 0.6, max 0.8713768115942029


ExtraTreesCla, Threshold: 0.3, max 0.8659420289855073


AdaBoostClass, Threshold: 0.5, max 0.8546195652173914


GaussianProce, Threshold: 0.5, max 0.8360507246376812


LinearDiscrim, Threshold: 0.2, max 0.8804347826086957


LogisticRegre, Threshold: 0.4, max 0.885643115942029


KNeighborsCla, Threshold: 0.4, max 0.8525815217391304


DecisionTreeC, Threshold: 0.1, max 0.870018115942029


## I find that the XGBoost Classifier with learning_rate=0.01, max_depth=7, n_estimators=100, and subsample=0.5 provides the best (TPR+TNR)/2. Further, this occurs at an optimal threshold of 0.5.

Interestingly, the XGBoost Classifier works better than even the combination classifier, which seems to suggest that the other classifiers are not as efficient and do not generate any predictive value in combination with the XGBoost Classifier.

In [None]:
# Rebuild the features for the final model: the full training file is used for
# fitting and the evaluation file is encoded and scaled alongside it.
train_data = pd.read_csv('train_data.csv',index_col=0)
### In the real test, sample_test_data.csv will be replaced by test_data.csv
eval_data = pd.read_csv('sample_test_data.csv',index_col=0)
# Stack the evaluation rows underneath the training rows so both parts are
# encoded together; the last len(eval_data) rows are the evaluation rows.
train_data = pd.concat([train_data, eval_data], axis=0)
train_data.index = range(len(train_data))
# Collapse OCCUPATION to a binary flag: every value other than 1 becomes 0.
train_data.loc[train_data['OCCUPATION']!=1,'OCCUPATION'] = 0
cat_vars = ['MARRIAGE', 'EDUCATION']
# One encoder per categorical variable, fitted on the combined (train + evaluation) rows.
encoders = [OneHotEncoder(categories='auto') for _ in range(len(cat_vars))] 
encoded_tr = [encoders[i].fit_transform(train_data[[cat_var]]).todense() for i,cat_var in enumerate(cat_vars)]
# Features = every column except the last one, minus the raw categorical
# columns, plus the one-hot columns (which arrive with integer labels 0..k-1).
X = pd.concat([train_data.iloc[:,:-1].drop(cat_vars, axis=1), pd.DataFrame(np.concatenate(encoded_tr, axis=1))], axis=1)
# The last column of the frame is used as the target.
y = train_data.iloc[:,-1] 
# NOTE(review): this mapping assumes MARRIAGE encodes to 3 columns and
# EDUCATION to 7 (10 in total) -- confirm this also holds for the evaluation file.
X = X.rename(columns={0:'Marriage 1',1:'Marriage 2',2:'Marriage 3',3:'Edu 1',4:'Edu 2',5:'Edu 3',
                                  6:'Edu 4',7:'Edu 5',8:'Edu 6',9:'Edu 7'})
# Drop one dummy column per categorical variable.
X = X.drop(['Marriage 3','Edu 7'], axis=1)

# Split the combined frame back apart: the tail is the evaluation set and is
# then removed in place from X and y, leaving only the training rows.
X_test = pd.DataFrame(X.iloc[-len(eval_data):])
y_test = train_data.iloc[-len(eval_data):,-1] 
X.drop(X.tail(len(eval_data)).index, inplace=True)
y.drop(y.tail(len(eval_data)).index, inplace=True)

# Standardise the columns at these positions. The mean and standard deviation
# come from the training rows and are reused for the evaluation rows.
# NOTE(review): the positions are hard-coded; presumably they are the
# continuous features -- verify against the column order of train_data.csv.
for i in [0,1,2,3,4,5,8]:
    X1 = X.iloc[:,i]
    mean = X1.mean()
    std = X1.std()
    X.iloc[:,i] = (X.iloc[:,i]-mean)/std
    X_test.iloc[:,i] = (X_test.iloc[:,i]-mean)/std

In [None]:
### THIS IS THE CHOSEN CLASSIFIER
### THRESHOLD OF 0.5 is used
### txbg is the name of the chosen classifier
# The hyper-parameters are those of the selected tuned XGBoost model
# (learning_rate=0.01, max_depth=7, n_estimators=100, subsample=0.5).
# learning_rate was previously 0.1 here, which did not match that selection.
THRESHOLD = 0.5
txbg = XGBClassifier(n_estimators=100, learning_rate=0.01, subsample=0.5, max_depth=7)
txbg.fit(X, y)
# Probability of class 1 for every test row; predict 1 when it exceeds the threshold.
Q = txbg.predict_proba(X_test)[:, 1]
y_test_pred = np.where(Q > THRESHOLD, 1, 0)
print(y_test_pred)

[0]
