In [488]:
import pandas as pd
import numpy as np
import sklearn
from statistics import mean 
from sklearn import linear_model,naive_bayes,neighbors,ensemble
from sklearn import preprocessing,model_selection
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost import XGBClassifier

In [489]:
from sklearn import metrics as skmetrics
class ClassificationMetrics():
    def __init__(self):
        self.metrics={
            'accuracy':self._accuracy,
            'f1':self._f1,
            'recall':self._recall,
            'precision':self._precision,
            'auc':self._auc,
            'logloss':self._logloss
        }
        
    def __call__(self,metric,y_test,y_pred,y_proba=None):
        if metric not in self.metrics:
            raise Exception('Metric not known')
        if metric=='auc':
            if y_proba is not None:
                return self._auc(y_test=y_test,y_pred=y_proba)
            else:
                raise Exception('y_proba can\'t be none')
        if metric=='logloss':
            if y_proba is not None:
                return self._logloss(y_test=y_test,y_pred=y_proba)
            else:
                raise Exception('y_proba can\'t be none')
        else:
            return self.metrics[metric](y_test=y_test,y_pred=y_pred)
    
    @staticmethod
    def _accuracy(y_test,y_pred):
        return skmetrics.accuracy_score(y_true=y_test,y_pred=y_pred)
    
    @staticmethod
    def _f1(y_test,y_pred):
        return skmetrics.f1_score(y_true=y_test,y_pred=y_pred)
    
    @staticmethod
    def _recall(y_test,y_pred):
        return skmetrics.recall_score(y_true=y_test,y_pred=y_pred)
    
    @staticmethod
    def _precision(y_test,y_pred):
        return skmetrics.precision_score(y_true=y_test,y_pred=y_pred)
    
    @staticmethod
    def _auc(y_test,y_pred):
        return skmetrics.roc_auc_score(y_true=y_test,y_score=y_pred)
    
    @staticmethod
    def _logloss(y_test,y_pred):
        return skmetrics.log_loss(y_true=y_test,y_pred=y_pred)

In [491]:
train_backup=pd.read_csv('train.csv')
test_backup=pd.read_csv('test.csv')
sample=pd.read_csv('sample_submission.csv')

# Preprocessing

In [534]:
train=train_backup.copy()
test=test_backup.copy()

In [535]:
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [536]:
train.shape

(614, 13)

In [537]:
#checking nulls
train.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [538]:
#imputing missing values:
col=['Gender','Married','Self_Employed','Dependents','Credit_History','Loan_Amount_Term']
cols=[]
for c in col:
    train.loc[:,c]=train.loc[:,col].fillna('-99999').astype(str)

In [539]:
#number of catergories
for c in train.columns.values:
    print(c,train[c].nunique())

Loan_ID 614
Gender 3
Married 3
Dependents 5
Education 2
Self_Employed 3
ApplicantIncome 505
CoapplicantIncome 287
LoanAmount 203
Loan_Amount_Term 11
Credit_History 3
Property_Area 3
Loan_Status 2


In [540]:
#categorical encoding
cols_le=['Dependents','Education','Property_Area','Loan_Amount_Term','Loan_Status']
for c in cols_le:
    le=preprocessing.LabelEncoder()
    le.fit(train[c].values)
    train.loc[:,c]=le.transform(train[c].values)

In [541]:
cols=['Gender','Married','Self_Employed','Credit_History']
for c in cols:
    df=pd.get_dummies(train[c],prefix=c,prefix_sep=':')
    train=pd.concat([train,df],axis=1)

In [542]:
train.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
       'Gender:-99999', 'Gender:Female', 'Gender:Male', 'Married:-99999',
       'Married:No', 'Married:Yes', 'Self_Employed:-99999', 'Self_Employed:No',
       'Self_Employed:Yes', 'Credit_History:-99999', 'Credit_History:0.0',
       'Credit_History:1.0'],
      dtype='object')

In [550]:
train=train.drop(['Gender','Married','Self_Employed','Gender:-99999','Married:-99999','Self_Employed:-99999','Credit_History:-99999','Credit_History'],axis=1)

In [544]:
train=train.fillna(0)

In [551]:
train.head()

Unnamed: 0,Loan_ID,Dependents,Education,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Property_Area,Loan_Status,Gender:Female,Gender:Male,Married:No,Married:Yes,Self_Employed:No,Self_Employed:Yes,Credit_History:0.0,Credit_History:1.0
0,LP001002,1,0,0.070489,0.0,0.0,7,2,1,0,1,1,0,1,0,0,1
1,LP001003,2,0,0.05483,0.036192,0.182857,7,0,0,0,1,0,1,1,0,0,1
2,LP001005,1,0,0.03525,0.0,0.094286,7,2,1,0,1,0,1,0,1,0,1
3,LP001006,1,1,0.030093,0.056592,0.171429,7,2,1,0,1,0,1,1,0,0,1
4,LP001008,1,0,0.072356,0.0,0.201429,7,2,1,0,1,1,0,1,0,0,1


In [546]:
# Normalization
col_nor=['ApplicantIncome','CoapplicantIncome','LoanAmount']
for c in col_nor:
    sc=preprocessing.MinMaxScaler(feature_range=(0,1),copy=False)
    sc.fit(train[c].values.reshape(-1,1))
    train[c]=sc.transform(train[c].values.reshape(-1,1))

In [None]:
#train.to_csv('prepTrain.csv')   

# Cross Validation
## *StratifiedKFold*

In [594]:
df=train.copy(deep=True)
df['kfold']=-1   
df=df.sample(frac=1).reset_index(drop=True) # just resampling the dataset
kf=model_selection.StratifiedKFold(n_splits=5,shuffle=False)
    
for fold,(train_idx,val_idx) in enumerate(kf.split(X=df,y=df.Loan_Status.values)):
    print(len(train_idx),len(val_idx))
    df.loc[val_idx,'kfold']=fold

491 123
491 123
491 123
491 123
492 122


# Training Different Models and Metrics calculation

In [595]:
# Base models
models={
    'logistic':linear_model.LogisticRegression(C=0.08858667904100823, penalty='l1', solver='liblinear'),
    'naive_bayes':naive_bayes.GaussianNB(),
    'knn':neighbors.KNeighborsClassifier(n_neighbors= 6, p= 2, weights= 'uniform'),
    'rf':ensemble.RandomForestClassifier(n_estimators= 150,criterion= 'entropy' ),
    'extra_trees':ensemble.ExtraTreesClassifier(),
    'svm':sk.svm.SVC(C= 0.1, gamma= 'auto', kernel= 'linear',probability=True),
    'xgb':XGBClassifier(n_estimators=100,eta=0.015,max_depth=3,gamma=0.5,min_child_weight=9,reg_alpha=0.7,reg_lambda=0.3)
}


In [596]:

Fold_mapping={
    0:[1,2,3,4],
    1:[0,2,3,4],
    2:[0,1,3,4],
    3:[0,1,2,4],
    4:[0,1,2,3]
}
param=['accuracy','recall','precision','logloss','auc']
mod_list=['logistic','naive_bayes','knn','rf','extra_trees','svm','xgb']
for mod in mod_list:
    metric_dict=dict()
    for Fold in range(5):
        train_df=df[df.kfold.isin(Fold_mapping.get(Fold))]
        valid_df=df[df.kfold==Fold]


        y_train=train_df.Loan_Status.values
        y_valid=valid_df.Loan_Status.values

        x_train=train_df.drop(['Loan_ID','Loan_Status','kfold'],axis=1)
        x_valid=valid_df.drop(['Loan_ID','Loan_Status','kfold'],axis=1)

        x_valid=x_valid[x_train.columns]    # to make the order same
        #models
        clf=models[mod]
        clf.fit(x_train,y_train)
        y_pred=clf.predict(x_valid)
        y_prob=clf.predict_proba(x_valid)[:,1]
        #metrics
        a=ClassificationMetrics()
        for p in param:
            if p in metric_dict:
                try:
                    metric_dict[p].append(a(p,y_valid,y_pred))
                except:
                    metric_dict[p].append(a(p,y_valid,y_pred,y_prob))
            else:
                try:
                    metric_dict[p]=[a(p,y_valid,y_pred)]
                except:
                    metric_dict[p]=[a(p,y_valid,y_pred,y_prob)]
    print(mod)
    print(f"Average Accuracy:{mean(metric_dict['accuracy'])}")
    print(f"Average Recall:{mean(metric_dict['recall'])}")
    print(f"Average Precision:{mean(metric_dict['precision'])}")
    print(f"Average Logloss:{mean(metric_dict['logloss'])}")
    print(f"Average AUC:{mean(metric_dict['auc'])}")

logistic
Average Accuracy:0.809516193522591
Average Recall:0.9833893557422969
Average Precision:0.7914564480226342
Average Logloss:0.5012927633872821
Average AUC:0.7083446928028971
naive_bayes
Average Accuracy:0.7737038517926162
Average Recall:0.9099719887955182
Average Precision:0.7927771238662328
Average Logloss:1.3594031973208072
Average AUC:0.7303432790120097
knn
Average Accuracy:0.765453818472611
Average Recall:0.9240056022408963
Average Precision:0.7777953948875148
Average Logloss:2.26036425826646
Average AUC:0.6906155282625871
rf
Average Accuracy:0.7915500466480074
Average Recall:0.9406442577030812
Average Precision:0.7950189122239392
Average Logloss:0.5151787585616046
Average AUC:0.7410923141186299
extra_trees
Average Accuracy:0.7703718512594963
Average Recall:0.9003361344537815
Average Precision:0.7936619028845522
Average Logloss:0.8024506482952761
Average AUC:0.7413994450681757
svm
Average Accuracy:0.809516193522591
Average Recall:0.9833893557422969
Average Precision:0.791456

#### Logistic,Random Forest and SVC are only good models

## Previsous metrics
### logistic
Average Accuracy:0.7671064907370385
Average Recall:0.8958263305322128
Average Precision:0.7931367691448337
Average Logloss:0.5404100848288785
Average AUC:0.7131288817821325
### naive_bayes
Average Accuracy:0.7426096228175396
Average Recall:0.8435574229691877
Average Precision:0.7947590563759872
Average Logloss:0.9313870165224841
Average AUC:0.6985670624525113
### knn
Average Accuracy:0.7214847394375583
Average Recall:0.9359103641456583
Average Precision:0.7328899090925652
Average Logloss:2.474598830947766
Average AUC:0.6404954221904686
### rf
Average Accuracy:0.7394375583100093
Average Recall:0.8745098039215686
Average Precision:0.7757310879948649
Average Logloss:0.5568700269995099
Average AUC:0.7257576633892424
### extra_trees
Average Accuracy:0.7133813141410102
Average Recall:0.8389355742296919
Average Precision:0.7663507001272959
Average Logloss:0.72797666494634
Average AUC:0.6933479532163743
### svm
Average Accuracy:0.7085032653605224
Average Recall:0.9502801120448179
Average Precision:0.7195772229967209
Average Logloss:0.5446935716467013
Average AUC:0.7169051966265588
### xgb
Average Accuracy:0.7084366253498601
Average Recall:0.8245378151260504
Average Precision:0.7684979116255414
Average Logloss:0.792259316124252
Average AUC:0.7103909661030404

## After Tuning
logistic
Average Accuracy:0.8094228975076636
Average Recall:0.9834173669467787
Average Precision:0.7907408959372204
Average Logloss:0.49970764739757206
Average AUC:0.7058014001822052
### naive_bayes
Average Accuracy:0.7767959482873518
Average Recall:0.9123809523809524
Average Precision:0.793683694479851
Average Logloss:1.2673525484886972
Average AUC:0.7375304683276819
### knn
Average Accuracy:0.7507930161268825
Average Recall:0.9169747899159664
Average Precision:0.7665813886134883
Average Logloss:2.5865704974520027
Average AUC:0.6700294287755588
### rf
Average Accuracy:0.7882580301212848
Average Recall:0.9337254901960784
Average Precision:0.7953857165936374
Average Logloss:0.5102461414080115
Average AUC:0.7477308750760763
### extra_trees
Average Accuracy:0.765453818472611
Average Recall:0.8958263305322128
Average Precision:0.7911971183511737
Average Logloss:0.8242519904568911
Average AUC:0.7280337816638126
### svm
Average Accuracy:0.8094228975076636
Average Recall:0.9834173669467787
Average Precision:0.7907408959372204
Average Logloss:0.481464994229267
Average AUC:0.7155270151245383
### xgb
Average Accuracy:0.8094228975076636
Average Recall:0.9834173669467787
Average Precision:0.7907408959372204
Average Logloss:0.49776480110839066
Average AUC:0.7304500599160042

# Tuning of Models and Best Parameters

In [272]:
#data for hyper tuning parameters
x_ht=df.drop(['Loan_ID','Loan_Status','kfold'],axis=1)
y_ht=df.Loan_Status

In [375]:
#Grid Search
param_log={'penalty':['l1','l2'],
           'C':np.logspace(-4, 4, 20),
           'solver':['liblinear']
}
param_rf={
    'n_estimators':list(range(10,200,10)),
    'criterion':['gini','entropy']
}
param_knn={
    'n_neighbors':list(range(2,8)),
    'weights':['uniform','distance'],
    'p':[1,2]
}
param_svm={
    'C':[x*0.1 for x in range(1,10)],
    'kernel':['linear','rbf'],
    'gamma':['auto']
}
param_xgb={
    'n_estimaators':list(range(110,200,10)),
    'eta':[0.001,0.015,0.025,0.05,1],
    'gamma':np.arange(0,1,0.2),
    'max_depth':list(range(1,15,2)),
    'min_child_weight':[1,2,5,7],
    'reg_alpha':[0,0.1,0.5,1],
    'reg_lamda':[0.01,0.03,0.05,0.07,0.1,0.3,0.5,0.7,1]
    
}

In [557]:
clf_gs_log=linear_model.LogisticRegression()
clf_gs_knn=neighbors.KNeighborsClassifier()
clf_gs_rf=ensemble.RandomForestClassifier()
clf_gs_svm=sk.svm.SVC()
# clf_gs_xgb=XGBClassifier()

In [558]:
gs_log=GridSearchCV(clf_gs_log, param_grid = param_log, cv = 5, verbose=True, n_jobs=-1)
gs_rf=GridSearchCV(clf_gs_rf, param_grid = param_rf, cv = 5, verbose=True, n_jobs=-1)
gs_knn=GridSearchCV(clf_gs_knn, param_grid = param_knn, cv = 5, verbose=True, n_jobs=-1)
gs_svm=GridSearchCV(clf_gs_svm, param_grid = param_svm, cv = 5, verbose=True, n_jobs=-1)
# gs_xgb=GridSearchCV(clf_gs_xgb, param_grid = param_xgb,cv=2, verbose=True, n_jobs=-1)

In [559]:
gs_log.fit(x_ht,y_ht)
print(gs_log.best_estimator_,gs_log.best_score_,gs_log.best_params_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  71 tasks      | elapsed:    2.6s


LogisticRegression(C=0.08858667904100823, penalty='l1', solver='liblinear') 0.7703585232573638 {'C': 0.08858667904100823, 'penalty': 'l1', 'solver': 'liblinear'}


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    4.0s finished


In [560]:
gs_rf.fit(x_ht,y_ht)
print(gs_rf.best_estimator_,gs_rf.best_score_,gs_rf.best_params_)

Fitting 5 folds for each of 38 candidates, totalling 190 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 190 out of 190 | elapsed:   20.1s finished


RandomForestClassifier(criterion='entropy', n_estimators=140) 0.7524723443955752 {'criterion': 'entropy', 'n_estimators': 140}


In [342]:
gs_knn.fit(x_ht,y_ht)
print(gs_knn.best_estimator_,gs_knn.best_score_,gs_knn.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KNeighborsClassifier(n_neighbors=6) 0.7215380514460883 {'n_neighbors': 6, 'p': 2, 'weights': 'uniform'}


[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    0.7s finished


In [561]:
gs_svm.fit(x_ht,y_ht)
print(gs_svm.best_estimator_,gs_svm.best_score_,gs_svm.best_params_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


SVC(C=0.1, gamma='auto', kernel='linear') 0.7703585232573638 {'C': 0.1, 'gamma': 'auto', 'kernel': 'linear'}


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    0.6s finished


In [562]:
# gs_xgb.fit(x_ht,y_ht)
# print(gs_xgb.best_estimator_,gs_xdb.best_score_,gs_xgb.best_params_)

# Preprocessing test Data

In [567]:
test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [569]:
#checking nulls
test.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [572]:
#imputing missing values:
col=['Gender','Self_Employed','Dependents','Credit_History','Loan_Amount_Term']
cols=[]
for c in col:
    test.loc[:,c]=test.loc[:,col].fillna('-99999').astype(str)

In [573]:
#number of catergories
for c in test.columns.values:
    print(c,test[c].nunique())

Loan_ID 367
Gender 3
Married 2
Dependents 5
Education 2
Self_Employed 3
ApplicantIncome 314
CoapplicantIncome 194
LoanAmount 144
Loan_Amount_Term 13
Credit_History 3
Property_Area 3


In [574]:
#categorical encoding
cols_le=['Dependents','Education','Property_Area','Loan_Amount_Term']
for c in cols_le:
    le=preprocessing.LabelEncoder()
    le.fit(test[c].values)
    test.loc[:,c]=le.transform(test[c].values)

In [576]:
cols=['Gender','Married','Self_Employed','Credit_History']
for c in cols:
    df=pd.get_dummies(test[c],prefix=c,prefix_sep=':')
    test=pd.concat([test,df],axis=1)

In [577]:
test.columns.values

array(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome',
       'LoanAmount', 'Loan_Amount_Term', 'Credit_History',
       'Property_Area', 'Gender:-99999', 'Gender:Female', 'Gender:Male',
       'Married:No', 'Married:Yes', 'Self_Employed:-99999',
       'Self_Employed:No', 'Self_Employed:Yes', 'Credit_History:-99999',
       'Credit_History:0.0', 'Credit_History:1.0'], dtype=object)

In [580]:
test=test.drop(['Loan_ID','Gender','Married','Self_Employed','Gender:-99999','Self_Employed:-99999','Credit_History:-99999','Credit_History'],axis=1)

In [583]:
test=test.fillna(0)

In [584]:
# Normalization
col_nor=['ApplicantIncome','CoapplicantIncome','LoanAmount']
for c in col_nor:
    sc=preprocessing.MinMaxScaler(feature_range=(0,1),copy=False)
    sc.fit(test[c].values.reshape(-1,1))
    test[c]=sc.transform(test[c].values.reshape(-1,1))

# Prediction

In [586]:
test.columns.values

array(['Dependents', 'Education', 'ApplicantIncome', 'CoapplicantIncome',
       'LoanAmount', 'Loan_Amount_Term', 'Property_Area', 'Gender:Female',
       'Gender:Male', 'Married:No', 'Married:Yes', 'Self_Employed:No',
       'Self_Employed:Yes', 'Credit_History:0.0', 'Credit_History:1.0'],
      dtype=object)

In [565]:
final_x=train.drop(['Loan_ID','Loan_Status',],axis=1)
final_y=train.Loan_Status.values

In [587]:
# Test Data
test=test[final_x.columns]

In [589]:
test.head()

Unnamed: 0,Dependents,Education,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Property_Area,Gender:Female,Gender:Male,Married:No,Married:Yes,Self_Employed:No,Self_Employed:Yes,Credit_History:0.0,Credit_History:1.0
0,1,0,0.078865,0.0,0.2,8,2,0,1,0,1,1,0,0,1
1,2,0,0.042411,0.0625,0.229091,8,2,0,1,0,1,1,0,0,1
2,3,0,0.068938,0.075,0.378182,8,2,0,1,0,1,1,0,0,1
3,3,0,0.032263,0.106083,0.181818,8,2,0,1,0,1,1,0,0,0
4,1,1,0.045168,0.0,0.141818,8,2,0,1,1,0,1,0,0,1


In [612]:
#tuned models logistic,SVC, XGBoost
clf_log=models['logistic']
clf_svm=models['svm']
clf_xgb=models['xgb']
clf_rf=models['rf']

In [613]:
#Fitted Model
clf_log.fit(final_x,final_y)
clf_svm.fit(final_x,final_y)
clf_xgb.fit(final_x,final_y)
clf_rf.fit(final_x,final_y)

RandomForestClassifier(criterion='entropy', n_estimators=150)

In [614]:
sample['log']=clf_log.predict_proba(test)[:,1]
sample['svm']=clf_svm.predict_proba(test)[:,1]
sample['xgb']=clf_xgb.predict_proba(test)[:,1]
sample['rf']=clf_rf.predict_proba(test)[:,1]

In [615]:
sample.head()

Unnamed: 0,Loan_ID,Loan_Status,log,svm,xgb,Loan_prob,rf
0,LP001015,Y,0.79806,0.793078,0.740109,0.777082,0.82
1,LP001022,Y,0.801555,0.793095,0.717269,0.77064,0.6
2,LP001031,Y,0.805005,0.79314,0.705812,0.767986,0.666667
3,LP001035,Y,0.671379,0.793004,0.780198,0.748194,0.753333
4,LP001051,Y,0.773141,0.792997,0.723939,0.763359,0.426667


In [616]:
sample['Loan_prob']=sample[['log','svm','xgb','rf']].mean(axis=1)

In [617]:
sample.head()

Unnamed: 0,Loan_ID,Loan_Status,log,svm,xgb,Loan_prob,rf
0,LP001015,Y,0.79806,0.793078,0.740109,0.787812,0.82
1,LP001022,Y,0.801555,0.793095,0.717269,0.72798,0.6
2,LP001031,Y,0.805005,0.79314,0.705812,0.742656,0.666667
3,LP001035,Y,0.671379,0.793004,0.780198,0.749478,0.753333
4,LP001051,Y,0.773141,0.792997,0.723939,0.679186,0.426667


In [618]:
sample['Loan_Status']=np.where(sample['Loan_prob']>0.5,'Y','N')

In [611]:
sample.head()

Unnamed: 0,Loan_ID,Loan_Status,log,svm,xgb,Loan_prob
0,LP001015,Y,0.79806,0.793078,0.740109,0.777082
1,LP001022,Y,0.801555,0.793095,0.717269,0.77064
2,LP001031,Y,0.805005,0.79314,0.705812,0.767986
3,LP001035,Y,0.671379,0.793004,0.780198,0.748194
4,LP001051,Y,0.773141,0.792997,0.723939,0.763359


In [621]:
final_df=sample[['Loan_ID','Loan_Status']]

In [632]:
final_df.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y


In [638]:
final_df.to_csv('final_df.csv',index=False)