###### Importing External Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report,confusion_matrix,log_loss

###### Loading Train Data

In [2]:
# Read the training CSV; the file's first column is used as the row index.
train_data = pd.read_csv(
    filepath_or_buffer='../Binary Classification/dataset/training_set.csv',
    index_col=0,
)
print('Train data shape: ', train_data.shape)
# Last expression of the cell: preview of the first five rows.
train_data.head(n=5)

Train data shape:  (3910, 58)


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X49,X50,X51,X52,X53,X54,X55,X56,X57,Y
0,0.0,0.0,4.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.342,0.0,0.0,1.2,2,12,0
1,0.0,0.56,0.56,0.0,1.12,0.56,2.25,0.0,0.0,0.56,...,0.0,0.083,0.0,0.503,0.0,0.083,16.304,148,375,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,5,0
3,0.64,0.0,0.64,0.0,1.93,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.462,0.37,0.0,2.44,22,122,1
4,0.58,0.0,0.0,35.46,0.58,0.0,0.58,0.58,0.0,0.0,...,0.0,0.0,0.0,0.239,0.239,0.0,3.338,123,207,1


###### Loading Test Data

In [3]:
# Read the test CSV (no 'Y' column in the preview below); the file's first
# column is used as the row index.

test_data = pd.read_csv(
    filepath_or_buffer='../Binary Classification/dataset/test_set.csv',
    index_col=0,
)
print('Test data shape:', test_data.shape)
# Last expression of the cell: preview of the first five rows.
test_data.head(n=5)

Test data shape: (691, 57)


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X48,X49,X50,X51,X52,X53,X54,X55,X56,X57
0,0.7,0.0,0.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.105,0.0,0.0,2.342,47,89
1,0.0,0.0,0.84,0.0,0.84,0.0,0.84,0.0,0.0,0.0,...,0.0,0.0,0.388,0.0,0.776,0.129,0.0,10.375,168,249
2,0.46,0.3,0.46,0.0,0.05,0.12,0.05,0.28,0.43,0.74,...,0.0,0.0,0.065,0.0,0.325,0.756,0.153,5.891,193,3040
3,0.1,0.2,1.01,0.0,0.8,0.8,0.5,0.0,0.8,0.1,...,0.0,0.0,0.11,0.0,0.49,0.158,0.015,8.55,669,1351
4,0.0,0.0,0.72,0.0,0.72,0.0,0.72,0.0,0.0,0.0,...,0.0,0.0,0.364,0.0,0.729,0.121,0.0,7.781,32,249


###### Defining the explanatory features and target variable

In [4]:
# The target is the 'Y' column; every remaining column is a model input.
y = train_data['Y']
X = train_data.drop(columns=['Y'])

###### Based on the problem statement, we will split the data into train and validation sets using an 80/20 ratio.

In [5]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

###### Feature Selection based on Feature Importance

In [6]:
### Feature selection using RandomForestClassifier
# Fit a 100-tree, unlimited-depth forest on the training split only and keep
# its feature importances (one value per column of X_train).
# random_state is pinned (same seed as the train/validation split) so that the
# importance ranking -- and therefore the feature subset used by every later
# cell -- is identical on each Restart & Run All. Without it the forest's
# bootstrap and split sampling differ between runs and the selected
# features can change.

classifier = RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=1,
                                    random_state=42)
classifier.fit(X_train, y_train)
feature_imp = classifier.feature_importances_

In [7]:
# Pair each column name with its importance, then order the pairs from most
# to least important (stable sort, so ties keep their column order).
importances = list(zip(X.columns, feature_imp))
importances.sort(key=lambda pair: pair[1], reverse=True)
# Keep only the names of the 30 highest-ranked features.
top_features = [name for name, _score in importances[:30]]

In [8]:
# Restrict the training and validation splits to the selected feature columns.
X_train_final, X_test_final = (split[top_features] for split in (X_train, X_test))

In [9]:
# Apply the same column selection to the test data (all rows, selected columns).
test_data_final = test_data.loc[:, top_features]

###### Based on previous analysis, the selected model is the XGBoost Classifier
##### Training the model on the training set and predicting on the train and validation sets

In [10]:
# XGBoost model: fitted on the selected features of the training split, then
# used to produce hard labels and probabilities for both the training and the
# validation split.

cls = XGBClassifier(
    # ensemble size and step size
    n_estimators=500,
    learning_rate=0.15,
    # tree complexity / regularisation
    max_depth=5,
    gamma=10,
    reg_alpha=0.3,
    # row and column sampling fractions
    subsample=1,
    colsample_bytree=1,
    # runtime and evaluation settings
    n_jobs=2,
    eval_metric='logloss',
    use_label_encoder=False,
)

cls.fit(X_train_final, y_train)

# Hard class labels for the training and validation splits.
y_train_pred, y_test_pred = cls.predict(X_train_final), cls.predict(X_test_final)

# Column index 1 of predict_proba (presumably the Y = 1 class -- confirm
# against cls.classes_).
y_train_probab, y_test_probab = (
    cls.predict_proba(X_train_final)[:, 1],
    cls.predict_proba(X_test_final)[:, 1],
)


In [11]:
## Log-loss of the predicted probabilities (training split, then validation split)
print(f'Train Log-loss: {log_loss(y_train, y_train_probab)}')
print(f'Validation Log-loss {log_loss(y_test, y_test_probab)}')

## Two blank separator lines, then ROC AUC on the same probabilities
print(end='\n\n')
print(f'Train AUC Score: {roc_auc_score(y_train, y_train_probab)}')
print(f'Validation AUC Score: {roc_auc_score(y_test, y_test_probab)}')

Train Log-loss: 0.14277005066191228
Validation Log-loss 0.15530222387927228


Train AUC Score: 0.9894357064248345
Validation AUC Score: 0.9879204525972913


###### Predictions on Test Data

In [12]:
# Predicting on Test Data
# Class labels from the fitted XGBoost model for every row of the
# feature-selected test set; the whole label array is printed.

test_pred = cls.predict(test_data_final)
print(test_pred)

[0 1 1 1 1 0 1 1 1 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0
 0 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 1 1 1 0 0 0 1 1 0 0 0 0 1 1 1 0 0 0 1 0 1
 1 1 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 0 1 1 0 0 0 0 1 0
 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 1 0 1 0 0 0 0 0 1 1 0
 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 1 0 0 0 1 0
 0 1 0 0 1 1 0 1 1 1 1 0 0 0 1 0 1 0 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1
 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 1 1
 1 0 0 1 0 1 0 0 0 1 0 0 1 1 0 1 1 1 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 1 1 0
 0 0 0 1 0 1 0 0 0 0 0 1 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 1
 0 1 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1
 1 1 0 0 1 0 0 1 1 1 1 1 0 0 0 0 1 0 0 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1 1 1 0
 1 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 1 0
 0 0 0 1 1 0 1 1 1 0 0 1 0 0 1 1 0 0 1 1 0 1 0 0 1 1 0 0 1 0 0 1 1 1 1 1 0
 0 1 0 0 0 0 0 0 0 0 1 0 

In [13]:
# Column index 1 of predict_proba for each test row (presumably the
# probability of class Y = 1 -- confirm against cls.classes_); the whole
# array is printed.
test_probab= cls.predict_proba(test_data_final)[:,1]
print(test_probab)

[0.04033171 0.97785157 0.98079556 0.9691561  0.97785157 0.20852894
 0.97279274 0.73098963 0.95876235 0.9805946  0.01199517 0.06750625
 0.22864741 0.01544631 0.07676392 0.11900955 0.02213419 0.13877682
 0.96227586 0.02222986 0.9739934  0.97767943 0.9691878  0.01279566
 0.73592615 0.07698447 0.13139409 0.85809475 0.07698447 0.01394515
 0.9033491  0.05301218 0.01544631 0.04568623 0.01479496 0.04990736
 0.4979711  0.03954214 0.5295856  0.49401128 0.98281235 0.10667392
 0.13877682 0.09131541 0.04722621 0.9440237  0.08039155 0.0202093
 0.03212925 0.9776448  0.88717324 0.15313905 0.0133681  0.9235822
 0.975184   0.97291476 0.02346839 0.02158874 0.37456363 0.9581312
 0.97917295 0.1366199  0.33742198 0.24190742 0.29049578 0.9568
 0.9196574  0.7921646  0.12864298 0.04568623 0.02074446 0.8677428
 0.04568623 0.57246137 0.93341833 0.9678928  0.09657373 0.01244521
 0.33559394 0.04568623 0.07504966 0.94459915 0.929011   0.96571946
 0.04568623 0.01544631 0.08297646 0.04568623 0.96464837 0.01668686
 0.