###### Importing external libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score,log_loss
from prettytable import PrettyTable

###### Loading Train and Test Sets

In [3]:
# Loading train data
train_data= pd.read_csv('../Binary Classification/dataset/training_set.csv', index_col=0)

# Loading test data
test_data = pd.read_csv('../Binary Classification/dataset/test_set.csv', index_col=0)

###### Splitting the data into X and Y values

In [4]:
# Splitting the data
X = train_data.drop(['Y'], axis=1)
y = train_data['Y']

###### Splitting the data into train and validation sets

In [5]:
# Splitting into Train -Test Test
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.2,stratify=y, random_state=42)

###### Seelcting features using Random Forest Classifier

In [6]:
# Feature Selection using Random Forest Classifier

classifier = RandomForestClassifier(100, max_depth=None, n_jobs=1)
classifier.fit(X_train, y_train)
feature_imp = classifier.feature_importances_

# Feature ranking based on importances
imp = sorted(zip(X.columns,feature_imp), key=lambda x: x[1], reverse=True)

# Extracting top 40 features
top_features = [x[0] for x in imp[:40]]

#Selecting top features from data

X_train_final = X_train[top_features]
X_test_final = X_test[top_features]


###### Normalizing the data using StandardScaler

In [7]:
# Normalizing Data using Standard Scaler
scaler = StandardScaler()
scaler.fit(X_train_final)

# Transorming the data
X_train_Final = pd.DataFrame(scaler.transform(X_train_final), columns=X_train_final.columns)
X_test_Final = pd.DataFrame(scaler.transform(X_test_final), columns=X_test_final.columns)


###### Using PrettyTable to print model performances
In order to use prettytable, open anaconda prompt in administrator mode and- 

1. pip install prettytable 

In [8]:
# Using table to print model performance
table = PrettyTable()
table.field_names = ['Model', 'Train Log-loss', 'Validation Log-Loss', 'Train AUC', 'Validation AUC']

### MODEL TRAINING 

In [9]:
## Training Models

# 1. Random Model
y_train_probab = np.random.rand(len(X_train_Final))
y_test_probab = np.random.rand(len(X_test_Final))

table.add_row(['Random', log_loss(y_train, y_train_probab), log_loss(y_test,y_test_probab), roc_auc_score(y_train, y_train_probab), roc_auc_score(y_test, y_test_probab)])

In [10]:
# 2. Logistic Regression

cls = LogisticRegression(C=1, penalty='l2', max_iter=250, random_state=42)
cls.fit(X_train_Final, y_train)

y_train_pred = cls.predict(X_train_Final)
y_train_probab = cls.predict_proba(X_train_Final)[:,1]

y_test_pred = cls.predict(X_test_Final)
y_test_probab = cls.predict_proba(X_test_Final)[:,1]

table.add_row(['Logistic Regression', log_loss(y_train, y_train_probab), log_loss(y_test,y_test_probab), roc_auc_score(y_train, y_train_probab), roc_auc_score(y_test, y_test_probab)])


In [11]:
# 3. Naive Bayes

cls = GaussianNB()
cls.fit(X_train_Final, y_train)

y_train_pred = cls.predict(X_train_Final)
y_train_probab = cls.predict_proba(X_train_Final)[:,1]

y_test_pred = cls.predict(X_test_Final)
y_test_probab = cls.predict_proba(X_test_Final)[:,1]

table.add_row(['Naive Bayes', log_loss(y_train, y_train_probab), log_loss(y_test,y_test_probab), roc_auc_score(y_train, y_train_probab), roc_auc_score(y_test, y_test_probab)])

In [12]:
# 4. Support Vector Machine

cls = LinearSVC(penalty='l2', max_iter=250, random_state=42, tol=1e-5)
cls.fit(X_train_final, y_train)

y_train_pred = cls.predict(X_train_final)
y_train_probab = cls._predict_proba_lr(X_train_final)[:,1]

y_test_pred = cls.predict(X_test_final)
y_test_probab = cls._predict_proba_lr(X_test_final)[:,1]

table.add_row(['Support Vector Machine', log_loss(y_train, y_train_probab), log_loss(y_test,y_test_probab), roc_auc_score(y_train, y_train_probab), roc_auc_score(y_test, y_test_probab)])



In [13]:
# 5. K-Nearest Neighbors

cls = KNeighborsClassifier()
cls.fit(X_train_final, y_train)

y_train_pred = cls.predict(X_train_final)
y_train_probab = cls.predict_proba(X_train_final)[:,1]

y_test_pred = cls.predict(X_test_final)
y_test_probab = cls.predict_proba(X_test_final)[:,1]

table.add_row(['K-Nearest Neighbors', log_loss(y_train, y_train_probab), log_loss(y_test,y_test_probab), roc_auc_score(y_train, y_train_probab), roc_auc_score(y_test, y_test_probab)])

In [14]:
# 6. Decision Tree

cls = DecisionTreeClassifier(criterion='gini', min_samples_split=3, random_state=42)
cls.fit(X_train_Final, y_train)

y_train_pred = cls.predict(X_train_Final)
y_train_probab = cls.predict_proba(X_train_Final)[:,1]

y_test_pred = cls.predict(X_test_Final)
y_test_probab = cls.predict_proba(X_test_Final)[:,1]

table.add_row(['Decision Tree', log_loss(y_train, y_train_probab), log_loss(y_test,y_test_probab), roc_auc_score(y_train, y_train_probab), roc_auc_score(y_test, y_test_probab)])


In [15]:
# 7. Random Forest

cls = RandomForestClassifier(n_estimators=500,
                             max_depth=None,
                             min_samples_split=3,
                             n_jobs=1,
                             class_weight='balanced',
                             random_state=42)
cls.fit(X_train_Final, y_train)

y_train_pred = cls.predict(X_train_Final)
y_train_probab = cls.predict_proba(X_train_Final)[:,1]

y_test_pred = cls.predict(X_test_Final)
y_test_probab = cls.predict_proba(X_test_Final)[:,1]

table.add_row(['Random Forest', log_loss(y_train, y_train_probab), log_loss(y_test,y_test_probab), roc_auc_score(y_train, y_train_probab), roc_auc_score(y_test, y_test_probab)])


In [16]:
# 8. XGBoost

cls = XGBClassifier(n_estimators=500,
                    max_depth=5,
                    learning_rate=0.15,
                    colsample_bytree=1,
                    subsample=1,
                    reg_alpha=0.3,
                    gamma=10,
                    n_jobs=2,
                    eval_metric='logloss',
                    use_label_encoder=False)

cls.fit(X_train_Final, y_train)

y_train_pred = cls.predict(X_train_Final)
y_train_probab = cls.predict_proba(X_train_Final)[:,1]

y_test_pred = cls.predict(X_test_Final)
y_test_probab = cls.predict_proba(X_test_Final)[:,1]

table.add_row(['XG Boost', log_loss(y_train, y_train_probab), log_loss(y_test,y_test_probab), roc_auc_score(y_train, y_train_probab), roc_auc_score(y_test, y_test_probab)])


In [17]:
print(table)

+------------------------+----------------------+---------------------+---------------------+--------------------+
|         Model          |    Train Log-loss    | Validation Log-Loss |      Train AUC      |   Validation AUC   |
+------------------------+----------------------+---------------------+---------------------+--------------------+
|         Random         |  1.0144442347521994  |  0.9942432390814736 | 0.49277414580838724 | 0.5141162352134407 |
|  Logistic Regression   | 0.20343928905281705  | 0.31541725743815563 |  0.975182280848196  | 0.9613029315960911 |
|      Naive Bayes       |  2.0139335434506673  |  2.521361516681172  |  0.9553053405169586 | 0.9353334476255786 |
| Support Vector Machine |  0.5782344521164527  |  0.5679838520005609 |  0.8519897090151582 | 0.8512669295388308 |
|  K-Nearest Neighbors   | 0.26583437688099887  |  1.6390116128831518 |  0.9478734865662863 |  0.86351105777473  |
|     Decision Tree      | 0.005761453546854756 |  3.5811343475729838 |  0.99996