**Implement the 10 most important binary classification algorithms & check their performance**

* Naive Bayes
* Logistic Regression
* K-Nearest Neighbours
* Support Vector Machine
* Decision Tree
* Bagging Decision Tree (Ensemble Learning I)
* Boosted Decision Tree (Ensemble Learning II)
* Random Forest (Ensemble Learning III)
* Voting Classification (Ensemble Learning IV)
* Deep Learning with a neural network

### Standard Libraries

In [None]:
# standard library
import datetime
import os
import warnings

# numerical / data-frame stack
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# silence warnings before the remaining libraries are imported
warnings.simplefilter('ignore')

# modelling / reporting helpers
from sklearn.model_selection import train_test_split
from tabulate import tabulate

# collects one [model name, test score] row per model for the summary table
Table = list()

### Load Data

In [None]:
# paths
TRAIN_PATH = '/kaggle/input/playground-series-s3e12/train.csv'
TEST_PATH = '/kaggle/input/playground-series-s3e12/test.csv'
SUBMISSION_PATH = '/kaggle/input/playground-series-s3e12/sample_submission.csv'

# submission template (one row per test sample is expected when predictions
# are assigned to it at the end of the notebook)
submit_df = pd.read_csv(SUBMISSION_PATH)

# training data: drop the id column, discard incomplete rows, renumber the index
train_df = (
    pd.read_csv(TRAIN_PATH)
    .drop(columns='id')
    .dropna()
    .reset_index(drop=True)
)

# test data: drop the id column but KEEP every row. Dropping rows with missing
# values here would make the prediction vector shorter than submit_df and break
# the assignment of predictions to the submission. Missing numeric values are
# filled with the corresponding training-column median instead; labels in the
# median Series that are not test columns (e.g. 'target') are simply ignored.
test_df = pd.read_csv(TEST_PATH).drop(columns='id')
test_df = (
    test_df
    .fillna(train_df.median(numeric_only=True))
    .reset_index(drop=True)
)

# view
print(f"Training data shape: {train_df.shape}")

In [None]:
# predictors are the first six columns; the label is kept as a one-column frame
X = train_df.iloc[:, 0:6]
Y = train_df.loc[:, ['target']]

# hold out part of the data for evaluation (fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=0,
)
print(f"x_train shape: {x_train.shape} | x_test shape: {x_test.shape}")

### Naive Bayes

In [None]:
%%time

# import the library
from sklearn.naive_bayes import MultinomialNB

# instantiate & fit
mnb = MultinomialNB().fit(x_train, y_train)
print("score on test: " + str(mnb.score(x_test, y_test)))
Table.append(['Naive Bayes', mnb.score(x_test, y_test)])

### Logistic Regression


In [None]:
%%time

# import the library
from sklearn.linear_model import LogisticRegression

# instantiate & fit
lr=LogisticRegression(max_iter=5000)
lr.fit(x_train, y_train)
print("score on test: " + str(lr.score(x_test, y_test)))
Table.append(['Logistic Regression', lr.score(x_test, y_test)])

In [None]:
%%time

# import the library
from sklearn.linear_model import SGDClassifier

# instantiate & fit
sgd=SGDClassifier()
sgd.fit(x_train, y_train)
print("score on test: " + str(sgd.score(x_test, y_test)))
Table.append(['SGDClassifier', sgd.score(x_test, y_test)])

### K-Nearest Neighbours

In [None]:
%%time

# import the library
from sklearn.neighbors import KNeighborsClassifier

# instantiate & fit
knn = KNeighborsClassifier(algorithm = 'brute', n_jobs=-1)
knn.fit(x_train, y_train)
print("score on test: " + str(knn.score(x_test, y_test)))
Table.append(['KNN', knn.score(x_test, y_test)])

### Support Vector Machine

In [None]:
%%time

# import the library
from sklearn.svm import LinearSVC

# instantiate & fit
svm=LinearSVC(C=0.0001)
svm.fit(x_train, y_train)
print("score on test: " + str(svm.score(x_test, y_test)))
Table.append(['SVM', svm.score(x_test, y_test)])

### Decision Tree

In [None]:
%%time

# import the library
from sklearn.tree import DecisionTreeClassifier

# instantiate & fit
clf = DecisionTreeClassifier(min_samples_split=10,max_depth=3)
clf.fit(x_train, y_train)
print("score on test: "  + str(clf.score(x_test, y_test)))
Table.append(['Decision Tree', clf.score(x_test, y_test)])

### Bagging Decision Tree

In [None]:
%%time

# import the library
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# instantiate & fit
bg=BaggingClassifier(DecisionTreeClassifier(min_samples_split=10,max_depth=3),max_samples=0.5,max_features=1.0,n_estimators=10)
bg.fit(x_train, y_train)
print("score on test: " + str(bg.score(x_test, y_test)))
Table.append(['Bagging Decision Tree', bg.score(x_test, y_test)])

### Boosting Decision Tree

In [None]:
%%time

# import the library
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# instantiate & fit
adb = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=100,learning_rate=0.5)
adb.fit(x_train, y_train)
print("score on test: " + str(adb.score(x_test, y_test)))
Table.append(['AdaBoost Classifier', adb.score(x_test, y_test)])

In [None]:
%%time

# import the library
from sklearn.ensemble import GradientBoostingClassifier

# instantiate & fit
gbc = GradientBoostingClassifier(n_estimators=100)
gbc.fit(x_train, y_train)
print("score on test: " + str(gbc.score(x_test, y_test)))
Table.append(['Gradient Boost Classifier', gbc.score(x_test, y_test)])

### Random Forest

In [None]:
%%time

# import the library
from sklearn.ensemble import RandomForestClassifier

# instantiate & fit
rf = RandomForestClassifier(n_estimators=300,max_depth=3)
rf.fit(x_train, y_train)
print("score on test: " + str(rf.score(x_test, y_test)))
Table.append(['Random Forest', rf.score(x_test, y_test)])

### Voting Classifier

In [None]:
%%time

# import the library
from sklearn.ensemble import VotingClassifier

# 1) naive bias = mnb
mnb = MultinomialNB().fit(x_train, y_train)
# 2) logistic regression =lr
lr=LogisticRegression(max_iter=5000)
# 3) random forest =rf
rf = RandomForestClassifier(n_estimators=30,max_depth=3)
# 4) suport vecotr mnachine = svm
svm=LinearSVC(max_iter=5000)
evc=VotingClassifier(estimators=[('mnb',mnb),('lr',lr),('rf',rf),('svm',svm)])
evc.fit(x_train, y_train)

print("score on test: " + str(evc.score(x_test, y_test)))
Table.append(['Voting Classifier', evc.score(x_test, y_test)])


### Deep Learning

In [None]:
%%time

# import the library
from keras import layers
from keras import models
from keras import optimizers
from keras import losses
from keras import regularizers
from keras import metrics
from tqdm.keras import TqdmCallback

# add validation dataset
validation_split=100
x_validation=X[:validation_split]
x_partial_train=X[validation_split:]
y_validation=Y[:validation_split]
y_partial_train=Y[validation_split:]

# build & compile model
model=models.Sequential()
model.add(layers.Dense(4,kernel_regularizer=regularizers.l2(0.003),activation='relu',input_shape=(6,)))
model.add(layers.Dropout(0.7))
model.add(layers.Dense(4,kernel_regularizer=regularizers.l2(0.003),activation='relu'))
model.add(layers.Dropout(0.7))
model.add(layers.Dense(1,activation='sigmoid'))
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])

# fir the model
model.fit(x_partial_train,y_partial_train,epochs=100,batch_size=512,validation_data=(x_validation,y_validation), callbacks=[TqdmCallback(verbose=0)],verbose=0)

print('')
print("score on test: " + str(model.evaluate(x_test,y_test)[1]))
Table.append(['Neural Network', model.evaluate(x_test,y_test)[1]])

In [None]:
# render the collected [model, score] rows as a formatted table
summary_table = tabulate(
    Table,
    headers=["Model", "Score"],
    tablefmt='fancy_outline',
)
print(summary_table)

**Decision Tree Classifier seems to be giving the best result.**

### Fine-Tune Decision Tree Classifier

In [None]:
# fine-tuning the Decision Tree Classifier
from sklearn.model_selection import GridSearchCV

# instantiate
dtc = DecisionTreeClassifier()

# define parameter grid.
param_grid = [{'min_samples_split': [5, 10, 15, 20], 'max_depth': [3, 6, 9, 12]}]

# define grid-search
# Candidates are ranked by plain accuracy, the same metric .score() reports in
# the rest of the notebook. "top_k_accuracy" uses k=2 by default, which on a
# two-class target counts every prediction as a hit, so every parameter
# combination would tie at a perfect score and the search could not rank them.
grid_search = GridSearchCV(estimator=dtc,
                          param_grid=param_grid,
                          scoring="accuracy",
                          cv=5,
                          return_train_score=True)

# fit the grid search
grid_search.fit(x_train, y_train)

In [None]:
# take the best model found by the grid search
dtc_tuned = grid_search.best_estimator_

# train it on the training split and report its score on the held-out split
dtc_tuned.fit(x_train, y_train)
tuned_score = dtc_tuned.score(x_test, y_test)
print(f"score on test: {tuned_score}")

In [None]:
# predict class labels for the competition TEST data
# NOTE(review): this writes hard labels; if the competition is scored on
# probabilities (e.g. ROC AUC), predict_proba may be the better choice - confirm
result = dtc_tuned.predict(test_df)

# store the predictions in the submission frame
submit_df['target'] = result

# today's date formatted as YYYY_MM_DD, used as the file-name prefix
datestamp = datetime.date.today().strftime('%Y_%m_%d')

# write the submission file without the index column
submit_df.to_csv(f"{datestamp}_submission.csv", index=False)