In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings

# Silence every warning category for the remainder of the session.
# NOTE(review): this blanket filter also hides library warnings that may be
# worth reading (e.g. from scikit-learn or pandas) - confirm it is intended.
warnings.filterwarnings('ignore')
# Remove pandas' display limits so frames are rendered with all columns and rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
# Load the pre-split feature matrices and targets from CSV files in the
# working directory.
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
# Targets are squeezed from a one-column DataFrame to a 1-D Series:
# scikit-learn estimators expect y with shape (n_samples,) and otherwise emit
# a DataConversionWarning on every fit (hidden here by the global warning
# filter). squeeze('columns') leaves a multi-column frame unchanged.
y_train = pd.read_csv('y_train.csv').squeeze('columns')
y_test = pd.read_csv('y_test.csv').squeeze('columns')

# Model creation

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, f1_score, recall_score

In [8]:
# Baseline candidates, all with default hyperparameters.
# Estimators whose training involves randomness get a fixed random_state so
# that the reported scores are repeatable across kernel restarts.
models = {
    'Logistic': LogisticRegression(),
    'SVC': SVC(),
    'Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Ada Boost': AdaBoostClassifier(random_state=42),
    'Gradient Boost': GradientBoostingClassifier(random_state=42),
    'XG Boost': XGBClassifier(random_state=42)
}

In [10]:
# Fit every model in `models` on the training split, then print accuracy, F1
# and recall (scikit-learn default settings) for the train and test splits.
# Iterating over items() avoids rebuilding the key/value lists on every pass.
for name, model in models.items():
    model.fit(X_train, y_train)

    # Model prediction
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Train set performance
    train_as = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)

    # Test set performance
    test_as = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)

    print(name)
    print('\t-Train accuracy score: ', train_as)
    print('\t-Train f1 score: ', train_f1)
    print('\t-Train recall score: ', train_recall, '\n')

    print('\t-Test accuracy score: ', test_as)
    print('\t-Test f1 score: ', test_f1)
    print('\t-Test recall score: ', test_recall)

    # Visual separator between models
    print('*'*15)

Logistic
	-Train accuracy score:  0.8119266055045872
	-Train f1 score:  0.812785388127854
	-Train recall score:  0.8165137614678899 

	-Test accuracy score:  0.8571428571428571
	-Test f1 score:  0.9113924050632912
	-Test recall score:  0.9473684210526315
***************
SVC
	-Train accuracy score:  0.8944954128440367
	-Train f1 score:  0.8866995073891627
	-Train recall score:  0.8256880733944955 

	-Test accuracy score:  0.8979591836734694
	-Test f1 score:  0.9333333333333332
	-Test recall score:  0.9210526315789473
***************
Naive Bayes
	-Train accuracy score:  0.7660550458715596
	-Train f1 score:  0.7243243243243244
	-Train recall score:  0.6146788990825688 

	-Test accuracy score:  0.7755102040816326
	-Test f1 score:  0.8493150684931505
	-Test recall score:  0.8157894736842105
***************
KNN
	-Train accuracy score:  0.9541284403669725
	-Train f1 score:  0.9519230769230769
	-Train recall score:  0.908256880733945 

	-Test accuracy score:  0.9183673469387755
	-Test f1 score

# Hyperparameter tuning

In [78]:
# Candidate hyperparameter values, one search space per model family.
# Each mapping goes from an estimator keyword argument to the list of values
# that the search may draw from.

# K-nearest neighbours
param_grid_knn = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    leaf_size=[10, 30, 50],
    p=[1, 2],
)

# Random forest
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],
)

# Gradient boosting (scikit-learn)
param_grid_gb = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    subsample=[0.8, 1.0],
)

# XGBoost
param_grid_xgb = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_child_weight=[1, 3, 5],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

In [80]:
# (display name, estimator, search space) triples consumed by the
# hyperparameter search. Estimators whose training involves randomness are
# seeded so that cross-validation scores are repeatable between runs.
random_cv_model = [
    ('KNN', KNeighborsClassifier(), param_grid_knn),
    ('Random Forest', RandomForestClassifier(random_state=42), param_grid_rf),
    ('Gradient Boost', GradientBoostingClassifier(random_state=42), param_grid_gb),
    ('XG Boost', XGBClassifier(random_state=42), param_grid_xgb)
]

In [82]:
from sklearn.model_selection import RandomizedSearchCV

# Run a randomized hyperparameter search (100 sampled candidates, 3-fold CV)
# for every entry of `random_cv_model` and keep the best parameter set found
# for each model name.
model_params = {}
for name, model, param in random_cv_model:
    randomized = RandomizedSearchCV(estimator=model, param_distributions=param,
                                    n_iter=100,
                                    cv=3,
                                    verbose=2,
                                    n_jobs=1,
                                    # Fixed seed: the same candidates are sampled on every run.
                                    random_state=42)
    randomized.fit(X_train, y_train)
    model_params[name] = randomized.best_params_

# Report once, after all searches have finished. Printing inside the search
# loop repeated the earlier models' results after every search and reused the
# outer loop variable `name`.
for model_name, best_params in model_params.items():
    print(f"---------- Best params of Model {model_name} ----------")
    print(best_params)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END algorithm=ball_tree, leaf_size=30, n_neighbors=3, p=2, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, leaf_size=30, n_neighbors=3, p=2, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, leaf_size=30, n_neighbors=3, p=2, weights=distance; total time=   0.0s
[CV] END algorithm=brute, leaf_size=30, n_neighbors=5, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=brute, leaf_size=30, n_neighbors=5, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=brute, leaf_size=30, n_neighbors=5, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, leaf_size=10, n_neighbors=9, p=2, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, leaf_size=10, n_neighbors=9, p=2, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, leaf_size=10, n_neighbors=9, p=2, weights=distance; total time=   0.0s
[CV] END algorithm=auto, leaf_size=10, n

In [84]:
# Candidates re-created with hard-coded hyperparameters (these replace the
# earlier baseline `models` dict). Estimators whose training involves
# randomness are seeded so that the reported scores are repeatable.
models = {
    'KNN': KNeighborsClassifier(weights='uniform', p=1, n_neighbors=3, leaf_size=30, algorithm='brute'),

    'Random Forest': RandomForestClassifier(n_estimators=100, min_samples_split=5, min_samples_leaf=1,
                                            max_features='sqrt', max_depth=30, criterion='gini', bootstrap=True,
                                            random_state=42),

    'Gradient Boost': GradientBoostingClassifier(subsample=0.8, n_estimators=200, min_samples_split=10,
                                                 min_samples_leaf=1, max_depth=3, learning_rate=0.2,
                                                 random_state=42),

    'XG Boost': XGBClassifier(subsample=0.8, n_estimators=200, min_child_weight=1,
                              max_depth=3, learning_rate=0.1, gamma=0.1, colsample_bytree=1.0,
                              random_state=42)
}

In [86]:
# Fit every model currently in `models` on the training split, then print
# accuracy, F1 and recall (scikit-learn default settings) for both splits.
# Iterating over items() avoids rebuilding the key/value lists on every pass.
for name, model in models.items():
    model.fit(X_train, y_train)

    # Model prediction
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Train set performance
    train_as = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)

    # Test set performance
    test_as = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)

    print(name)
    print('\t-Train accuracy score: ', train_as)
    print('\t-Train f1 score: ', train_f1)
    print('\t-Train recall score: ', train_recall, '\n')

    print('\t-Test accuracy score: ', test_as)
    print('\t-Test f1 score: ', test_f1)
    print('\t-Test recall score: ', test_recall)

    # Visual separator between models
    print('*'*15)

KNN
	-Train accuracy score:  0.9724770642201835
	-Train f1 score:  0.9716981132075472
	-Train recall score:  0.944954128440367 

	-Test accuracy score:  0.9387755102040817
	-Test f1 score:  0.958904109589041
	-Test recall score:  0.9210526315789473
***************
Random Forest
	-Train accuracy score:  1.0
	-Train f1 score:  1.0
	-Train recall score:  1.0 

	-Test accuracy score:  0.9183673469387755
	-Test f1 score:  0.9487179487179489
	-Test recall score:  0.9736842105263158
***************
Gradient Boost
	-Train accuracy score:  1.0
	-Train f1 score:  1.0
	-Train recall score:  1.0 

	-Test accuracy score:  0.9183673469387755
	-Test f1 score:  0.9487179487179489
	-Test recall score:  0.9736842105263158
***************
XG Boost
	-Train accuracy score:  1.0
	-Train f1 score:  1.0
	-Train recall score:  1.0 

	-Test accuracy score:  0.9183673469387755
	-Test f1 score:  0.9487179487179489
	-Test recall score:  0.9736842105263158
***************


# Select KNN for the final prediction

In [91]:
# Build the selected KNN classifier, train it on the training split and
# predict labels for the test features.
knn = KNeighborsClassifier(
    n_neighbors=3,
    weights='uniform',
    algorithm='brute',
    leaf_size=30,
    p=1,
)
# fit() returns the estimator itself, so training and prediction can be chained.
final_data = knn.fit(X_train, y_train).predict(X_test)
final_data

array([1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 0, 0])

In [95]:
# Wrap the predicted labels in a single-column frame named 'status' and
# preview the first rows.
final_df = pd.DataFrame({'status': final_data})
final_df.head()

Unnamed: 0,status
0,1
1,0
2,1
3,1
4,1


In [97]:
# Write the predictions to 'final_status.csv' without the index column.
final_df.to_csv('final_status.csv', index=False)