In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings

# Notebook-wide settings: silence every warning and lift pandas' limits on
# how many rows/columns are rendered when a frame is displayed.
warnings.filterwarnings(action='ignore')
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
# Load the pre-split feature matrices and targets from the working directory.
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')

# read_csv always returns a DataFrame; scikit-learn estimators expect a 1-D
# target, so collapse the label frames to Series.
# NOTE(review): assumes each y_*.csv holds exactly one label column. A frame
# with more than one column is left unchanged by squeeze("columns").
y_train = pd.read_csv('y_train.csv').squeeze('columns')
y_test = pd.read_csv('y_test.csv').squeeze('columns')

# Model Creation

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, recall_score, f1_score

In [9]:
# Baseline candidates, all with default hyperparameters.
# Estimators that draw random numbers while fitting (trees and ensembles) get
# a fixed seed so the scores printed below are reproducible after a kernel
# restart.
models = {
    'Logistic': LogisticRegression(),
    'SVC': SVC(),
    'Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Ada Boost': AdaBoostClassifier(random_state=42),
    'Gradient Boost': GradientBoostingClassifier(random_state=42),
    'XG Boost': XGBClassifier(random_state=42)
}

In [12]:
# Fit every candidate on the training split and print accuracy, F1 and recall
# for both the training and the test split.
# Iterating over items() avoids rebuilding the key/value lists on every pass.
for name, model in models.items():
    model.fit(X_train, y_train)

    # Model prediction
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Train data performance
    train_as = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)

    # Test data performance
    test_as = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)

    print(name)
    print('\t-Train accuracy score: ', train_as)
    print('\t-Train f1 score: ', train_f1)
    print('\t-Train recall score: ', train_recall, '\n')

    print('\t-Test accuracy score: ', test_as)
    print('\t-Test f1 score: ', test_f1)
    print('\t-Test recall score: ', test_recall)

    # Visual separator between models
    print('*'*15)

Logistic
	-Train accuracy score:  0.9589552238805971
	-Train f1 score:  0.9588014981273407
	-Train recall score:  0.9552238805970149 

	-Test accuracy score:  0.965034965034965
	-Test f1 score:  0.954954954954955
	-Test recall score:  0.9814814814814815
***************
SVC
	-Train accuracy score:  0.9794776119402985
	-Train f1 score:  0.9793621013133209
	-Train recall score:  0.9738805970149254 

	-Test accuracy score:  0.9300699300699301
	-Test f1 score:  0.9074074074074074
	-Test recall score:  0.9074074074074074
***************
Naive Bayes
	-Train accuracy score:  0.9197761194029851
	-Train f1 score:  0.9196261682242992
	-Train recall score:  0.917910447761194 

	-Test accuracy score:  0.8951048951048951
	-Test f1 score:  0.8648648648648649
	-Test recall score:  0.8888888888888888
***************
KNN
	-Train accuracy score:  0.9757462686567164
	-Train f1 score:  0.9758812615955472
	-Train recall score:  0.9813432835820896 

	-Test accuracy score:  0.9090909090909091
	-Test f1 score:

# Hyperparameter tuning

In [15]:
# Candidate hyperparameter values sampled for SVC during the randomized search.
svc_params = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    degree=[2, 3, 4],
    gamma=['scale', 'auto'],
    coef0=[0.0, 0.1, 0.5, 1.0],
    shrinking=[True, False],
    probability=[True, False],
    tol=[1e-3, 1e-4, 1e-5],
    class_weight=[None, 'balanced'],
)


# Candidate hyperparameter values sampled for RandomForestClassifier.
rf_params = dict(
    n_estimators=[50, 100, 200, 500],
    criterion=['gini', 'entropy', 'log_loss'],
    max_depth=[None, 10, 20, 30, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],
)


# Candidate hyperparameter values sampled for AdaBoostClassifier.
# NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases —
# confirm the installed version still accepts it.
adaboost_params = dict(
    n_estimators=[50, 100, 200, 500],
    learning_rate=[0.001, 0.01, 0.1, 1],
    algorithm=['SAMME', 'SAMME.R'],
)


# Candidate hyperparameter values sampled for GradientBoostingClassifier.
gb_params = dict(
    n_estimators=[50, 100, 200, 500],
    learning_rate=[0.001, 0.01, 0.1, 0.2],
    max_depth=[3, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    subsample=[0.5, 0.7, 1.0],
    max_features=['sqrt', 'log2', None],
)


# Candidate hyperparameter values sampled for XGBClassifier.
xgb_params = dict(
    n_estimators=[50, 100, 200, 500],
    learning_rate=[0.001, 0.01, 0.1, 0.2],
    max_depth=[3, 5, 10, 20],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0, 0.1, 0.2, 0.3],
    subsample=[0.5, 0.7, 1.0],
    colsample_bytree=[0.5, 0.7, 1.0],
    reg_alpha=[0, 0.1, 0.5, 1],
    reg_lambda=[0, 0.1, 0.5, 1],
)

In [19]:
# (display name, base estimator, search space) triples consumed by the
# randomized search. Every base estimator is seeded so that each sampled
# candidate is fitted deterministically; RandomizedSearchCV clones the
# estimator, which keeps constructor arguments such as random_state.
random_cv_model = [
    ('SVC', SVC(random_state=42), svc_params),
    ('Random Forest', RandomForestClassifier(random_state=42), rf_params),
    ('Ada Boost', AdaBoostClassifier(random_state=42), adaboost_params),
    ('Gradient Boost', GradientBoostingClassifier(random_state=42), gb_params),
    ('XG Boost', XGBClassifier(random_state=42), xgb_params)
]

In [21]:
from sklearn.model_selection import RandomizedSearchCV

# Run one randomized hyperparameter search per (name, estimator, grid) entry
# and keep the best parameter set found for each model.
model_params = {}
for name, model, params in random_cv_model:
    randomize = RandomizedSearchCV(estimator=model, param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=1,
                                   random_state=42)  # fixed seed: same candidates are sampled on every run
    randomize.fit(X_train, y_train)
    model_params[name] = randomize.best_params_

# Print the summary once, after all searches have finished. Previously this
# loop sat inside the search loop, so it re-printed every earlier result after
# each model and reused the outer loop variable `name`.
for tuned_name, best_params in model_params.items():
    print(f"--------- Best params of model {tuned_name} ----------")
    print(best_params)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END C=10, class_weight=balanced, coef0=0.0, degree=3, gamma=auto, kernel=poly, probability=False, shrinking=False, tol=1e-05; total time=   0.0s
[CV] END C=10, class_weight=balanced, coef0=0.0, degree=3, gamma=auto, kernel=poly, probability=False, shrinking=False, tol=1e-05; total time=   0.0s
[CV] END C=10, class_weight=balanced, coef0=0.0, degree=3, gamma=auto, kernel=poly, probability=False, shrinking=False, tol=1e-05; total time=   0.0s
[CV] END C=10, class_weight=balanced, coef0=1.0, degree=3, gamma=auto, kernel=poly, probability=False, shrinking=False, tol=1e-05; total time=   0.0s
[CV] END C=10, class_weight=balanced, coef0=1.0, degree=3, gamma=auto, kernel=poly, probability=False, shrinking=False, tol=1e-05; total time=   0.0s
[CV] END C=10, class_weight=balanced, coef0=1.0, degree=3, gamma=auto, kernel=poly, probability=False, shrinking=False, tol=1e-05; total time=   0.0s
[CV] END C=0.1, class_weight=balanced

In [25]:
# Candidates rebuilt with hard-coded hyperparameters (presumably the best
# parameters reported by the randomized search — verify against its output).
# random_state is fixed on every estimator so the comparison below is
# reproducible; SVC uses it because probability=True is set.
models = {
    'SVC': SVC(tol=0.0001, shrinking=False, probability=True, kernel='rbf', gamma='auto', degree=3,
               coef0=0.1, class_weight=None, C=1, random_state=42),

    'Random Forest': RandomForestClassifier(n_estimators=500, min_samples_split=5, min_samples_leaf=1,
                                            max_features='log2', max_depth=30, criterion='entropy', bootstrap=True,
                                            random_state=42),

    # NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases —
    # confirm the installed version still accepts it.
    'Ada Boost': AdaBoostClassifier(n_estimators=200, learning_rate=0.1, algorithm='SAMME.R',
                                    random_state=42),

    'Gradient Boost': GradientBoostingClassifier(subsample=0.5, n_estimators=50, min_samples_split=2, min_samples_leaf=1,
                                                 max_features='sqrt', max_depth=10, learning_rate=0.001,
                                                 random_state=42),

    'XG Boost': XGBClassifier(subsample=0.7, reg_lambda=0, reg_alpha=0.1, n_estimators=200, min_child_weight=1,
                              max_depth=10, learning_rate=0.2, gamma=0.1, colsample_bytree=0.7,
                              random_state=42)
}

In [27]:
# Refit every model in `models` on the training split and print accuracy, F1
# and recall for both the training and the test split.
# Iterating over items() avoids rebuilding the key/value lists on every pass.
for name, model in models.items():
    model.fit(X_train, y_train)

    # Model prediction
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Train data performance
    train_as = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)

    # Test data performance
    test_as = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)

    print(name)
    print('\t-Train accuracy score: ', train_as)
    print('\t-Train f1 score: ', train_f1)
    print('\t-Train recall score: ', train_recall, '\n')

    print('\t-Test accuracy score: ', test_as)
    print('\t-Test f1 score: ', test_f1)
    print('\t-Test recall score: ', test_recall)

    # Visual separator between models
    print('*'*15)

SVC
	-Train accuracy score:  0.9794776119402985
	-Train f1 score:  0.9793621013133209
	-Train recall score:  0.9738805970149254 

	-Test accuracy score:  0.9300699300699301
	-Test f1 score:  0.9074074074074074
	-Test recall score:  0.9074074074074074
***************
Random Forest
	-Train accuracy score:  0.9981343283582089
	-Train f1 score:  0.9981308411214954
	-Train recall score:  0.996268656716418 

	-Test accuracy score:  0.9440559440559441
	-Test f1 score:  0.9285714285714286
	-Test recall score:  0.9629629629629629
***************
Ada Boost
	-Train accuracy score:  0.996268656716418
	-Train f1 score:  0.9962546816479401
	-Train recall score:  0.9925373134328358 

	-Test accuracy score:  0.9300699300699301
	-Test f1 score:  0.9074074074074074
	-Test recall score:  0.9074074074074074
***************
Gradient Boost
	-Train accuracy score:  1.0
	-Train f1 score:  1.0
	-Train recall score:  1.0 

	-Test accuracy score:  0.951048951048951
	-Test f1 score:  0.9369369369369369
	-Test rec

# Selecting 'Gradient Boosting' as final model

In [32]:
# Final model: gradient boosting with the same hyperparameters as the
# 'Gradient Boost' entry evaluated above.
# subsample=0.5 and max_features='sqrt' both draw random subsets during
# fitting, so random_state is fixed to make the exported predictions
# reproducible.
gradient = GradientBoostingClassifier(subsample=0.5, n_estimators=50, min_samples_split=2, min_samples_leaf=1,
                                      max_features='sqrt', max_depth=10, learning_rate=0.001,
                                      random_state=42)
gradient.fit(X_train, y_train)
prediction = gradient.predict(X_test)
prediction

array([0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0])

In [36]:
# Wrap the predicted class labels in a single-column frame for export.
predicted_df = pd.DataFrame({'Diagnosis': prediction})

In [42]:
# Translate numeric classes to letter codes: 0 -> 'B', anything else -> 'M'.
# The labels are derived from the raw `prediction` array rather than from the
# column being overwritten, so re-running this cell is idempotent. Previously
# a second run turned every row into 'M' because the column no longer held 0/1.
predicted_df['Diagnosis'] = np.where(prediction == 0, 'B', 'M')

# Save the Final DataFrame

In [48]:
# Write the labelled predictions to disk without the row index.
predicted_df.to_csv(path_or_buf='Final_Predicted.csv', index=False)