<a href="https://colab.research.google.com/github/Requenamar3/datawrangling/blob/main/TemplateForBinaryClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from time import time
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score
from matplotlib import pyplot
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier, PassiveAggressiveClassifier, Perceptron
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, BernoulliNB, ComplementNB, MultinomialNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

In [None]:
# Load bank-full.csv straight from GitHub; its fields are separated by ';' rather than ','.
DATA_URL = 'https://raw.githubusercontent.com/fenago/datasets/main/bank-full.csv'
df = pd.read_csv(DATA_URL, delimiter=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [None]:
# Two baseline requirements before modelling: every feature must be numeric,
# and there must be no missing values. Any further data wrangling belongs here too.

# Replace missing values with 0, then report how many distinct values each column holds.
df = df.fillna(0)
print(df.nunique())

age            77
job            12
marital         3
education       4
default         2
balance      7168
housing         2
loan            2
contact         3
day            31
month          12
duration     1573
campaign       48
pdays         559
previous       41
poutcome        4
y               2
dtype: int64


In [None]:
# A text/object target column can be turned into integer codes with LabelEncoder.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Learn the class labels and overwrite the target column with their integer codes.
df['y'] = le.fit_transform(df['y'])

# Show which integer each original label was mapped to.
mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
print(mapping)

{'no': 0, 'yes': 1}


In [None]:
# Preview the first rows to confirm that `y` now holds integer codes.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0


In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns=['y'])

In [None]:
# Target vector: the label-encoded `y` column.
y = df.loc[:, 'y']

In [None]:
# Preview the raw features before encoding.
X.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown


In [None]:
# One-hot encode the text columns into indicator columns; numeric columns are kept as they are.
X = pd.get_dummies(data=X)

In [None]:
# Preview the encoded features.
X.head(n=5)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,33,2,5,76,1,-1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
3,47,1506,5,92,1,-1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Summarise the held-out feature frame: row count, column names, non-null counts and dtypes.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9043 entries, 3776 to 11677
Data columns (total 51 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   age                  9043 non-null   int64
 1   balance              9043 non-null   int64
 2   day                  9043 non-null   int64
 3   duration             9043 non-null   int64
 4   campaign             9043 non-null   int64
 5   pdays                9043 non-null   int64
 6   previous             9043 non-null   int64
 7   job_admin.           9043 non-null   uint8
 8   job_blue-collar      9043 non-null   uint8
 9   job_entrepreneur     9043 non-null   uint8
 10  job_housemaid        9043 non-null   uint8
 11  job_management       9043 non-null   uint8
 12  job_retired          9043 non-null   uint8
 13  job_self-employed    9043 non-null   uint8
 14  job_services         9043 non-null   uint8
 15  job_student          9043 non-null   uint8
 16  job_technician      

In [None]:
# Candidate classifiers as (label, estimator) pairs, consumed by the evaluation loop.
#
# Every estimator that accepts `random_state` receives the same seed so that a
# fresh Restart & Run All reproduces the same scores; without it the stochastic
# learners (SGD, PAC, trees, ensembles, MLP, ...) give different numbers on each run.
# LDA, QDA, KNN, NearestCentroid and the naive Bayes models take no seed.
MODEL_SEED = 42

models = [
    # Linear models
    ('LR', LogisticRegression(solver='liblinear', random_state=MODEL_SEED)),
    ('SGD', SGDClassifier(random_state=MODEL_SEED)),
    ('Ridge', RidgeClassifier(random_state=MODEL_SEED)),
    ('PAC', PassiveAggressiveClassifier(random_state=MODEL_SEED)),
    ('Perceptron', Perceptron(random_state=MODEL_SEED)),
    # Discriminant analysis
    ('LDA', LinearDiscriminantAnalysis()),
    ('QDA', QuadraticDiscriminantAnalysis()),
    # Neighbour-based models
    ('KNN', KNeighborsClassifier()),
    ('NearestCentroid', NearestCentroid()),
    # Single trees
    ('CART', DecisionTreeClassifier(random_state=MODEL_SEED)),
    ('ExtraTree', ExtraTreeClassifier(random_state=MODEL_SEED)),
    # Naive Bayes
    # NOTE(review): ComplementNB and MultinomialNB are left out, presumably because
    # they reject negative feature values and this data contains some (e.g. pdays = -1).
    ('NB', GaussianNB()),
    ('BNB', BernoulliNB()),
    # Support vector machines
    ('SVM', SVC(gamma='auto', random_state=MODEL_SEED)),
    ('NuSVC', NuSVC(random_state=MODEL_SEED)),
    ('LinearSVC', LinearSVC(random_state=MODEL_SEED)),
    # Ensembles
    ('RF', RandomForestClassifier(random_state=MODEL_SEED)),
    ('GB', GradientBoostingClassifier(random_state=MODEL_SEED)),
    ('ExtraTrees', ExtraTreesClassifier(random_state=MODEL_SEED)),
    ('Bagging', BaggingClassifier(random_state=MODEL_SEED)),
    ('AdaBoost', AdaBoostClassifier(random_state=MODEL_SEED)),
    # Neural network
    ('MLP', MLPClassifier(random_state=MODEL_SEED)),
    # Gradient boosting (XGBoost)
    # NOTE(review): `use_label_encoder` is kept for older XGBoost releases; newer
    # releases appear to ignore it with a warning -- confirm against the installed version.
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                              random_state=MODEL_SEED)),
]

In [None]:
# Compare every candidate model with 10-fold cross-validation on the training split,
# print per-model scores and timings, then draw a box plot of the fold scores.
#
# Scoring metrics:
# 'accuracy', 'balanced_accuracy', 'average_precision', 'neg_brier_score', 'f1', 'f1_micro', 'f1_macro', 'f1_weighted',
# 'neg_log_loss', 'precision', 'recall', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted'
results = []
names = []
scoring = 'accuracy'

# The splitter does not depend on the model, so build it once. With a fixed
# integer seed it yields the same ten folds for every model.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)

for name, model in models:
    # Time one fit on the whole training split.
    fit_start = time()
    model.fit(X_train, y_train)
    train_time = time() - fit_start

    # Time cross-validation on its own clock. The old code reused the fit's start
    # time, so its "Prediction time" was really fit time plus cross-validation
    # time, and no prediction was ever timed.
    cv_start = time()
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    cv_time = time() - cv_start

    results.append(cv_results)
    names.append(name)

    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))
    print("Score for each of the 10 K-fold tests: ", cv_results)
    print(model)
    print("\tTraining time: %0.3fs" % train_time)
    print("\tCross-validation time: %0.3fs" % cv_time)
    print()

# One box per model, showing the spread of its ten fold scores.
fig = pyplot.figure(figsize=(15, 10))  # Change the numbers as needed
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names, rotation=45)  # Rotated so the model names do not overlap
pyplot.show()

LR: 0.901709 (0.003658)
Score for each of the 10 K-fold tests:  [0.90489356 0.89687586 0.90295825 0.90572297 0.90710534 0.89881117
 0.8954935  0.90185236 0.90348451 0.89988938]
LogisticRegression(solver='liblinear')
	Training time: 0.810s
	Prediction time: 8.504s

SGD: 0.750599 (0.205043)
Score for each of the 10 K-fold tests:  [0.88139342 0.86480509 0.88968759 0.87973459 0.72103954 0.38263754
 0.84932264 0.85540503 0.86144912 0.32051991]
SGDClassifier()
	Training time: 0.578s
	Prediction time: 7.524s

Ridge: 0.900216 (0.004240)
Score for each of the 10 K-fold tests:  [0.90047    0.89494056 0.90212884 0.90489356 0.90572297 0.89825823
 0.89134642 0.90240531 0.90293142 0.89905973]
RidgeClassifier()
	Training time: 0.066s
	Prediction time: 1.028s

PAC: 0.849980 (0.060019)
Score for each of the 10 K-fold tests:  [0.88581698 0.74564556 0.84959912 0.87696986 0.88941111 0.88692286
 0.8786287  0.7193807  0.88882743 0.87859513]
PassiveAggressiveClassifier()
	Training time: 0.409s
	Prediction ti



QDA: 0.872788 (0.006786)
Score for each of the 10 K-fold tests:  [0.8656345  0.86342273 0.87973459 0.86978159 0.8855405  0.88111695
 0.86839923 0.87005806 0.87361726 0.87057522]
QuadraticDiscriminantAnalysis()
	Training time: 0.213s
	Prediction time: 2.081s

KNN: 0.882659 (0.004588)
Score for each of the 10 K-fold tests:  [0.88084048 0.87779928 0.89189936 0.88471109 0.88941111 0.87669339
 0.880564   0.88332873 0.88053097 0.88080752]
KNeighborsClassifier()
	Training time: 0.017s
	Prediction time: 32.140s

NearestCentroid: 0.736203 (0.005223)
Score for each of the 10 K-fold tests:  [0.73513962 0.73265137 0.74094553 0.73679845 0.7442632  0.73901023
 0.72822781 0.72850429 0.73423673 0.74225664]
NearestCentroid()
	Training time: 0.023s
	Prediction time: 0.597s

CART: 0.874115 (0.005293)
Score for each of the 10 K-fold tests:  [0.87144042 0.87254631 0.87254631 0.87669339 0.88277578 0.8786287
 0.86508156 0.87641692 0.87859513 0.86642699]
DecisionTreeClassifier()
	Training time: 0.667s
	Predic