# Machine Learning - Assignment 1
# Data Analysis and Model Evaluation

In [1]:
import numpy as np
from Evaluate import *
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, LabelEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

### Dataset #1: Qualitative Bankruptcy

link - https://archive.ics.uci.edu/ml/datasets/Qualitative_Bankruptcy

In [2]:
# Load the Qualitative Bankruptcy data. The file is read without a header row,
# so the column names from the data description are supplied while reading.
data_name = 'Qualitative_Bankruptcy'
file_path = os.path.join(BASE_DATA_PATH, data_name)
df = pd.read_csv(
    file_path + '.txt',
    sep=',',
    header=None,
    names=[
        "Industrial Risk",
        "Management Risk",
        "Financial Flexibility",
        "Credibility",
        "Competitiveness",
        "Operating Risk",
        "Class",
    ],
)
df.head()

Unnamed: 0,Industrial Risk,Management Risk,Financial Flexibility,Credibility,Competitiveness,Operating Risk,Class
0,P,P,A,A,A,P,NB
1,N,N,A,A,A,N,NB
2,A,A,A,A,A,A,NB
3,P,P,P,P,P,P,NB
4,N,N,P,P,P,N,NB


In [3]:
# Validate that there are no missing values: every column should report the
# full non-null count in the summary printed by info().
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Industrial Risk        250 non-null    object
 1   Management Risk        250 non-null    object
 2   Financial Flexibility  250 non-null    object
 3   Credibility            250 non-null    object
 4   Competitiveness        250 non-null    object
 5   Operating Risk         250 non-null    object
 6   Class                  250 non-null    object
dtypes: object(7)
memory usage: 13.8+ KB


Attribute Information: (P = Positive, A = Average, N = Negative, B = Bankruptcy, NB = Non-Bankruptcy)

    1. Industrial Risk: {P,A,N}
    2. Management Risk: {P,A,N}
    3. Financial Flexibility: {P,A,N}
    4. Credibility: {P,A,N}
    5. Competitiveness: {P,A,N}
    6. Operating Risk: {P,A,N}
    7. Class: {B,NB}

Check data distribution to decide how to transform the features to binary data

In [4]:
# Print the relative frequency of every value, one column at a time.
for _, column_values in df.items():
    print(column_values.value_counts(normalize=True))

N    0.356
A    0.324
P    0.320
Name: Industrial Risk, dtype: float64
N    0.476
A    0.276
P    0.248
Name: Management Risk, dtype: float64
N    0.476
A    0.296
P    0.228
Name: Financial Flexibility, dtype: float64
N    0.376
P    0.316
A    0.308
Name: Credibility, dtype: float64
N    0.412
P    0.364
A    0.224
Name: Competitiveness, dtype: float64
N    0.456
P    0.316
A    0.228
Name: Operating Risk, dtype: float64
NB    0.572
B     0.428
Name: Class, dtype: float64


Based on the value counts, we'll merge the P and A values into a single category so that every attribute becomes binary: P and A are converted to 1, and N is converted to 0.

In the target column, B is 1 and NB is 0 (the model attempts to predict bankruptcy, so that is the positive class).

In [5]:
# Binarise every column with one explicit mapping:
#   features: A / P -> 1, N -> 0
#   target:   B -> 1,    NB -> 0
# A new frame is assigned instead of mutating in place, and the explicit cast
# guarantees integer columns; it raises if any unmapped label is left behind
# rather than silently passing strings on to the models.
binary_encoding = {'A': 1, 'P': 1, 'B': 1, 'N': 0, 'NB': 0}
df = df.replace(binary_encoding).astype('int64')

In [6]:
# Separate the target column ('Class') from the predictor columns.
y = df['Class']
X = df.drop(columns=['Class'])

In [7]:
# Run the evaluation on the bankruptcy data. The recorded output reports
# Accuracy, F1-score, Precision, ROC_AUC, Recall and fit_time for each of
# MyID3, DecisionTreeClassifier, MyBaggingID3 and BaggingClassifier.
# NOTE(review): repetitions/n_folds presumably configure repeated k-fold
# cross-validation and sync presumably controls wandb logging - confirm
# against the evaluate() definition.
evaluate(X, y, data_name, repetitions=2, n_folds=5, sync=True)

Starting evaluation process on MyID3


[34m[1mwandb[0m: Currently logged in as: [33mgindes[0m ([33mmachinelearning_37225214[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016677521516673247, max=1.0…



0,1
Accuracy,0.992
F1-score,0.991
Precision,0.982
ROC_AUC,0.993
Recall,1.0
fit_time,68.496


Starting evaluation process on DecisionTreeClassifier


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669682999994013, max=1.0…

0,1
Accuracy,0.996
F1-score,0.995
Precision,0.991
ROC_AUC,0.996
Recall,1.0
fit_time,82.506


Starting evaluation process on MyBaggingID3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669201716680012, max=1.0…

0,1
Accuracy,0.968
F1-score,0.962
Precision,0.972
ROC_AUC,0.966
Recall,0.954
fit_time,8893.766


Starting evaluation process on BaggingClassifier


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016679638216677024, max=1.0…

0,1
Accuracy,0.974
F1-score,0.969
Precision,0.983
ROC_AUC,0.972
Recall,0.958
fit_time,1509.621


{'MyID3': {'fit_time': 68.49648952484131,
  'score_time': 5.018115043640137,
  'test_Accuracy': 0.992,
  'test_Precision': 0.982213438735178,
  'test_Recall': 1.0,
  'test_F1-score': 0.9909043927648579,
  'test_ROC_AUC': 0.9929802955665025},
 'DecisionTreeClassifier': {'fit_time': 82.50565528869629,
  'score_time': 5.48248291015625,
  'test_Accuracy': 0.9960000000000001,
  'test_Precision': 0.9911067193675891,
  'test_Recall': 1.0,
  'test_F1-score': 0.995452196382429,
  'test_ROC_AUC': 0.9964901477832513},
 'MyBaggingID3': {'fit_time': 8893.765568733215,
  'score_time': 9.97316837310791,
  'test_Accuracy': 0.968,
  'test_Precision': 0.9718614718614719,
  'test_Recall': 0.9536796536796537,
  'test_F1-score': 0.9623399950399383,
  'test_ROC_AUC': 0.9663718465442603},
 'BaggingClassifier': {'fit_time': 1509.6213102340698,
  'score_time': 7.3172807693481445,
  'test_Accuracy': 0.974,
  'test_Precision': 0.982608695652174,
  'test_Recall': 0.9577922077922079,
  'test_F1-score': 0.968693027

### Dataset #2: Divorce Predictors

link - https://archive.ics.uci.edu/ml/datasets/Divorce+Predictors+data+set

In [8]:
# Load the Divorce Predictors data: a ';'-separated file whose first row holds
# the column names.
data_name = "Divorce Predictors"
file_path = os.path.join(BASE_DATA_PATH, 'divorce')
divorce_data = pd.read_csv(file_path + '.csv', sep=';', header=0)

In [9]:
# Preview the first rows of the loaded frame.
divorce_data.head()

Unnamed: 0,Atr1,Atr2,Atr3,Atr4,Atr5,Atr6,Atr7,Atr8,Atr9,Atr10,...,Atr46,Atr47,Atr48,Atr49,Atr50,Atr51,Atr52,Atr53,Atr54,Class
0,2,2,4,1,0,0,0,0,0,0,...,2,1,3,3,3,2,3,2,1,1
1,4,4,4,4,4,0,0,4,4,4,...,2,2,3,4,4,4,4,2,2,1
2,2,2,2,2,1,3,2,1,1,2,...,3,2,3,1,1,1,2,2,2,1
3,3,2,3,2,3,3,3,3,3,3,...,2,2,3,3,3,3,2,2,2,1
4,2,2,1,1,1,1,0,0,0,0,...,2,1,2,3,2,2,2,1,0,1


In [10]:
# Separate the target column ('Class') from the predictor columns.
y = divorce_data['Class']
X = divorce_data.drop(columns=['Class'])

In [11]:
# Verify the target is binary: only two distinct values should be listed.
np.unique(y)

array([0, 1])

In [12]:
# Each feature in X is a statement response between 0 and 4.
# Binarise the answers with two equal-width bins per feature; for a feature
# whose observed values span the full 0-4 range this gives
# 0 = [0,1] and 1 = [2,3,4].
est = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='uniform')
X_transformed = est.fit_transform(X)
# Sanity check: only the two bin labels should remain.
np.unique(X_transformed)

array([0., 1.])

In [13]:
# Run the evaluation on the binarised divorce features. The recorded output
# reports Accuracy, F1-score, Precision, ROC_AUC, Recall and fit_time for each
# of MyID3, DecisionTreeClassifier, MyBaggingID3 and BaggingClassifier.
# NOTE(review): repetitions/n_folds presumably configure repeated k-fold
# cross-validation - confirm against the evaluate() definition.
evaluate(X_transformed, y, data_name, repetitions=2, n_folds=5, sync=True)

Starting evaluation process on MyID3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016670365416666756, max=1.0…

0,1
Accuracy,0.979
F1-score,0.979
Precision,0.983
ROC_AUC,0.979
Recall,0.976
fit_time,242.794


Starting evaluation process on DecisionTreeClassifier


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669214833321653, max=1.0…

0,1
Accuracy,0.976
F1-score,0.976
Precision,0.972
ROC_AUC,0.976
Recall,0.982
fit_time,86.65


Starting evaluation process on MyBaggingID3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016670379883332014, max=1.0…

0,1
Accuracy,0.976
F1-score,0.975
Precision,1.0
ROC_AUC,0.976
Recall,0.952
fit_time,28021.287


Starting evaluation process on BaggingClassifier


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666968036667337, max=1.0)…

0,1
Accuracy,0.974
F1-score,0.972
Precision,1.0
ROC_AUC,0.973
Recall,0.946
fit_time,1367.178


{'MyID3': {'fit_time': 242.79444217681885,
  'score_time': 5.869317054748535,
  'test_Accuracy': 0.9794117647058824,
  'test_Precision': 0.9833333333333334,
  'test_Recall': 0.9761029411764707,
  'test_F1-score': 0.9792850160592096,
  'test_ROC_AUC': 0.9792279411764706},
 'DecisionTreeClassifier': {'fit_time': 86.6495132446289,
  'score_time': 9.128451347351074,
  'test_Accuracy': 0.9764705882352942,
  'test_Precision': 0.9722222222222221,
  'test_Recall': 0.9816176470588236,
  'test_F1-score': 0.976017316017316,
  'test_ROC_AUC': 0.9761029411764707},
 'MyBaggingID3': {'fit_time': 28021.28701210022,
  'score_time': 8.422374725341797,
  'test_Accuracy': 0.9764705882352942,
  'test_Precision': 1.0,
  'test_Recall': 0.9522058823529411,
  'test_F1-score': 0.9747727272727273,
  'test_ROC_AUC': 0.9761029411764707},
 'BaggingClassifier': {'fit_time': 1367.1783924102783,
  'score_time': 5.869603157043457,
  'test_Accuracy': 0.973529411764706,
  'test_Precision': 1.0,
  'test_Recall': 0.9463235

### Dataset #3: Breast Cancer Wisconsin
link - https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data

In [14]:
# Load the Breast Cancer Wisconsin data and preview the first rows.
data_name = 'Breast Cancer Wisconsin'
file_path = os.path.join(BASE_DATA_PATH, 'breast_cancer')
df = pd.read_csv(file_path + '.csv')
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [15]:
# Drop columns that are highly correlated with one another, plus unnecessary
# columns: the record 'id', the target itself ('diagnosis') and
# 'Unnamed: 32' (blank in the preview above).
to_drop = [
    'id', 'diagnosis', 'Unnamed: 32',
    'perimeter_mean', 'radius_mean', 'compactness_mean', 'concave points_mean',
    'radius_se', 'perimeter_se',
    'radius_worst', 'perimeter_worst', 'compactness_worst', 'concave points_worst',
    'compactness_se', 'concave points_se',
    'texture_worst', 'area_worst',
]
X = df.drop(columns=to_drop)
# Drop rows with missing details. dropna() returns a new frame, so the result
# must be assigned back; the bare call discarded it and removed nothing.
X = X.dropna()
# Restrict df to the surviving rows so that the target, which is read from
# df['diagnosis'] afterwards, stays aligned row-for-row with X.
df = df.loc[X.index]
X

Unnamed: 0,texture_mean,area_mean,smoothness_mean,concavity_mean,symmetry_mean,fractal_dimension_mean,texture_se,area_se,smoothness_se,concavity_se,symmetry_se,fractal_dimension_se,smoothness_worst,concavity_worst,symmetry_worst,fractal_dimension_worst
0,10.38,1001.0,0.11840,0.30010,0.2419,0.07871,0.9053,153.40,0.006399,0.05373,0.03003,0.006193,0.16220,0.7119,0.4601,0.11890
1,17.77,1326.0,0.08474,0.08690,0.1812,0.05667,0.7339,74.08,0.005225,0.01860,0.01389,0.003532,0.12380,0.2416,0.2750,0.08902
2,21.25,1203.0,0.10960,0.19740,0.2069,0.05999,0.7869,94.03,0.006150,0.03832,0.02250,0.004571,0.14440,0.4504,0.3613,0.08758
3,20.38,386.1,0.14250,0.24140,0.2597,0.09744,1.1560,27.23,0.009110,0.05661,0.05963,0.009208,0.20980,0.6869,0.6638,0.17300
4,14.34,1297.0,0.10030,0.19800,0.1809,0.05883,0.7813,94.44,0.011490,0.05688,0.01756,0.005115,0.13740,0.4000,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,22.39,1479.0,0.11100,0.24390,0.1726,0.05623,1.2560,158.70,0.010300,0.05198,0.01114,0.004239,0.14100,0.4107,0.2060,0.07115
565,28.25,1261.0,0.09780,0.14400,0.1752,0.05533,2.4630,99.04,0.005769,0.03950,0.01898,0.002498,0.11660,0.3215,0.2572,0.06637
566,28.08,858.1,0.08455,0.09251,0.1590,0.05648,1.0750,48.55,0.005903,0.04730,0.01318,0.003892,0.11390,0.3403,0.2218,0.07820
567,29.33,1265.0,0.11780,0.35140,0.2397,0.07016,1.5950,86.22,0.006522,0.07117,0.02324,0.006185,0.16500,0.9387,0.4087,0.12400


In [16]:
# y is the target: encode the 'diagnosis' labels as integers so that the
# target is binary. LabelEncoder numbers the classes in sorted label order.
le = LabelEncoder()
y = le.fit_transform(df['diagnosis'])

In [17]:
# Verify the target variable is binary as required, and show how many
# samples fall into each class.
np.unique(y, return_counts=True)

(array([0, 1]), array([357, 212]))

In [18]:
# List the feature columns that remain after the column selection.
X.columns

Index(['texture_mean', 'area_mean', 'smoothness_mean', 'concavity_mean',
       'symmetry_mean', 'fractal_dimension_mean', 'texture_se', 'area_se',
       'smoothness_se', 'concavity_se', 'symmetry_se', 'fractal_dimension_se',
       'smoothness_worst', 'concavity_worst', 'symmetry_worst',
       'fractal_dimension_worst'],
      dtype='object')

In [19]:
# Define the columns to one-hot encode and the columns to discretize.
# In this dataset every column is continuous, so all of them are discretized
# into multiple bins and then one-hot encoded to make each feature binary.
print(f'Number of columns prior to discratization and OneHot encoding: {X.shape[1]}')
cat_cols = []
cont_cols = [
    'texture_mean',
    'area_mean',
    'smoothness_mean',
    'concavity_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
    'texture_se',
    'area_se',
    'smoothness_se',
    'concavity_se',
    'symmetry_se',
    'fractal_dimension_se',
    'smoothness_worst',
    'concavity_worst',
    'symmetry_worst',
    'fractal_dimension_worst',
]
X = preprocess(X, cat_cols, cont_cols, n_bins=5)
print(f'Number of columns after to discratization and OneHot encoding: {X.shape[1]}')

Number of columns prior to discratization and OneHot encoding: 16
Number of columns after to discratization and OneHot encoding: 80


We see that there are no categorical features, but all of the features need to be discretized. Because we are using only 5 bins per feature, we expect some loss of information.

In [20]:
# Run the evaluation on the preprocessed breast cancer features. The recorded
# output reports Accuracy, F1-score, Precision, ROC_AUC, Recall and fit_time
# for each of MyID3, DecisionTreeClassifier, MyBaggingID3 and BaggingClassifier.
# NOTE(review): in the recorded run MyBaggingID3 scores ROC_AUC ~0.50
# (accuracy ~0.38) while the single MyID3 tree reaches ~0.88 - the
# MyBaggingID3 implementation is worth investigating.
evaluate(X, y, data_name, repetitions=2, n_folds=5, sync=True)

Starting evaluation process on MyID3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016677838750001683, max=1.0…

0,1
Accuracy,0.885
F1-score,0.847
Precision,0.843
ROC_AUC,0.879
Recall,0.854
fit_time,2404.414


Starting evaluation process on DecisionTreeClassifier


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016678638466661747, max=1.0…

0,1
Accuracy,0.881
F1-score,0.843
Precision,0.838
ROC_AUC,0.875
Recall,0.854
fit_time,81.181


Starting evaluation process on MyBaggingID3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016670580966668545, max=1.0…

0,1
Accuracy,0.376
F1-score,0.543
Precision,0.373
ROC_AUC,0.501
Recall,0.993
fit_time,508550.036


Starting evaluation process on BaggingClassifier


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016679124833353855, max=1.0…

0,1
Accuracy,0.914
F1-score,0.882
Precision,0.904
ROC_AUC,0.904
Recall,0.863
fit_time,1300.566


{'MyID3': {'fit_time': 2404.4142961502075,
  'score_time': 4.748249053955078,
  'test_Accuracy': 0.8849091755938518,
  'test_Precision': 0.8427180440789679,
  'test_Recall': 0.8538205980066443,
  'test_F1-score': 0.8472383679367421,
  'test_ROC_AUC': 0.8786219578452628},
 'DecisionTreeClassifier': {'fit_time': 81.18093013763428,
  'score_time': 6.189322471618652,
  'test_Accuracy': 0.8805154479118149,
  'test_Precision': 0.838299951352069,
  'test_Recall': 0.8540420819490586,
  'test_F1-score': 0.842672088852607,
  'test_ROC_AUC': 0.8752506966865793},
 'MyBaggingID3': {'fit_time': 508550.03576278687,
  'score_time': 20.697498321533203,
  'test_Accuracy': 0.376090669150753,
  'test_Precision': 0.3732761106305587,
  'test_Recall': 0.9928571428571429,
  'test_F1-score': 0.542512679235213,
  'test_ROC_AUC': 0.5012896825396825},
 'BaggingClassifier': {'fit_time': 1300.5659818649292,
  'score_time': 5.262255668640137,
  'test_Accuracy': 0.9139108834031981,
  'test_Precision': 0.9040082727150

### Dataset #4: Heart Disease Cleveland
link - https://www.kaggle.com/datasets/cherngs/heart-disease-cleveland-uci

In [21]:
# Read the Heart Disease Cleveland data and discard rows with missing values.
data_name = 'Heart Disease Cleveland'
file_path = os.path.join(BASE_DATA_PATH, 'heart_cleveland')
df = pd.read_csv(file_path + '.csv').dropna()
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [22]:
# Predictors: every column except the 'condition' target.
X = df.drop(columns=['condition'])
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0


In [23]:
# Target column; value_counts() shows how many samples each class has.
y = df['condition']
y.value_counts()

0    160
1    137
Name: condition, dtype: int64

In [24]:
# List the available feature columns before choosing how to encode them.
X.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')

In [25]:
# Define the columns to one-hot encode (categorical codes) and the columns to
# discretize (continuous measurements).
# NOTE(review): 'sex', 'fbs' and 'exang' appear in neither list. Check the
# recorded output shape (297, 27) against the expected column count to confirm
# that preprocess() does not silently drop them.
cat_cols = ['cp', 'restecg', 'slope', 'ca', 'thal']
cont_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
X = preprocess(X, cat_cols, cont_cols)

In [26]:
# Verify the preprocessed features are binary and show the resulting shape.
np.unique(X), X.shape

(array([0., 1.]), (297, 27))

We see that there are categorical features that need to be encoded and numeric features that need to be discretized. Because we are using 2 bins, we expect a loss of information.

In [27]:
# Run the evaluation on the preprocessed heart disease features. The recorded
# output reports Accuracy, F1-score, Precision, ROC_AUC, Recall and fit_time
# for each of MyID3, DecisionTreeClassifier, MyBaggingID3 and BaggingClassifier.
# NOTE(review): in the recorded run MyBaggingID3 scores ROC_AUC ~0.50
# (accuracy ~0.47) while the single MyID3 tree reaches ~0.82 - the
# MyBaggingID3 implementation is worth investigating.
evaluate(X, y, data_name, repetitions=2, n_folds=5, sync=True)

Starting evaluation process on MyID3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016678211616696595, max=1.0…

0,1
Accuracy,0.818
F1-score,0.795
Precision,0.83
ROC_AUC,0.814
Recall,0.766
fit_time,751.318


Starting evaluation process on DecisionTreeClassifier


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016676966250027667, max=1.0…

0,1
Accuracy,0.818
F1-score,0.797
Precision,0.825
ROC_AUC,0.815
Recall,0.774
fit_time,43.86


Starting evaluation process on MyBaggingID3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01667774076668138, max=1.0)…

0,1
Accuracy,0.47
F1-score,0.613
Precision,0.466
ROC_AUC,0.502
Recall,0.913
fit_time,170244.506


Starting evaluation process on BaggingClassifier


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666956293332381, max=1.0)…

0,1
Accuracy,0.802
F1-score,0.781
Precision,0.796
ROC_AUC,0.8
Recall,0.774
fit_time,1213.828


{'MyID3': {'fit_time': 751.3178825378418,
  'score_time': 4.89804744720459,
  'test_Accuracy': 0.8182768361581919,
  'test_Precision': 0.8304026356784977,
  'test_Recall': 0.7664021164021164,
  'test_F1-score': 0.7953044500462574,
  'test_ROC_AUC': 0.8144510582010582},
 'DecisionTreeClassifier': {'fit_time': 43.85967254638672,
  'score_time': 4.72712516784668,
  'test_Accuracy': 0.8182768361581921,
  'test_Precision': 0.8246370679129299,
  'test_Recall': 0.7738095238095238,
  'test_F1-score': 0.7971393059044207,
  'test_ROC_AUC': 0.8150297619047618},
 'MyBaggingID3': {'fit_time': 170244.5059299469,
  'score_time': 13.631677627563477,
  'test_Accuracy': 0.4695762711864406,
  'test_Precision': 0.46587162342462474,
  'test_Recall': 0.9126984126984127,
  'test_F1-score': 0.6133560897901983,
  'test_ROC_AUC': 0.5016617063492064},
 'BaggingClassifier': {'fit_time': 1213.827896118164,
  'score_time': 5.691409111022949,
  'test_Accuracy': 0.8016101694915255,
  'test_Precision': 0.7956249367984

### Dataset #5: Titanic
link - https://www.kaggle.com/competitions/titanic/data?select=train.csv

In [28]:
data_name = 'Titanic Survival Predictors'
file_path = os.path.join(BASE_DATA_PATH, 'titanic')
df = (
    pd.read_csv(file_path + '.csv')
    # Drop columns that have mostly unique values and don't provide much information
    .drop(columns=['Name', 'Ticket', 'PassengerId', 'Cabin'])
    # Remove rows with missing values
    .dropna()
)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [29]:
# Predictors: every column except the 'Survived' target.
X = df.drop(columns=['Survived'])
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
885,3,female,39.0,0,5,29.1250,Q
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
889,1,male,26.0,0,0,30.0000,C


In [30]:
# Target column: the 'Survived' flag.
y = df['Survived']
# Verify target is binary: only two distinct values should be listed
y.unique()

array([0, 1])

In [31]:
# Define the columns to one-hot encode (treated as categorical) and the
# columns to discretize (continuous values).
cat_cols = ['Sex', 'Embarked', 'Pclass', 'SibSp', 'Parch']
cont_cols = ['Age', 'Fare']
X = preprocess(X, cat_cols, cont_cols)

In [32]:
# Show the number of rows and features after preprocessing.
X.shape

(712, 25)

After one-hot encoding, the data has 25 binary features, each one indicating a single possible value (or bin) of one of the original non-binary features.

In [33]:
# Verify binary features: only two distinct values should remain in X.
np.unique(X)

array([0., 1.])

In [34]:
# Run the evaluation on the preprocessed Titanic features. The recorded output
# reports Accuracy, F1-score, Precision, ROC_AUC, Recall and fit_time for each
# of MyID3, DecisionTreeClassifier, MyBaggingID3 and BaggingClassifier.
# NOTE(review): in the recorded run MyBaggingID3 scores ROC_AUC ~0.51 with
# recall ~0.07 while the single MyID3 tree reaches ROC_AUC ~0.74 - the
# MyBaggingID3 implementation is worth investigating.
evaluate(X, y, data_name, repetitions=2, n_folds=5, sync=True)

Starting evaluation process on MyID3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01667616943335209, max=1.0)…

0,1
Accuracy,0.768
F1-score,0.684
Precision,0.785
ROC_AUC,0.744
Recall,0.625
fit_time,1327.953


Starting evaluation process on DecisionTreeClassifier


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016671827866654592, max=1.0…

0,1
Accuracy,0.778
F1-score,0.694
Precision,0.804
ROC_AUC,0.753
Recall,0.625
fit_time,47.463


Starting evaluation process on MyBaggingID3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016670568533299956, max=1.0…

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0,1
Accuracy,0.6
F1-score,0.112
Precision,0.484
ROC_AUC,0.514
Recall,0.068
fit_time,370730.614


Starting evaluation process on BaggingClassifier


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016670346266679795, max=1.0…

0,1
Accuracy,0.781
F1-score,0.707
Precision,0.774
ROC_AUC,0.761
Recall,0.655
fit_time,1292.277


{'MyID3': {'fit_time': 1327.9525518417358,
  'score_time': 5.005073547363281,
  'test_Accuracy': 0.7675416133162613,
  'test_Precision': 0.7847882231191756,
  'test_Recall': 0.6248336358136721,
  'test_F1-score': 0.6844657628645079,
  'test_ROC_AUC': 0.7444756414362479},
 'DecisionTreeClassifier': {'fit_time': 47.463417053222656,
  'score_time': 4.874277114868164,
  'test_Accuracy': 0.7780852949867034,
  'test_Precision': 0.8041208630972093,
  'test_Recall': 0.6249243799153055,
  'test_F1-score': 0.6937730841624883,
  'test_ROC_AUC': 0.7533585485010702},
 'MyBaggingID3': {'fit_time': 370730.6138753891,
  'score_time': 28.141021728515625,
  'test_Accuracy': 0.5997340687481533,
  'test_Precision': 0.4840686274509804,
  'test_Recall': 0.06790683605565638,
  'test_F1-score': 0.11227894874143585,
  'test_ROC_AUC': 0.5144646225096209},
 'BaggingClassifier': {'fit_time': 1292.276954650879,
  'score_time': 5.992889404296875,
  'test_Accuracy': 0.7808578745198463,
  'test_Precision': 0.77399403