 # 🤖 Automated Machine Learning-Classification
  
This notebook applies Automated Machine Learning (AutoML) tools to a multi-class classification task. Data preparation is kept minimal because the AutoML tools perform the pre-processing automatically before building the Machine Learning models and tuning their hyperparameters. The objective of this notebook is to serve as a cheat sheet.

In [None]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", None)

In [None]:
# Make scorer: wrap accuracy_score in a scorer object via make_scorer.
# NOTE(review): `accuracy` is not referenced again in the visible cells (the TPOT
# cells pass scoring='accuracy' as a string) — confirm it is still needed.
accuracy = make_scorer(accuracy_score)

In [None]:
# Load dataset: training set, test set and sample submission, in that order
csv_paths = (
    '../input/costa-rican-household-poverty-prediction/train.csv',
    '../input/costa-rican-household-poverty-prediction/test.csv',
    '../input/costa-rican-household-poverty-prediction/sample_submission.csv',
)
trainSet, testSet, submitSet = (pd.read_csv(csv_path) for csv_path in csv_paths)

# Preview the first rows of the training set
trainSet.head()

The task is to predict which poverty class each household is in. There are 4 classes of poverty level: 1 = extreme poverty, 2 = moderate poverty, 3 = vulnerable households, and 4 = non-vulnerable households.

In [None]:
# Columns excluded from modelling (described by the author as lacking data)
dropped_columns = ['Id','idhogar','rez_esc', 'v18q1', 'v2a1', 'dependency', 'edjefe', 'edjefa']

# Remove those columns, then discard every row that still contains a missing value
train = trainSet.drop(columns=dropped_columns).dropna(axis=0)

# Report the remaining size and preview the cleaned frame
print(train.shape)
train.head()

In [None]:
# Feature columns kept for modelling (chosen for high importance; the selection
# process itself lives in a separate notebook)
selected = [
    'tipovivi5', 'hogar_mayor', 'abastaguano', 'epared2',
    'area1', 'tipovivi1', 'elimbasu2', 'refrig',
    'mobilephone', 'energcocinar2', 'pisocemento', 'pareddes',
    'elimbasu1', 'etecho2', 'lugar4', 'paredmad',
    'paredzinc', 'lugar1', 'tamviv', 'lugar3',
    'television', 'rooms', 'epared1', 'tipovivi3',
    'etecho1', 'SQBedjefe', 'epared3', 'parentesco9',
    'bedrooms', 'r4m2', 'overcrowding', 'sanitario5',
    'paredzocalo', 'eviv1', 'paredpreb', 'etecho3',
    'abastaguadentro', 'r4m3', 'techozinc', 'pisomadera',
    'sanitario3', 'eviv2', 'tamhog', 'v14a',
    'r4t2', 'public', 'lugar5', 'elimbasu3',
    'r4m1', 'hacdor', 'r4t3', 'energcocinar4',
    'r4h1', 'sanitario2', 'hogar_adul', 'r4h2',
    'cielorazo', 'qmobilephone', 'tipovivi4', 'pisomoscer',
    'meaneduc', 'tipovivi2', 'paredblolad', 'computer',
    'r4t1', 'pisonotiene', 'SQBdependency', 'eviv3',
    'hacapo', 'hogar_nin', 'v18q',
]

To find the process of feature selection, please visit this notebook https://www.kaggle.com/rendyk/multi-classclassification-accuracy-povertylevel

That notebook demonstrates multi-class classification using conventional Machine Learning algorithms on the same dataset.

In [None]:
# Train/validation split: hold out 20% of the rows, stratified on the target
X_all = train[selected]
y_all = train['Target']
X_train, X_val, y_train, y_val = train_test_split(
    X_all, y_all, test_size=0.2, random_state=123, stratify=y_all
)

# 1. Auto-Sklearn

In [None]:
# Install auto-sklearn: system build tools first, then each entry of the upstream
# requirements.txt (one pip call per line), then the package itself.
# NOTE(review): no versions are pinned and requirements are read from the master
# branch, so a re-run may install different versions.
!apt install -y build-essential swig curl
!curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install
!pip install auto-sklearn

In [None]:
from autosklearn.classification import AutoSklearnClassifier

In [None]:
# Create the model: overall budget of 3*60 and a per-run limit of 15
# (time limits as defined by auto-sklearn), with n_jobs=-1
sklearn = AutoSklearnClassifier(
    time_left_for_this_task=3*60,
    per_run_time_limit=15,
    n_jobs=-1,
)

# Run the search on the training split
sklearn.fit(X_train, y_train)

# Print auto-sklearn's summary of the search
print(sklearn.sprint_statistics())

# Score the fitted model on the held-out validation split
pred_sklearn = sklearn.predict(X_val)
acc_sklearn = accuracy_score(y_val, pred_sklearn)
print('Accuracy: ' + str(acc_sklearn))

In [None]:
# Prediction results for auto-sklearn on the validation split.
# The class labels are passed to confusion_matrix explicitly so the matrix is
# always 4x4 and its rows/columns match the index/columns given to the DataFrame.
class_labels = [1, 2, 3, 4]
print('Confusion Matrix')
print(pd.DataFrame(confusion_matrix(y_val, pred_sklearn, labels=class_labels),
                   index=class_labels, columns=class_labels))
print('')
print('Classification Report')
print(classification_report(y_val, pred_sklearn))

In [None]:
# Show the models: print whatever auto-sklearn's show_models() returns for the
# fitted `sklearn` estimator created in the cell above
print(sklearn.show_models())

# 2. Tree-based Pipeline Optimization Tool (TPOT)

In [None]:
from tpot import TPOTClassifier

In [None]:
# First TPOT search (larger budget: 8 generations, population of 50).
# This run was stopped early; TPOT still gives the best pipeline found so far.
tpot_settings = dict(generations=8, population_size=50, scoring='accuracy',
                     verbosity=2, random_state=123, n_jobs=-1)

# Repeated stratified 3-fold cross-validation (3 repeats) scores each candidate
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=123)
tpot = TPOTClassifier(cv=cv, **tpot_settings)

# Fit the training data
tpot.fit(X_train, y_train)

# Export the result
tpot.export('tpot_model.py')

The above cell takes too long to finish. It was therefore stopped early, and it shows the best pipeline found up to that point.

In [None]:
# Second TPOT search with a much smaller budget (5 generations, population of 5)
# so that it runs to completion
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=123)
tpot = TPOTClassifier(
    generations=5,
    population_size=5,
    cv=cv,
    scoring='accuracy',
    verbosity=2,
    random_state=123,
    n_jobs=-1,
)

# Run the search on the training split
tpot.fit(X_train, y_train)

# Write the best pipeline found to a standalone Python script
tpot.export('tpot_model.py')

Below is the exported tpot_model.py with minor adjustments.

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# Adjustment to the exported script: instead of loading a CSV and splitting it,
# reuse the notebook's existing train/validation split.
training_features = X_train
testing_features = X_val
training_target= y_train
testing_target = y_val

# Average CV score on the training set was: 0.8822579941543428
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.5, min_samples_leaf=13, min_samples_split=13, n_estimators=100)),
    RandomForestClassifier(bootstrap=True, criterion="gini", max_features=0.8500000000000001, min_samples_leaf=4, min_samples_split=12, n_estimators=100)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 123)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

# Report this pipeline's validation accuracy here: `results` is reassigned by the
# following exported-pipeline cell, so otherwise these predictions would be
# discarded without ever being evaluated.
print('Accuracy: ' + str(accuracy_score(testing_target, results)))

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# The exported script's CSV-loading template is not used; the notebook's own
# train/validation split is plugged in instead.
training_features, training_target = X_train, y_train
testing_features, testing_target = X_val, y_val

# Average CV score on the training set was: 0.859616978580465
# Step 1: gradient boosting wrapped in TPOT's StackingEstimator
stacked_boosting = StackingEstimator(
    estimator=GradientBoostingClassifier(
        learning_rate=0.001,
        max_depth=2,
        max_features=0.7000000000000001,
        min_samples_leaf=1,
        min_samples_split=19,
        n_estimators=100,
        subsample=0.15000000000000002,
    )
)
# Step 2: the final random forest classifier
final_forest = RandomForestClassifier(
    bootstrap=True,
    criterion="gini",
    max_features=0.8500000000000001,
    min_samples_leaf=4,
    min_samples_split=12,
    n_estimators=100,
)
exported_pipeline = make_pipeline(stacked_boosting, final_forest)

# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 123)

# Train on the training split and predict the validation split
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)


In [None]:
# Evaluate the predictions currently held in `results` (the most recently run
# exported TPOT pipeline) on the validation split
pred_tpot = results

# Compute the accuracy
print('Accuracy: ' + str(accuracy_score(y_val, pred_tpot)))
print('')

# Prediction results.
# The class labels are passed to confusion_matrix explicitly so the matrix is
# always 4x4 and its rows/columns match the index/columns given to the DataFrame.
class_labels = [1, 2, 3, 4]
print('Confusion Matrix')
print(pd.DataFrame(confusion_matrix(y_val, pred_tpot, labels=class_labels),
                   index=class_labels, columns=class_labels))
print('')
print('Classification Report')
print(classification_report(y_val, pred_tpot))

# 3. Hyperopt

In [None]:
# Install hyperopt-sklearn straight from its GitHub repository.
# NOTE(review): no commit or tag is pinned, so a re-run may install a different version.
!pip install git+https://github.com/hyperopt/hyperopt-sklearn.git

In [None]:
from hpsklearn import HyperoptEstimator
from hpsklearn import any_classifier
from hpsklearn import any_preprocessing
from hyperopt import tpe

In [None]:
# Convert the train/validation features and targets into NumPy arrays
X_train_ar, X_val_ar, y_train_ar, y_val_ar = (
    np.array(part) for part in (X_train, X_val, y_train, y_val)
)

In [None]:
# Create the model: search over any classifier and any preprocessing step with
# the TPE algorithm, capped at 50 evaluations with a per-trial timeout of 30
hyperopt = HyperoptEstimator(
    classifier=any_classifier('cla'),
    preprocessing=any_preprocessing('pre'),
    algo=tpe.suggest,
    max_evals=50,
    trial_timeout=30,
)

# Run the search on the NumPy versions of the training split
hyperopt.fit(X_train_ar, y_train_ar)

Hyperopt AutoML stopped early. I checked online discussions and found that many people have encountered the same problem. However, we can still retrieve the intermediate result, although it is not the final one.

In [None]:
# Predict the validation data.
# The estimator was fitted on NumPy arrays (X_train_ar), so predict on the
# matching array version of the validation features rather than the DataFrame.
pred_hyperopt = hyperopt.predict(X_val_ar)

# Compute the accuracy
print('Accuracy: ' + str(accuracy_score(y_val, pred_hyperopt)))
print('')

# Prediction results.
# The class labels are passed to confusion_matrix explicitly so the matrix is
# always 4x4 and its rows/columns match the index/columns given to the DataFrame.
class_labels = [1, 2, 3, 4]
print('Confusion Matrix')
print(pd.DataFrame(confusion_matrix(y_val, pred_hyperopt, labels=class_labels),
                   index=class_labels, columns=class_labels))
print('')
print('Classification Report')
print(classification_report(y_val, pred_hyperopt))

In [None]:
# Show the models: print what best_model() returns for the fitted `hyperopt`
# estimator (an intermediate result, since the search stopped early)
print(hyperopt.best_model())

#  4. AutoKeras

In [None]:
# Install AutoKeras.
# NOTE(review): the version is not pinned, so a re-run may install a different release.
!pip install autokeras

In [None]:
import autokeras

In [None]:
# Create the model: try up to 8 candidate architectures
keras = autokeras.StructuredDataClassifier(max_trials=8)

# Fit the training dataset
keras.fit(X_train, y_train, epochs=100)

# Predict the validation data (raw AutoKeras output, kept unchanged in `pred_keras`)
pred_keras = keras.predict(X_val)

# Compute the accuracy.
# The raw prediction output has to be flattened to one dimension and converted
# to a numeric type before it can be compared with the integer labels in y_val.
pred_keras_numeric = pd.to_numeric(pd.Series(np.ravel(pred_keras)))
print('Accuracy: ' + str(accuracy_score(y_val, pred_keras_numeric)))

In [None]:
# Convert predicted result into pandas series with numeric type:
# take the first (only) column of the prediction array and parse it as numbers
pred_keras_ = pd.DataFrame(pred_keras)
pred_keras_ = pred_keras_[0]
pred_keras_ = pd.to_numeric(pred_keras_)

# Compute the accuracy
print('Accuracy: ' + str(accuracy_score(y_val, pred_keras_)))
print('')

# Prediction results.
# The class labels are passed to confusion_matrix explicitly so the matrix is
# always 4x4 and its rows/columns match the index/columns given to the DataFrame.
class_labels = [1, 2, 3, 4]
print('Confusion Matrix')
print(pd.DataFrame(confusion_matrix(y_val, pred_keras_, labels=class_labels),
                   index=class_labels, columns=class_labels))
print('')
print('Classification Report')
print(classification_report(y_val, pred_keras_))

In [None]:
# Show the built models: export the model found by the AutoKeras search and
# print its summary
keras_export = keras.export_model()
keras_export.summary() # Scroll to the end of the warnings to find the neural network summary

# 5. MLJAR

In [None]:
# Install (or upgrade to) mljar-supervised from the master branch of its GitHub repository.
# NOTE(review): tracking master means a re-run may install different code.
!pip install -q -U git+https://github.com/mljar/mljar-supervised.git@master

In [None]:
from supervised.automl import AutoML

In [None]:
# Create the model: "Compete" mode, optimising accuracy, with a total time limit
# of 300 and feature selection enabled
mljar_settings = {
    "mode": "Compete",
    "eval_metric": "accuracy",
    "total_time_limit": 300,
    "features_selection": True,
}
mljar = AutoML(**mljar_settings)

# Run the search on the training split
mljar.fit(X_train, y_train)

In [None]:
# Predict the validation data with the fitted MLJAR model.
# (The original cell called the auto-sklearn model `sklearn` here, so the MLJAR
# section silently reported auto-sklearn's results.)
pred_mljar = mljar.predict(X_val)

# Compute the accuracy
print('Accuracy: ' + str(accuracy_score(y_val, pred_mljar)))

In [None]:
# Prediction results for the `pred_mljar` predictions on the validation split.
# The class labels are passed to confusion_matrix explicitly so the matrix is
# always 4x4 and its rows/columns match the index/columns given to the DataFrame.
class_labels = [1, 2, 3, 4]
print('Confusion Matrix')
print(pd.DataFrame(confusion_matrix(y_val, pred_mljar, labels=class_labels),
                   index=class_labels, columns=class_labels))
print('')
print('Classification Report')
print(classification_report(y_val, pred_mljar))

In [None]:
# Show the model results: display MLJAR's report for the fitted `mljar` AutoML object
mljar.report()

# 6. AutoGluon

In [None]:
# Install AutoGluon: upgrade pip and the packaging tools first, then install
# mxnet (constrained below 2.0.0) and finally autogluon itself.
# NOTE(review): autogluon is not version-pinned, so a re-run may install a different release.
!pip install -U pip
!pip install -U setuptools wheel
!pip install -U "mxnet<2.0.0"
!pip install autogluon  

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
# Prepare the data: AutoGluon expects features and label in a single table.
# The target is attached BEFORE the index is reset. X_* and y_* share the same
# (shuffled) index from train_test_split, so pandas aligns them row by row.
# Assigning the target after reset_index would align it against the new 0..n-1
# index and produce wrong or missing labels.
Xy_train = X_train.assign(Target=y_train).reset_index(drop=True)
Xy_val = X_val.assign(Target=y_val).reset_index(drop=True)

X_train_gluon = TabularDataset(Xy_train)
X_val_gluon = TabularDataset(Xy_val)

# Fit the training data, with 'Target' as the label column and a time limit of 120
gluon = TabularPredictor(label='Target').fit(X_train_gluon, time_limit=120)

In [None]:
# Predict the validation data with the fitted AutoGluon predictor
gluon_pred = gluon.predict(X_val)

# Compute the accuracy on the validation split
gluon_accuracy = accuracy_score(y_val, gluon_pred)
print('Accuracy: ' + str(gluon_accuracy))

In [None]:
# Prediction results for AutoGluon on the validation split.
# The class labels are passed to confusion_matrix explicitly so the matrix is
# always 4x4 and its rows/columns match the index/columns given to the DataFrame.
class_labels = [1, 2, 3, 4]
print('Confusion Matrix')
print(pd.DataFrame(confusion_matrix(y_val, gluon_pred, labels=class_labels),
                   index=class_labels, columns=class_labels))
print('')
print('Classification Report')
print(classification_report(y_val, gluon_pred))

In [None]:
# Show the models: build AutoGluon's leaderboard of the trained models.
# NOTE(review): the frame passed here is the TRAINING data (X_train_gluon), so any
# scores computed on it are presumably optimistic — consider X_val_gluon; confirm intent.
leaderboard = gluon.leaderboard(X_train_gluon)

In [None]:
# Display the AutoGluon leaderboard computed in the previous cell
leaderboard

# 7. H2O

In [None]:
import h2o
from h2o.automl import H2OAutoML

In [None]:
# Initialise H2O before any H2O frames or models are created
h2o.init()

In [None]:
# Convert to H2O frames.
# The training frame is built directly from the train split instead of reusing
# `Xy_train` from the AutoGluon section, so this section runs on its own.
# X_train and y_train share the same index, so assign() pairs each row with its label.
Xy_train_h2o = h2o.H2OFrame(X_train.assign(Target=y_train))
X_val_h2o = h2o.H2OFrame(X_val)

In [None]:
# Convert the target column to a factor (categorical) so that H2O treats the
# task as classification rather than regression on the numeric labels
Xy_train_h2o['Target'] = Xy_train_h2o['Target'].asfactor()

In [None]:
# Create the model: H2O AutoML with a runtime limit of 120 seconds and a fixed seed
h2o_model = H2OAutoML(max_runtime_secs=120, seed=123)

# Fit the model.
# The predictor list contains every column except the response, so 'Target' is
# never offered to the models as an input feature.
predictors = [col for col in Xy_train_h2o.columns if col != 'Target']
h2o_model.train(x=predictors, y='Target', training_frame=Xy_train_h2o)

In [None]:
# Predict the validation data (X_val_h2o) with the trained H2O AutoML model and
# display the resulting prediction frame
h2o_pred = h2o_model.predict(X_val_h2o)
h2o_pred

In [None]:
# Convert the 'predict' column of the H2O prediction frame back to a pandas Series
h2o_pred_ = h2o.as_list(h2o_pred['predict'])
h2o_pred_ = h2o_pred_['predict']

# Compute the accuracy
print('Accuracy: ' + str(accuracy_score(y_val, h2o_pred_)))
print('')

# Prediction results.
# The class labels are passed to confusion_matrix explicitly so the matrix is
# always 4x4 and its rows/columns match the index/columns given to the DataFrame.
class_labels = [1, 2, 3, 4]
print('Confusion Matrix')
print(pd.DataFrame(confusion_matrix(y_val, h2o_pred_, labels=class_labels),
                   index=class_labels, columns=class_labels))
print('')
print('Classification Report')
print(classification_report(y_val, h2o_pred_))

In [None]:
# Show the model results: fetch the AutoML leaderboard for `h2o_model`,
# requesting all extra columns, and display it
leaderboard_h2o = h2o.automl.get_leaderboard(h2o_model, extra_columns = 'ALL')
leaderboard_h2o