 # 🤖 Automated Machine Learning-Classification
  
This notebook demonstrates Automated Machine Learning (AutoML) libraries on a multi-class classification task. Data preparation is kept simple because the AutoML libraries perform the pre-processing automatically, and then build the Machine Learning models and tune their hyperparameters. The objective of this notebook is to serve as a cheat sheet.

In [None]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", None)

In [None]:
# Make scorer: wrap accuracy_score with make_scorer and keep the resulting
# scorer object under the name `accuracy`
accuracy = make_scorer(accuracy_score)

In [None]:
# Locations of the competition files
train_csv = '../input/costa-rican-household-poverty-prediction/train.csv'
test_csv = '../input/costa-rican-household-poverty-prediction/test.csv'
submit_csv = '../input/costa-rican-household-poverty-prediction/sample_submission.csv'

# Read the training data, the test data and the sample submission
trainSet = pd.read_csv(train_csv)
testSet = pd.read_csv(test_csv)
submitSet = pd.read_csv(submit_csv)

# Preview the first rows of the training data
trainSet.head()

The task is to predict which poverty class each household is in. There are 4 classes of poverty level: 1 = extreme poverty, 2 = moderate poverty, 3 = vulnerable households, and 4 = non-vulnerable households.

In [None]:
# Columns that are left out of the training table
columns_to_drop = ['Id', 'idhogar', 'rez_esc', 'v18q1', 'v2a1', 'dependency', 'edjefe', 'edjefa']

# Remove those columns, then discard every row that still contains a missing value
train = trainSet.drop(columns=columns_to_drop).dropna(axis=0)

print(train.shape)
train.head()

In [None]:
# Features kept for modelling (the high-importance features), in their original order
selected = [
    "tipovivi5", "hogar_mayor", "abastaguano", "epared2",
    "area1", "tipovivi1", "elimbasu2", "refrig",
    "mobilephone", "energcocinar2", "pisocemento", "pareddes",
    "elimbasu1", "etecho2", "lugar4", "paredmad",
    "paredzinc", "lugar1", "tamviv", "lugar3",
    "television", "rooms", "epared1", "tipovivi3",
    "etecho1", "SQBedjefe", "epared3", "parentesco9",
    "bedrooms", "r4m2", "overcrowding", "sanitario5",
    "paredzocalo", "eviv1", "paredpreb", "etecho3",
    "abastaguadentro", "r4m3", "techozinc", "pisomadera",
    "sanitario3", "eviv2", "tamhog", "v14a",
    "r4t2", "public", "lugar5", "elimbasu3",
    "r4m1", "hacdor", "r4t3", "energcocinar4",
    "r4h1", "sanitario2", "hogar_adul", "r4h2",
    "cielorazo", "qmobilephone", "tipovivi4", "pisomoscer",
    "meaneduc", "tipovivi2", "paredblolad", "computer",
    "r4t1", "pisonotiene", "SQBdependency", "eviv3",
    "hacapo", "hogar_nin", "v18q",
]

For the feature selection process, please see this notebook: https://www.kaggle.com/rendyk/multi-classclassification-accuracy-povertylevel

That notebook demonstrates classification using conventional Machine Learning algorithms on the same dataset.

In [None]:
# Train/validation split: 20% of the rows are held out, stratified on the target
X_all = train[selected]
y_all = train['Target']
X_train, X_val, y_train, y_val = train_test_split(
    X_all, y_all,
    test_size=0.2,
    random_state=123,
    stratify=y_all,
)

Part 1 of this notebook can be found here: https://www.kaggle.com/rendyk/automl-for-classification

# 8. PyCaret

In [None]:
!pip install pycaret

In [None]:
from pycaret.classification import *

In [None]:
# Generate the row positions of a 20% hold-out, sampled without replacement.
# A seeded generator (same seed value as the other cells) makes the split
# reproducible across kernel restarts.
# NOTE(review): rows are sampled individually; if several rows belong to the same
# household (`idhogar`), one household can end up on both sides of the split —
# confirm whether a grouped split is wanted.
split_rng = np.random.default_rng(123)
n_rows = trainSet.shape[0]
val_index = split_rng.choice(n_rows, size=round(n_rows * 0.2), replace=False)

# Split trainSet by position: trainSet2 is the hold-out, trainSet1 the remainder.
# The positions are translated to index labels for drop(), so both halves are
# selected consistently even if the index is not 0..n-1.
trainSet1 = trainSet.drop(index=trainSet.index[val_index])
trainSet2 = trainSet.iloc[val_index, :]

In [None]:
# Create the model: configure the PyCaret experiment on the training part of the split.
# 'Id' and 'idhogar' are excluded from the predictors — presumably they are row and
# household identifiers (the manual feature table above drops them as well), and
# feeding identifiers to the models invites memorisation instead of learning.
caret = setup(data = trainSet1, target='Target', session_id=123,
              ignore_features = ['Id', 'idhogar'],
              numeric_imputation='mean',  categorical_imputation='constant',
              normalize = True, combine_rare_levels = True, rare_level_threshold = 0.05,
              remove_multicollinearity = True, multicollinearity_threshold = 0.95)

In [None]:
# Show the models: compare the available classifiers using 5 cross-validation
# folds; the value returned by compare_models is kept in caret_models
caret_models = compare_models(fold=5)

In [None]:
# Create the top 5 models, each with 5 cross-validation folds.
# The model IDs are passed to create_model in the same order as the names on the left.
et, rf, dt, xgboost, lightgbm = [
    create_model(model_id, fold=5)
    for model_id in ('et', 'rf', 'dt', 'xgboost', 'lightgbm')
]
# If each algorithm is created in its own cell, each output shows its own cross-validation result.
# Below is the cross-validation report of LightGBM, as it is the last model created.

In [None]:
# Tune the hyperparameters of a created model with 5 cross-validation folds;
# LightGBM is used as the example
lightgbm_tune = tune_model(lightgbm, fold=5)

In [None]:
# Show the hyperparameters of the tuned model, for example for LightGBM
plot_model(lightgbm_tune, plot='parameter')

In [None]:
# Bagging ensemble built from the tuned LightGBM, evaluated with 5 folds
# (the ensembling method is spelled out to mirror the boosting cell)
lightgbm_bagging = ensemble_model(lightgbm_tune, method='Bagging', fold=5)

In [None]:
# Boosting ensemble built from the tuned LightGBM, evaluated with 5 folds
lightgbm_boost = ensemble_model(lightgbm_tune, method='Boosting', fold=5)

In [None]:
# Return the top 5 models of the comparison.
# fold=5 is passed so this ranking uses the same cross-validation setting as
# every other PyCaret step in this notebook (otherwise the experiment's default
# number of folds is used and the ranking is not comparable with the table above).
caret_models_5 = compare_models(n_select=5, fold=5)

In [None]:
# Stacking the top 5 models, with Extra Trees (`et`) as the meta-model
stack = stack_models(caret_models_5, meta_model=et, fold=5)

In [None]:
# Blending models: the tuned LightGBM (lightgbm_tune), rf and dt are combined
caret_blend = blend_models(estimator_list=[lightgbm_tune,rf,dt])

In [None]:
# Predict the validation data (the hold-out part of the split, without its target)
y_true_caret = trainSet2['Target']
pred_caret = predict_model(caret_blend, data = trainSet2.drop(columns=['Target']))
pred_caret = pred_caret['Label']

# Compute the accuracy
print('Accuracy: ' + str(accuracy_score(y_true_caret, pred_caret)))
print('')

# Prediction results
# The class list is passed to confusion_matrix explicitly so the matrix is always
# 4x4 and its rows/columns really correspond to the 1..4 axis labels, even when a
# class is missing from both the true and the predicted values.
class_labels = [1, 2, 3, 4]
print('Confusion Matrix')
print(pd.DataFrame(confusion_matrix(y_true_caret, pred_caret, labels=class_labels),
                   index=class_labels, columns=class_labels))
print('')
print('Classification Report')
print(classification_report(y_true_caret, pred_caret))

# 9. AutoViML

In [None]:
!pip install autoviml
!pip install shap

In [None]:
from autoviml.Auto_ViML import Auto_ViML

In [None]:
# Create the model
# Auto_ViML is given the training part of the split, the name of the target column
# and the hold-out part without its target. Its four return values are unpacked
# into viml, features, train_v and test_v.
# NOTE(review): presumably viml is the fitted model, features the selected feature
# list, and train_v/test_v the transformed frames with predictions (test_v is read
# for 'Target_Ensembled_predictions' further down) — confirm against the Auto_ViML docs.
viml, features, train_v, test_v = Auto_ViML(trainSet1, 'Target', trainSet2.drop(columns=['Target']),
                                            scoring_parameter='balanced_accuracy', hyper_param='RS',
                                            feature_reduction=True, Boosting_Flag=True,
                                            Binning_Flag=False,Add_Poly=0, Stacking_Flag=False, 
                                            Imbalanced_Flag=True, verbose=1)

In [None]:
viml
# Displays the object returned by Auto_ViML.
# In the author's run, the model picked was an XGBClassifier.

In [None]:
# True classes of the hold-out rows and the ensembled predictions from Auto_ViML
y_true_viml = trainSet2['Target']
pred_viml = test_v['Target_Ensembled_predictions']

# Compute the accuracy
print('Accuracy: ' + str(accuracy_score(y_true_viml, pred_viml)))
print('')

# Prediction results
# The class list is passed to confusion_matrix explicitly so the matrix is always
# 4x4 and its rows/columns really correspond to the 1..4 axis labels, even when a
# class is missing from both the true and the predicted values.
class_labels = [1, 2, 3, 4]
print('Confusion Matrix')
print(pd.DataFrame(confusion_matrix(y_true_viml, pred_viml, labels=class_labels),
                   index=class_labels, columns=class_labels))
print('')
print('Classification Report')
print(classification_report(y_true_viml, pred_viml))

# 10. LightAutoML

In [None]:
!pip install -U https://github.com/sberbank-ai-lab/LightAutoML/raw/fix/logging/LightAutoML-0.2.16.2-py3-none-any.whl
!pip install openpyxl

In [None]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

In [None]:
# Re-attach the target column to the training features so that a single frame
# can be handed to LightAutoML
train_data = X_train.join(y_train)
train_data.head()

In [None]:
# Create the model: a multi-class task with timeout 60*3 (presumably seconds,
# i.e. a 3-minute budget — confirm) and a limit of 4 CPUs
multiclass_task = Task('multiclass')
light = TabularAutoML(task=multiclass_task, timeout=60*3, cpu_limit=4)

# Fit the training data; the 'Target' column is declared as the target role.
# The value returned by fit_predict is kept in train_light.
train_light = light.fit_predict(train_data, roles={'target': 'Target'})

# Predict the validation data
pred_light = light.predict(X_val)

In [None]:
# Convert the prediction result into a dataframe.
# The probability columns follow LightAutoML's internal class encoding, so the
# column labels are read from the fitted reader's {label: column position}
# mapping instead of being hard-coded (a hard-coded order silently mislabels the
# columns whenever the class encoding differs between runs).
class_mapping = light.reader.class_mapping
if class_mapping is None:
    # No mapping stored — presumably the labels already equal the column positions
    class_mapping = {position: position for position in range(pred_light.data.shape[1])}
proba_columns = [str(label) for label, _ in sorted(class_mapping.items(), key=lambda pair: pair[1])]

pred_light2 = pd.DataFrame(pred_light.data, columns=proba_columns)
# Reorder the probability columns by ascending class label
pred_light2 = pred_light2[sorted(proba_columns, key=int)]
# The predicted class is the label of the column with the highest probability
pred_light2['Pred'] = pred_light2.idxmax(axis=1).astype(int)
pred_light2.head()

In [None]:
# Compute the accuracy (reported the same way as for the other libraries)
print('Accuracy: ' + str(accuracy_score(y_val, pred_light2['Pred'])))
print('')

# Prediction results
# The class list is passed to confusion_matrix explicitly so the matrix is always
# 4x4 and its rows/columns really correspond to the 1..4 axis labels, even when a
# class is missing from both the true and the predicted values.
class_labels = [1, 2, 3, 4]
print('Confusion Matrix')
print(pd.DataFrame(confusion_matrix(y_val, pred_light2['Pred'], labels=class_labels),
                   index=class_labels, columns=class_labels))
print('')
print('Classification Report')
print(classification_report(y_val, pred_light2['Pred']))