## overview

In [1]:
# default package
import logging
import sys 
import os
import pathlib
import IPython
import random
from urllib.request import urlretrieve
import dataclasses as dc
import tempfile

In [2]:
# third party package
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.font_manager import FontProperties
import matplotlib
from autogluon.tabular import TabularDataset, TabularPredictor

In [3]:
# my package
sys.path.append(os.path.join(pathlib.Path().resolve(),"../"))

In [4]:
# reload settings
%load_ext autoreload
%autoreload 2

In [5]:
# logger
logger=logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [6]:
# graph setting
sns.set()
font_path = "/usr/share/fonts/truetype/migmix/migmix-1p-regular.ttf"
font_prop = FontProperties(fname=font_path)
matplotlib.rcParams["font.family"] = font_prop.get_name()

## load train data

In [7]:
train_data=TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')

In [8]:
train_data=train_data.sample(n=500,random_state=0)
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
6118,51,Private,39264,Some-college,10,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,>50K
23204,58,Private,51662,10th,6,Married-civ-spouse,Other-service,Wife,White,Female,0,0,8,United-States,<=50K
29590,40,Private,326310,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,44,United-States,<=50K
18116,37,Private,222450,HS-grad,9,Never-married,Sales,Not-in-family,White,Male,0,2339,40,El-Salvador,<=50K
33964,62,Private,109190,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024,0,40,United-States,>50K


## train

In [9]:
label = 'class'
save_path = '../models/AutogluonModels'  
predictor = TabularPredictor(label=label, path=save_path).fit(train_data)

Beginning AutoGluon training ...
AutoGluon will save models to "../models/AutogluonModels/"
AutoGluon Version:  0.2.0
Train Data Rows:    500
Train Data Columns: 14
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [' >50K', ' <=50K']
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 =  >50K, class 0 =  <=50K
	Note: For your binary classification, AutoGluon arbitrarily selected which label-value represents positive ( >50K) vs negative ( <=50K) class.
	To explicitly set the positive_class, either rename classes to 1 and 0, or specify positive_class in Predictor init.
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    9486.35 MB
	Tr

## load test data

In [10]:
test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')
y_test = test_data[label]  
test_data_nolab = test_data.drop(columns=[label])  
test_data_nolab.head()

Loaded data from: https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv | Columns = 15 / 15 | Rows = 9769 -> 9769


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,31,Private,169085,11th,7,Married-civ-spouse,Sales,Wife,White,Female,0,0,20,United-States
1,17,Self-emp-not-inc,226203,12th,8,Never-married,Sales,Own-child,White,Male,0,0,45,United-States
2,47,Private,54260,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1887,60,United-States
3,21,Private,176262,Some-college,10,Never-married,Exec-managerial,Own-child,White,Female,0,0,30,United-States
4,17,Private,241185,12th,8,Never-married,Prof-specialty,Own-child,White,Male,0,0,20,United-States


## predict

In [11]:
y_pred=predictor.predict(test_data_nolab)
pred_probs = predictor.predict_proba(test_data_nolab)

In [12]:
y_pred

0        <=50K
1        <=50K
2         >50K
3        <=50K
4        <=50K
         ...  
9764     <=50K
9765     <=50K
9766     <=50K
9767     <=50K
9768     <=50K
Name: class, Length: 9769, dtype: object

In [13]:
pred_probs.head(5)

Unnamed: 0,<=50K,>50K
0,0.949797,0.050203
1,0.945973,0.054027
2,0.433299,0.566701
3,0.991393,0.008607
4,0.949908,0.050092


In [14]:
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.8397993653393387
Evaluations on test data:
{
    "accuracy": 0.8397993653393387,
    "balanced_accuracy": 0.7437076677780596,
    "mcc": 0.5295565206264157,
    "f1": 0.6242496998799519,
    "precision": 0.7038440714672441,
    "recall": 0.5608283002588438
}


In [15]:
predictor.leaderboard(test_data,silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,XGBoost,0.842666,0.85,0.070455,0.005955,0.133345,0.070455,0.005955,0.133345,1,True,11
1,RandomForestGini,0.841335,0.84,0.210777,0.074643,1.000353,0.210777,0.074643,1.000353,1,True,5
2,RandomForestEntr,0.840721,0.83,0.226019,0.07276,0.988837,0.226019,0.07276,0.988837,1,True,6
3,LightGBM,0.839799,0.85,0.019053,0.011646,1.263745,0.019053,0.011646,1.263745,1,True,4
4,WeightedEnsemble_L2,0.839799,0.85,0.021561,0.012201,1.648108,0.002508,0.000555,0.384363,2,True,14
5,LightGBMXT,0.83939,0.83,0.021128,0.013353,0.300615,0.021128,0.013353,0.300615,1,True,3
6,CatBoost,0.837957,0.84,0.017886,0.010824,0.354236,0.017886,0.010824,0.354236,1,True,7
7,ExtraTreesEntr,0.834783,0.82,0.199178,0.073653,0.998703,0.199178,0.073653,0.998703,1,True,9
8,ExtraTreesGini,0.834476,0.82,0.216009,0.075349,0.995111,0.216009,0.075349,0.995111,1,True,8
9,LightGBMLarge,0.827823,0.83,0.022927,0.013216,1.211954,0.022927,0.013216,1.211954,1,True,13


In [16]:
predictor.problem_type

'binary'

In [17]:
predictor.feature_metadata

<autogluon.core.features.feature_metadata.FeatureMetadata at 0x7fc7c9236ee0>

## maximizing predictive performance

In [18]:
time_limit = 10
metric = 'roc_auc'  
predictor = TabularPredictor(label, eval_metric=metric,path="../models/AutogluonModels").fit(train_data, time_limit=time_limit, presets='best_quality')
predictor.leaderboard(test_data, silent=True)

Presets specified: ['best_quality']
Beginning AutoGluon training ... Time limit = 10s
AutoGluon will save models to "../models/AutogluonModels/"
AutoGluon Version:  0.2.0
Train Data Rows:    500
Train Data Columns: 14
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [' >50K', ' <=50K']
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 =  >50K, class 0 =  <=50K
	Note: For your binary classification, AutoGluon arbitrarily selected which label-value represents positive ( >50K) vs negative ( <=50K) class.
	To explicitly set the positive_class, either rename classes to 1 and 0, or specify positive_class in Predictor init.
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost_BAG_L1,0.902783,0.887489,0.067369,0.045379,2.636528,0.067369,0.045379,2.636528,1,True,7
1,LightGBMXT_BAG_L1,0.900161,0.88138,0.121851,0.055607,0.891131,0.121851,0.055607,0.891131,1,True,3
2,WeightedEnsemble_L2,0.898389,0.899442,0.855168,0.384333,7.455956,0.004354,0.001101,0.888509,2,True,10
3,LightGBM_BAG_L1,0.892347,0.866991,0.073432,0.071198,2.298749,0.073432,0.071198,2.298749,1,True,4
4,RandomForestEntr_BAG_L1,0.888119,0.886301,0.223592,0.099532,1.034022,0.223592,0.099532,1.034022,1,True,6
5,RandomForestGini_BAG_L1,0.886598,0.884698,0.225724,0.083766,0.963602,0.225724,0.083766,0.963602,1,True,5
6,ExtraTreesGini_BAG_L1,0.881065,0.892927,0.186173,0.090512,0.995592,0.186173,0.090512,0.995592,1,True,8
7,ExtraTreesEntr_BAG_L1,0.880851,0.893912,0.251829,0.092202,1.010175,0.251829,0.092202,1.010175,1,True,9
8,KNeighborsDist_BAG_L1,0.525998,0.536956,0.02043,0.008027,0.001714,0.02043,0.008027,0.001714,1,True,2
9,KNeighborsUnif_BAG_L1,0.51497,0.519604,0.019319,0.009029,0.002117,0.019319,0.009029,0.002117,1,True,1


## regression

In [19]:
age_column="age"
predictor_age=TabularPredictor(label=age_column,path="../models/AutogluonModels").fit(train_data)
performance = predictor_age.evaluate(test_data)

Beginning AutoGluon training ...
AutoGluon will save models to "../models/AutogluonModels/"
AutoGluon Version:  0.2.0
Train Data Rows:    500
Train Data Columns: 14
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == int and many unique label-values observed).
	Label info (max, min, mean, stddev): (85, 17, 39.652, 13.52393)
	If 'regression' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    9679.16 MB
	Train Data (Original)  Memory Usage: 0.32 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Ge

In [20]:
predictor_age.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-10.541613,-11.263276,8.330542,0.349694,22.961126,0.00446,0.000467,0.530138,2,True,12
1,CatBoost,-10.579617,-11.916622,0.025914,0.011271,0.398379,0.025914,0.011271,0.398379,1,True,6
2,ExtraTreesMSE,-10.691115,-11.480752,0.169004,0.068667,0.917422,0.169004,0.068667,0.917422,1,True,7
3,RandomForestMSE,-10.746518,-11.602848,0.154411,0.067874,0.801137,0.154411,0.067874,0.801137,1,True,5
4,LightGBMXT,-10.753344,-11.814712,0.048232,0.012587,0.190649,0.048232,0.012587,0.190649,1,True,3
5,LightGBM,-10.972156,-11.929546,0.02907,0.011835,0.170528,0.02907,0.011835,0.170528,1,True,4
6,XGBoost,-11.121008,-12.17427,0.078902,0.008004,2.059135,0.078902,0.008004,2.059135,1,True,9
7,LightGBMLarge,-11.598649,-12.167606,0.053989,0.012129,1.47336,0.053989,0.012129,1.47336,1,True,11
8,NeuralNetMXNet,-11.861902,-12.515734,7.82715,0.171145,18.288753,7.82715,0.171145,18.288753,1,True,10
9,KNeighborsUnif,-14.902058,-15.686937,0.019096,0.007803,0.002781,0.019096,0.007803,0.002781,1,True,1
