# Training a model on a tabular dataset with callbacks
An example of training a model on a tabular dataset in fast.ai, using callbacks (early stopping and model saving) during training

The example shown here is adapted from the paper by Howard and Gugger https://arxiv.org/pdf/2002.04688.pdf

# Prepare the notebook and ingest the dataset


In [17]:
# imports for notebook boilerplate
!pip install -Uqq fastbook
import fastbook
from fastbook import *
from fastai.tabular.all import *


In [18]:
# set up the notebook for fast.ai
# NOTE(review): setup_book() is a fastbook helper; what it configures likely depends on the
# hosting environment — confirm against the fastbook source before running somewhere new
fastbook.setup_book()

In [19]:
# ingest the curated tabular dataset ADULT_SAMPLE
# untar_data returns the local dataset directory, which the following cells list and read from
path = untar_data(URLs.ADULT_SAMPLE)

In [20]:
# examine the directory structure: list the entries directly under the dataset directory
path.ls()

(#3) [Path('/storage/data/adult_sample/export.pkl'),Path('/storage/data/adult_sample/adult.csv'),Path('/storage/data/adult_sample/models')]

In [21]:
# ingest the dataset into a Pandas dataframe
# (adult.csv is one of the files found in the dataset directory)
df = pd.read_csv(path/'adult.csv')

In [22]:
# examine the first few records in the dataframe (head() with no argument shows 5 rows)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [23]:
# get the dimensions of the dataset as (number of rows, number of columns)
df.shape

(32561, 15)

In [24]:
# get the count of unique values in each column of the dataset
# (a quick way to spot low-cardinality columns that are candidates for categorical treatment)
df.nunique()

age                  73
workclass             9
fnlwgt            21648
education            16
education-num        16
marital-status        7
occupation           15
relationship          6
race                  5
sex                   2
capital-gain        119
capital-loss         92
hours-per-week       94
native-country       42
salary                2
dtype: int64

In [25]:
# per-column tally of missing values: flag every missing entry, then sum the flags column by column
df.isna().sum()

age                 0
workclass           0
fnlwgt              0
education           0
education-num     487
marital-status      0
occupation        512
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country      0
salary              0
dtype: int64

In [26]:
# get the subset of the dataset where age <= 40 using a boolean mask
# NOTE(review): df_young is only displayed here; none of the later cells shown use it
df_young = df[df.age <= 40]
df_young.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
5,20,Private,63210,HS-grad,9.0,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,15,United-States,<50k
7,37,Private,138940,11th,7.0,Married-civ-spouse,,Husband,White,Male,0,0,40,United-States,<50k
9,36,Self-emp-inc,216711,HS-grad,,Married-civ-spouse,,Husband,White,Male,99999,0,50,?,>=50k


# Define transforms, dependent variable, continuous and categorical columns
In this section we define the transforms that will be applied to the dataset, along with the dependent variable (target) and the continuous and categorical columns.

In [27]:
# dependent variable (y value) that the model will predict
dep_var = 'salary'
# preprocessing transforms to apply to the tabular dataset
# NOTE(review): there is no Normalize proc, so continuous columns are fed in unscaled — confirm this is intended
procs = [FillMissing, Categorify]
# split the remaining columns into continuous / categorical lists;
# max_card=1 is the cardinality threshold handed to cont_cat_split
cont, cat = cont_cat_split(df, max_card=1, dep_var=dep_var)

# Define TabularDataLoaders object

In [28]:
# build the TabularDataLoaders object from the dataframe
# valid_idx: the row indices to use for the validation set (rows 1024..1259, i.e. 236 rows)
# bs: the batch size
dls = TabularDataLoaders.from_df(
    df,
    path=path,
    procs=procs,
    cat_names=cat,
    cont_names=cont,
    y_names=dep_var,
    valid_idx=list(range(1024, 1260)),
    bs=64,
)
                               

In [29]:
# use show_batch() to see a sample batch including x and y values
# (in the displayed table the categorical columns come first, then the continuous columns, then the target)
dls.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,education-num_na,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary
0,Private,11th,Separated,Farming-fishing,Other-relative,White,Male,Puerto-Rico,False,27.0,202206.0,7.0,0.0,0.0,40.0,<50k
1,Private,HS-grad,Married-civ-spouse,Sales,Husband,White,Male,United-States,False,29.0,81648.0,9.0,0.0,1887.0,55.0,>=50k
2,State-gov,12th,Married-civ-spouse,Other-service,Wife,White,Female,United-States,False,31.0,29152.0,8.0,0.0,0.0,40.0,<50k
3,Private,HS-grad,Married-civ-spouse,#na#,Husband,White,Male,Ecuador,True,29.0,253801.0,10.0,0.0,0.0,40.0,<50k
4,Private,Some-college,Married-civ-spouse,Sales,Husband,White,Male,United-States,False,60.0,56248.0,10.0,0.0,0.0,40.0,<50k
5,Local-gov,Assoc-acdm,Never-married,Adm-clerical,Not-in-family,White,Female,United-States,False,28.0,304960.0,12.0,0.0,1980.0,40.0,<50k
6,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States,False,40.0,198096.0,9.0,0.0,0.0,40.0,>=50k
7,Private,HS-grad,Separated,#na#,Not-in-family,Black,Female,United-States,True,36.0,358373.0,10.0,0.0,0.0,36.0,<50k
8,Private,HS-grad,Divorced,Adm-clerical,Unmarried,White,Female,United-States,False,28.0,133043.0,9.0,0.0,0.0,50.0,<50k
9,Self-emp-inc,Prof-school,Married-civ-spouse,Exec-managerial,Husband,White,Male,?,False,55.0,183869.0,15.0,0.0,0.0,50.0,>=50k


# Define and train the model with no callbacks

In [30]:
%%time
learn = tabular_learner(dls,layers=[200,100], metrics=accuracy)
learn.fit_one_cycle(10)

epoch,train_loss,valid_loss,accuracy,time
0,0.341347,0.36061,0.822034,00:10
1,0.334492,0.369722,0.809322,00:10
2,0.331509,0.331108,0.838983,00:10
3,0.312824,0.337519,0.838983,00:10
4,0.312502,0.347008,0.84322,00:10
5,0.303788,0.31989,0.851695,00:10
6,0.300724,0.315468,0.851695,00:10
7,0.305295,0.322067,0.838983,00:10
8,0.295115,0.31858,0.84322,00:10
9,0.29431,0.314723,0.847458,00:10


CPU times: user 1min 42s, sys: 626 ms, total: 1min 43s
Wall time: 1min 43s


In [31]:
# evaluate on the validation set; the result is [valid_loss, accuracy] and matches the final training epoch
learn.validate()

(#2) [0.3147226572036743,0.8474576473236084]

# Define and train the model with an early stopping callback

In [32]:
%%time
learn_es = tabular_learner(dls,layers=[200,100], metrics=accuracy)
learn_es.fit_one_cycle(10,cbs=EarlyStoppingCallback(monitor='accuracy', min_delta=0.01, patience=3))

epoch,train_loss,valid_loss,accuracy,time
0,0.338768,0.356871,0.838983,00:10
1,0.330822,0.346476,0.830508,00:10
2,0.331357,0.350572,0.817797,00:10


No improvement since epoch 0: early stopping
CPU times: user 31.4 s, sys: 149 ms, total: 31.6 s
Wall time: 31.7 s


In [33]:
# evaluate on the validation set
# note: the result matches the last epoch that ran (epoch 2), not the best epoch (epoch 0) —
# early stopping on its own leaves the model with the weights from the final epoch
learn_es.validate()

(#2) [0.3505721986293793,0.8177965879440308]

# Define and train the model with early stopping and save model callbacks

In [44]:
%%time
learn_es_sm = tabular_learner(dls,layers=[200,100], metrics=accuracy)
keep_path = learn_es_sm.path
# set the model path to a writeable directory
learn_es_sm.path = Path('/notebooks/temp/models')
learn_es_sm.fit_one_cycle(10,cbs=[EarlyStoppingCallback(monitor='accuracy', min_delta=0.01, patience=3),SaveModelCallback(monitor='accuracy', min_delta=0.01)])
# reset the model path
learn_es_sm.path = keep_path

epoch,train_loss,valid_loss,accuracy,time
0,0.329477,0.34046,0.84322,00:10
1,0.32646,0.3427,0.838983,00:10
2,0.325995,0.344461,0.830508,00:10
3,0.305585,0.329569,0.860169,00:10
4,0.313667,0.336395,0.847458,00:10
5,0.311784,0.34137,0.838983,00:10
6,0.306178,0.332439,0.830508,00:10


Better model found at epoch 0 with accuracy value: 0.8432203531265259.
Better model found at epoch 3 with accuracy value: 0.8601694703102112.
No improvement since epoch 3: early stopping
CPU times: user 1min 15s, sys: 478 ms, total: 1min 15s
Wall time: 1min 15s


In [45]:
# evaluate on the validation set
# the result matches the best epoch (epoch 3) rather than the last one that ran (epoch 6),
# which is consistent with SaveModelCallback reloading its best checkpoint when training ends
learn_es_sm.validate()

(#2) [0.3295692801475525,0.8601694703102112]

# Examine the structure of the trained models

Use the summary() function to see the structure of the trained model, including:

- the layers that make up the model
- total parameters
- loss function
- optimizer function
- callbacks



In [34]:
# summarize the model that was trained without callbacks
learn.summary()

epoch,train_loss,valid_loss,accuracy,time
0,,,00:00,


TabularModel (Input shape: ['64 x 9', '64 x 6'])
Layer (type)         Output Shape         Param #    Trainable 
Embedding            64 x 6               60         True      
________________________________________________________________
Embedding            64 x 8               136        True      
________________________________________________________________
Embedding            64 x 5               40         True      
________________________________________________________________
Embedding            64 x 8               128        True      
________________________________________________________________
Embedding            64 x 5               35         True      
________________________________________________________________
Embedding            64 x 4               24         True      
________________________________________________________________
Embedding            64 x 3               9          True      
_________________________________________________

In [35]:
# summarize the model that was trained with the early stopping callback
learn_es.summary()

epoch,train_loss,valid_loss,accuracy,time
0,,,00:00,


TabularModel (Input shape: ['64 x 9', '64 x 6'])
Layer (type)         Output Shape         Param #    Trainable 
Embedding            64 x 6               60         True      
________________________________________________________________
Embedding            64 x 8               136        True      
________________________________________________________________
Embedding            64 x 5               40         True      
________________________________________________________________
Embedding            64 x 8               128        True      
________________________________________________________________
Embedding            64 x 5               35         True      
________________________________________________________________
Embedding            64 x 4               24         True      
________________________________________________________________
Embedding            64 x 3               9          True      
_________________________________________________

In [46]:
# summarize the model that was trained with the early stopping and save model callbacks
learn_es_sm.summary()

epoch,train_loss,valid_loss,accuracy,time
0,,,00:00,


TabularModel (Input shape: ['64 x 9', '64 x 6'])
Layer (type)         Output Shape         Param #    Trainable 
Embedding            64 x 6               60         True      
________________________________________________________________
Embedding            64 x 8               136        True      
________________________________________________________________
Embedding            64 x 5               40         True      
________________________________________________________________
Embedding            64 x 8               128        True      
________________________________________________________________
Embedding            64 x 5               35         True      
________________________________________________________________
Embedding            64 x 4               24         True      
________________________________________________________________
Embedding            64 x 3               9          True      
_________________________________________________