# Training a model with tabular datasets with callbacks
Example of training a model with a tabular dataset in fast.ai using callbacks

The example shown here is adapted from the paper by Howard and Gugger https://arxiv.org/pdf/2002.04688.pdf

# Prepare the notebook and ingest the dataset


In [1]:
# imports for notebook boilerplate
!pip install -Uqq fastbook
import fastbook
from fastbook import *
from fastai.tabular.all import *


In [2]:
# define path for saving models - update this path for your Gradient or Colab instance
# (this value is assigned to learn_es_sm.path before training with SaveModelCallback below,
# so it must point to a directory the notebook can write to)
model_path = '/notebooks/temp'

In [3]:
# adapted from code in https://github.com/fastai/fastai/issues/2832
def set_seed(dls, x=42):
    """Seed every random number generator used by the notebook.

    Args:
        dls: object exposing an ``rng`` attribute with a ``seed`` method
            (the DataLoaders keeps its own internal ``random.Random``, so it
            has to be seeded separately from the global ``random`` module).
        x: integer seed applied to all generators (defaults to 42).

    Side effects:
        Seeds ``random``, ``dls.rng``, NumPy and PyTorch, switches cuDNN to
        deterministic mode with benchmarking disabled, and seeds all CUDA
        devices when CUDA is available.
    """
    # Python-level generators: the global one and the one owned by dls
    random.seed(x)
    dls.rng.seed(x)

    # numeric libraries
    np.random.seed(x)
    torch.manual_seed(x)

    # ask cuDNN for deterministic behaviour and disable benchmarking
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(x)

In [4]:
# set up the notebook for fast.ai
fastbook.setup_book()

In [5]:
# ingest the curated tabular dataset ADULT_SAMPLE
path = untar_data(URLs.ADULT_SAMPLE)

In [6]:
# examine the directory structure
path.ls()

(#3) [Path('/storage/data/adult_sample/export.pkl'),Path('/storage/data/adult_sample/adult.csv'),Path('/storage/data/adult_sample/models')]

In [7]:
# ingest the dataset into a Pandas dataframe
df = pd.read_csv(path/'adult.csv')

In [8]:
# examine the first few records in the dataframe
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [9]:
# get the dimensions of the dataset as (number of records, number of columns)
df.shape

(32561, 15)

In [10]:
# get the count of unique values in each column of the dataset
df.nunique()

age                  73
workclass             9
fnlwgt            21648
education            16
education-num        16
marital-status        7
occupation           15
relationship          6
race                  5
sex                   2
capital-gain        119
capital-loss         92
hours-per-week       94
native-country       42
salary                2
dtype: int64

In [11]:
# count the number of missing values in each column of the dataset
df.isnull().sum()

age                 0
workclass           0
fnlwgt              0
education           0
education-num     487
marital-status      0
occupation        512
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country      0
salary              0
dtype: int64

In [12]:
# get the subset of the dataset where age <= 40 (boolean-mask row selection)
df_young = df.loc[df['age'] <= 40]
df_young.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
5,20,Private,63210,HS-grad,9.0,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,15,United-States,<50k
7,37,Private,138940,11th,7.0,Married-civ-spouse,,Husband,White,Male,0,0,40,United-States,<50k
9,36,Self-emp-inc,216711,HS-grad,,Married-civ-spouse,,Husband,White,Male,99999,0,50,?,>=50k


# Define transforms, dependent variable, continuous and categorical columns
In this section we define the transforms that will be applied to the dataset, along with the target (dependent variable), continuous, and categorical columns.

In [13]:
# define the dependent variable (y value)
dep_var = 'salary'
# define transforms to apply to the tabular dataset
# NOTE(review): there is no Normalize step in this list - confirm the continuous
# columns are meant to be passed to the model unscaled
procs = [FillMissing, Categorify]
# define columns that are continuous / categorical
# max_card=1: integer columns with more than one distinct value are treated as
# continuous instead of categorical
cont, cat = cont_cat_split(df, max_card=1, dep_var=dep_var)

# Define TabularDataLoaders object

In [14]:
# define TabularDataLoaders object
# valid_idx: the indices to use for the validation set
#            (rows 1024 through 1259 of df, i.e. 236 records; all other rows are used for training)
dls = TabularDataLoaders.from_df(
    df,
    path,
    procs=procs,
    cat_names=cat,
    cont_names=cont,
    y_names=dep_var,
    valid_idx=list(range(1024, 1260)),
    bs=64,
)

                               

In [15]:
# use show_batch() to see a sample batch including x and y values
dls.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,education-num_na,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary
0,Private,HS-grad,Never-married,Handlers-cleaners,Own-child,White,Male,Portugal,False,22.0,162667.0,9.0,0.0,0.0,50.0,<50k
1,Local-gov,HS-grad,Divorced,Protective-serv,Unmarried,White,Female,United-States,False,43.0,186995.0,9.0,0.0,0.0,40.0,<50k
2,Self-emp-not-inc,Some-college,Never-married,Craft-repair,Not-in-family,White,Female,United-States,False,18.0,42857.0,10.0,0.0,0.0,35.0,<50k
3,Private,HS-grad,Never-married,Other-service,Not-in-family,White,Male,United-States,False,40.0,209040.0,9.0,0.0,0.0,40.0,<50k
4,Private,Bachelors,Divorced,Adm-clerical,Not-in-family,White,Female,United-States,False,50.0,77905.0,13.0,0.0,0.0,8.0,<50k
5,?,Prof-school,Married-civ-spouse,?,Husband,White,Male,United-States,False,63.0,247986.0,15.0,0.0,0.0,30.0,>=50k
6,Private,12th,Never-married,Sales,Own-child,Black,Female,United-States,False,18.0,311795.0,8.0,0.0,0.0,20.0,<50k
7,Local-gov,Masters,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,False,43.0,174395.0,14.0,0.0,0.0,50.0,<50k
8,Self-emp-inc,HS-grad,Never-married,Sales,Not-in-family,White,Male,United-States,False,58.0,190541.0,9.0,0.0,0.0,47.0,<50k
9,Private,HS-grad,Married-civ-spouse,Other-service,Husband,White,Male,?,False,34.0,609789.0,9.0,0.0,0.0,30.0,<50k


# Define and train the model with no callbacks

In [16]:
%%time
set_seed(dls,x=42)
learn = tabular_learner(dls,layers=[200,100], metrics=accuracy)
learn.fit_one_cycle(10)

epoch,train_loss,valid_loss,accuracy,time
0,0.339484,0.361489,0.822034,00:09
1,0.339429,0.366499,0.826271,00:08
2,0.323125,0.32137,0.864407,00:08
3,0.325409,0.342183,0.838983,00:08
4,0.326003,0.335209,0.84322,00:08
5,0.317508,0.336868,0.830508,00:08
6,0.310379,0.330359,0.830508,00:08
7,0.29792,0.334773,0.84322,00:08
8,0.297238,0.333442,0.84322,00:08
9,0.282386,0.332747,0.84322,00:08


CPU times: user 1min 25s, sys: 643 ms, total: 1min 26s
Wall time: 1min 26s


In [17]:
# evaluate the baseline model; the result is [valid_loss, accuracy] and matches
# the last epoch (epoch 9) of the training run above
learn.validate()

(#2) [0.3327472507953644,0.8432203531265259]

# Define and train the model with early stop callback

In [18]:
%%time
set_seed(dls,x=42)
learn_es = tabular_learner(dls,layers=[200,100], metrics=accuracy)
learn_es.fit_one_cycle(10,cbs=EarlyStoppingCallback(monitor='accuracy', min_delta=0.01, patience=3))

epoch,train_loss,valid_loss,accuracy,time
0,0.339484,0.361489,0.822034,00:08
1,0.339429,0.366499,0.826271,00:08
2,0.323125,0.32137,0.864407,00:08
3,0.325409,0.342183,0.838983,00:08
4,0.326003,0.335209,0.84322,00:08
5,0.317508,0.336868,0.830508,00:08


No improvement since epoch 2: early stopping
CPU times: user 51.9 s, sys: 294 ms, total: 52.2 s
Wall time: 52.3 s


In [19]:
# evaluate the early-stopped model; the result is [valid_loss, accuracy]
# note: these values match epoch 5 (the last epoch run before early stopping),
# not epoch 2, which had the best accuracy - early stopping alone does not
# restore the best weights
learn_es.validate()

(#2) [0.33686837553977966,0.8305084705352783]

# Define and train the model with early stop and save model callbacks

In [20]:
%%time
set_seed(dls,x=42)
learn_es_sm = tabular_learner(dls,layers=[200,100], metrics=accuracy)
keep_path = learn_es_sm.path
# set the model path to a writeable directory. If you don't do this, the code will produce an error on Gradient
#learn_es_sm.path = Path('/notebooks/temp/models')
learn_es_sm.path = Path(model_path)
learn_es_sm.fit_one_cycle(10,cbs=[EarlyStoppingCallback(monitor='accuracy', min_delta=0.01, patience=3),SaveModelCallback(monitor='accuracy', min_delta=0.01)])
# reset the model path
learn_es_sm.path = keep_path

epoch,train_loss,valid_loss,accuracy,time
0,0.339484,0.361489,0.822034,00:08
1,0.339429,0.366499,0.826271,00:08
2,0.323125,0.32137,0.864407,00:08
3,0.325409,0.342183,0.838983,00:08
4,0.326003,0.335209,0.84322,00:08
5,0.317508,0.336868,0.830508,00:09


Better model found at epoch 0 with accuracy value: 0.8220338821411133.
Better model found at epoch 2 with accuracy value: 0.8644067645072937.
No improvement since epoch 2: early stopping
CPU times: user 52.9 s, sys: 303 ms, total: 53.2 s
Wall time: 53.4 s


  elif with_opt: warn("Saved filed doesn't contain an optimizer state.")


In [21]:
# evaluate the model trained with early stopping + model saving; the result is
# [valid_loss, accuracy]
# note: these values match epoch 2, the epoch SaveModelCallback reported as the
# best model, rather than the last epoch run (epoch 5)
learn_es_sm.validate()

(#2) [0.32137033343315125,0.8644067645072937]

# Examine the structure of the trained models

Use the summary() function to see the structure of the trained model, including:

- the layers that make up the model
- total parameters
- loss function
- optimizer function
- callbacks



In [None]:
# get the summary for the model with no callbacks
learn.summary()

In [None]:
# get the summary for the model with an early stopping callback
learn_es.summary()

In [None]:
# get the summary for the model with an early stopping callback and a model saving callback
learn_es_sm.summary()