# Saving a model trained with a tabular dataset in fast.ai
- Example of saving and reloading a model trained with a tabular dataset in fast.ai.
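- This notebook is an extension of the chapter 2 notebook for examining tabular curated datasets (examining_tabular_datasets.ipynb, linked in the next section).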

The example shown here is adapted from the paper by Howard and Gugger: https://arxiv.org/pdf/2002.04688.pdf

# Prepare the notebook and ingest the dataset
The first section of this notebook is identical to the chapter 2 notebook for examining tabular curated datasets: https://github.com/PacktPublishing/Deep-Learning-with-fastai-Cookbook/blob/main/ch2/examining_tabular_datasets.ipynb

In [108]:
# imports for notebook boilerplate
!pip install -Uqq fastbook
import fastbook
from fastbook import *
from fastai.tabular.all import *


In [109]:
# set up the notebook for fast.ai
# NOTE(review): setup_book() comes from the fastbook package installed in the previous cell;
# presumably it applies environment-specific configuration — confirm against the fastbook docs
fastbook.setup_book()

In [110]:
# ingest the curated tabular dataset ADULT_SAMPLE
# `path` is reused below to read adult.csv and is passed to TabularDataLoaders.from_df
path = untar_data(URLs.ADULT_SAMPLE)

In [111]:
# examine the directory structure: list the files and folders in the dataset directory
path.ls()

(#5) [Path('/storage/data/adult_sample/export.pkl'),Path('/storage/data/adult_sample/adult.csv'),Path('/storage/data/adult_sample/adult_sample_model.pkl'),Path('/storage/data/adult_sample/adult_sample_modle.pkl'),Path('/storage/data/adult_sample/models')]

In [112]:
# ingest the dataset into a Pandas dataframe
# adult.csv is the data file that appears in the directory listing for `path`
df = pd.read_csv(path/'adult.csv')

In [113]:
# examine the first few records in the dataframe (one row per person, salary is the last column)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [114]:
# get the dimensions of the dataset as (number of records, number of columns)
df.shape

(32561, 15)

In [115]:
# get the count of unique values in each column of the dataset
# (a quick way to see which columns have few distinct values and which have many)
df.nunique()

age                  73
workclass             9
fnlwgt            21648
education            16
education-num        16
marital-status        7
occupation           15
relationship          6
race                  5
sex                   2
capital-gain        119
capital-loss         92
hours-per-week       94
native-country       42
salary                2
dtype: int64

In [116]:
# count the number of missing values in each column of the dataset:
# isnull() flags the missing cells and sum() totals the flags per column
df.isnull().sum()

age                 0
workclass           0
fnlwgt              0
education           0
education-num     487
marital-status      0
occupation        512
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country      0
salary              0
dtype: int64

In [117]:
# keep only the records whose age is 40 or less
df_young = df.loc[df['age'] <= 40]
# preview the first few rows of the filtered dataframe
df_young.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
5,20,Private,63210,HS-grad,9.0,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,15,United-States,<50k
7,37,Private,138940,11th,7.0,Married-civ-spouse,,Husband,White,Male,0,0,40,United-States,<50k
9,36,Self-emp-inc,216711,HS-grad,,Married-civ-spouse,,Husband,White,Male,99999,0,50,?,>=50k


# Define transforms, dependent variable, continuous and categorical columns
In this section we define the transforms that will be applied to the dataset, along with the target (dependent variable), continuous and categorical columns.

In [118]:
# dependent variable (the y value the model is trained to predict)
dep_var = 'salary'
# transforms applied to the tabular dataset when the DataLoaders are built
procs = [FillMissing, Categorify]
# split the remaining columns into continuous and categorical column name lists;
# NOTE(review): max_card is the cardinality threshold used for integer columns — with a
# value of 1 the numeric columns end up in `cont` (see the show_batch output) — confirm in fastai docs
cont, cat = cont_cat_split(df, max_card=1, dep_var=dep_var)

# Define TabularDataLoaders object

In [119]:
# build the TabularDataLoaders object straight from the dataframe
# valid_idx: the row indices held out as the validation set (rows 1024 through 1259)
# bs: the batch size
dls = TabularDataLoaders.from_df(
    df,
    path,
    procs=procs,
    cat_names=cat,
    cont_names=cont,
    y_names=dep_var,
    valid_idx=list(range(1024, 1260)),
    bs=64,
)
                               

In [120]:
# use show_batch() to see a sample batch including x and y values
# (the education-num_na column in the output is added by the transforms, not present in adult.csv)
dls.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,education-num_na,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary
0,Local-gov,Bachelors,Divorced,Prof-specialty,Unmarried,White,Female,United-States,False,49.0,106554.0,13.0,0.0,0.0,40.0,>=50k
1,Private,11th,Married-civ-spouse,Farming-fishing,Husband,White,Male,United-States,False,35.0,98389.0,7.0,0.0,0.0,40.0,<50k
2,Federal-gov,Some-college,Married-civ-spouse,Craft-repair,Husband,White,Male,United-States,False,51.0,190333.0,10.0,0.0,0.0,40.0,<50k
3,Private,HS-grad,Married-civ-spouse,Other-service,Husband,White,Male,United-States,False,41.0,58124.0,9.0,0.0,0.0,40.0,<50k
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,False,29.0,67218.0,13.0,0.0,0.0,40.0,>=50k
5,Self-emp-inc,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,United-States,False,45.0,120131.0,9.0,0.0,0.0,50.0,>=50k
6,?,Some-college,Married-civ-spouse,?,Husband,White,Male,United-States,False,67.0,132626.0,10.0,0.0,0.0,6.0,<50k
7,Private,Assoc-voc,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,United-States,False,38.0,224566.0,11.0,0.0,0.0,50.0,<50k
8,State-gov,HS-grad,Separated,Adm-clerical,Unmarried,White,Female,United-States,False,46.0,111163.0,9.0,0.0,0.0,38.0,<50k
9,Private,Assoc-voc,Separated,Craft-repair,Not-in-family,White,Male,United-States,False,41.0,139907.0,11.0,0.0,0.0,30.0,<50k


# Define and train model

In [121]:
# define the model: a tabular learner with layer sizes 200 and 100, reporting accuracy as the metric
learn = tabular_learner(dls,layers=[200,100], metrics=accuracy)
# train the model for 3 epochs
learn.fit_one_cycle(3)

epoch,train_loss,valid_loss,accuracy,time
0,0.330899,0.358703,0.826271,00:06
1,0.3236,0.344936,0.830508,00:06
2,0.308774,0.337703,0.830508,00:06


In [122]:
# show sample result, including transformed x, y and predicted transformed y
# (categorical values and the salary columns are displayed as numeric codes, not the original labels)
learn.show_results()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,education-num_na,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,salary_pred
0,5.0,6.0,5.0,7.0,2.0,5.0,1.0,40.0,1.0,58.0,192806.0,4.0,0.0,0.0,33.0,0.0,0.0
1,5.0,9.0,3.0,4.0,1.0,5.0,2.0,40.0,1.0,56.0,142182.0,11.0,0.0,0.0,45.0,1.0,1.0
2,5.0,16.0,1.0,13.0,2.0,5.0,2.0,40.0,1.0,33.0,203488.0,10.0,0.0,0.0,40.0,0.0,0.0
3,5.0,16.0,5.0,2.0,4.0,5.0,1.0,40.0,1.0,42.0,184823.0,10.0,0.0,0.0,40.0,0.0,0.0
4,5.0,10.0,3.0,11.0,1.0,5.0,2.0,40.0,1.0,41.0,352834.0,13.0,7688.0,0.0,55.0,1.0,1.0
5,5.0,12.0,3.0,2.0,1.0,5.0,2.0,40.0,1.0,34.0,112212.0,9.0,0.0,1485.0,40.0,0.0,0.0
6,8.0,13.0,3.0,2.0,1.0,5.0,2.0,40.0,1.0,29.0,95423.0,14.0,0.0,0.0,36.0,0.0,0.0
7,5.0,13.0,3.0,5.0,1.0,5.0,2.0,40.0,1.0,45.0,216932.0,14.0,0.0,0.0,40.0,1.0,1.0
8,5.0,10.0,1.0,5.0,2.0,5.0,1.0,40.0,1.0,49.0,379779.0,13.0,0.0,0.0,40.0,0.0,0.0


# Examine the structure of the trained model

Use the summary() function to see the structure of the trained model, including:

- the layers that make up the model
- total parameters
- loss function
- optimizer function
- callbacks



In [123]:
# display the model structure: layers with output shapes, parameter counts and trainable flags
learn.summary()

epoch,train_loss,valid_loss,accuracy,time
0,,,00:00,


TabularModel (Input shape: ['64 x 9', '64 x 6'])
Layer (type)         Output Shape         Param #    Trainable 
Embedding            64 x 6               60         True      
________________________________________________________________
Embedding            64 x 8               136        True      
________________________________________________________________
Embedding            64 x 5               40         True      
________________________________________________________________
Embedding            64 x 8               128        True      
________________________________________________________________
Embedding            64 x 5               35         True      
________________________________________________________________
Embedding            64 x 4               24         True      
________________________________________________________________
Embedding            64 x 3               9          True      
_________________________________________________

# Save the trained model

In [124]:
# save the trained model as adult_sample_model.pkl
# in this environment the file ends up at /storage/data/adult_sample/adult_sample_model.pkl,
# i.e. in the dataset directory that `path` points to
learn.export('adult_sample_model.pkl')

In [125]:
# reload the exported model into a new learner object
# build the location from `path` (the dataset directory the model was exported to) instead of a
# hard-coded absolute path, so the notebook also works when the data is stored somewhere else
# NOTE: load_learner unpickles the file, so only load model files from a source you trust
learn2 = load_learner(path/'adult_sample_model.pkl')


In [126]:
# load the test dataset: same columns as adult.csv but without the salary column
# NOTE(review): this absolute path is specific to the original environment and the file is not
# created anywhere in this notebook — confirm where adult_sample_test.csv comes from
df_test = pd.read_csv('/notebooks/temp/adult_sample_test.csv')

In [127]:
# get the dimensions of the test dataset as (number of records, number of columns)
df_test.shape

(128, 14)

In [128]:
# examine the first few records of the test dataset
df_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States


In [130]:
# examine the first record of the test dataset on its own
df_test.iloc[0]

age                                49
workclass                     Private
fnlwgt                         101320
education                  Assoc-acdm
education-num                      12
marital-status     Married-civ-spouse
occupation                        NaN
relationship                     Wife
race                            White
sex                            Female
capital-gain                        0
capital-loss                     1902
hours-per-week                     40
native-country          United-States
Name: 0, dtype: object

In [129]:
# take the first record of the test dataset as the sample to score
test_sample = df_test.iloc[0]
# apply the reloaded model to the sample; the result is a tuple containing the processed row,
# the predicted class index and the probability for each class
learn2.predict(test_sample)

(   workclass  education  marital-status  occupation  relationship  race  sex  \
 0        5.0        8.0             3.0         0.0           6.0   5.0  1.0   
 
    native-country  education-num_na   age    fnlwgt  education-num  \
 0            40.0               1.0  49.0  101320.0           12.0   
 
    capital-gain  capital-loss  hours-per-week  salary  
 0           0.0        1902.0            40.0     1.0  ,
 tensor(1),
 tensor([0.2312, 0.7688]))