# Tabular models

In [1]:
from fastai import *
from fastai.tabular import *

Tabular data should be in a Pandas `DataFrame`.

In [2]:
# Locate the Adult sample dataset and read its CSV into a DataFrame.
path = untar_data(URLs.ADULT_SAMPLE)
adult_csv = path/'adult.csv'
df = pd.read_csv(adult_csv)
# Preview the first rows to check column names and value types.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,>=50k
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,1
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,1
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,0
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,1
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,0


This is a binary classification task: the model predicts the `>=50k` column, which is either 0 or 1.

In [3]:
# Column the model is trained to predict.
dep_var = '>=50k'

# Rule of thumb: a numeric feature with only a few (<= 7) unique values
# is usually treated as categorical rather than continuous.
cat_names = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
]
cont_names = [
    'age',
    'fnlwgt',
    'education-num',
]

# Preprocessing steps applied to the features before they reach the
# learner, similar in spirit to sklearn's transformers.
procs = [
    FillMissing,
    Categorify,
    Normalize,
]

In [4]:
# Optional, currently disabled: create an unlabeled test set from the data.
# We assume the rows have some structure (ordering), so held-out data is
# taken as a contiguous slice instead of being sampled at random.
# NOTE(review): rows 800:1000 are the same rows used as the validation split
# in the next cell -- pick a different slice before enabling this.
# test = TabularList.from_df(df.iloc[800:1000].copy(),
#                            path=path, cat_names=cat_names,
#                            cont_names=cont_names)

In [5]:
# Assemble the DataBunch in named stages: wrap the DataFrame together with
# its preprocessing, hold out rows 800-999 as the validation set, attach
# the labels from `dep_var`, then build the DataBunch.
valid_idx = list(range(800, 1000))
tab_list = TabularList.from_df(df, path=path, cat_names=cat_names,
                               cont_names=cont_names, procs=procs)
labelled_lists = tab_list.split_by_idx(valid_idx).label_from_df(cols=dep_var)
# labelled_lists = labelled_lists.add_test(test, label=0)  # Since test set has no labels
data = labelled_lists.databunch()

In [6]:
# Number of samples in the training and validation splits, as (train, valid).
tuple(len(split) for split in (data.train_ds, data.valid_ds))

(32361, 200)

In [7]:
# Display ten rows of a batch to sanity-check the processed features and target.
data.show_batch(rows=10)

workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,target
Private,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,False,0.1769,-1.2692,1.1422,1
Private,Some-college,Married-civ-spouse,Prof-specialty,Husband,White,False,0.7632,0.0916,-0.0312,0
Private,Masters,Never-married,Adm-clerical,Not-in-family,White,False,-0.4828,-0.7746,1.5334,0
State-gov,Prof-school,Divorced,Prof-specialty,Unmarried,White,False,0.4701,1.2320,1.9245,0
Private,Masters,Married-civ-spouse,Exec-managerial,Husband,White,False,0.6899,-0.3861,1.5334,1
Local-gov,HS-grad,Married-civ-spouse,Other-service,Wife,White,False,-0.3362,-0.5199,-0.4224,0
Private,Some-college,Never-married,Handlers-cleaners,Not-in-family,White,False,-1.2158,-0.6257,-0.0312,0
Private,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,False,-0.7027,0.4072,-0.4224,0
?,Some-college,Never-married,?,Not-in-family,White,False,-1.4357,-1.3588,-0.0312,0
Private,12th,Married-civ-spouse,Craft-repair,Husband,White,False,-1.0692,2.4623,-0.8135,0


In [8]:
# `layers` lists the widths of the hidden layers only. The output layer is
# normally left out because its size follows from the number of classes
# (2 for this task).
hidden_widths = [200, 100]
learn = tabular_learner(data, layers=hidden_widths, metrics=accuracy)

In [9]:
# Print the number of distinct values of each categorical column in the raw
# DataFrame.
# By this point `cat_names` can hold more entries than were originally listed
# (show_batch displays an extra `education-num_na` column, apparently added
# during preprocessing), and such entries do not exist in `df`. Filter on
# `df.columns` rather than slicing with `[:-1]`, which silently drops a real
# column whenever no extra entry was appended.
for col in cat_names:
    if col in df.columns:
        print(f'{col} : {df[col].nunique()}')

workclass : 9
education : 16
marital-status : 7
occupation : 15
relationship : 6
race : 5


In [10]:
# Inspect the network built by tabular_learner. In the printed model the
# embedding widths (6+9+5+9+4+4+2 = 39) plus the 3 continuous inputs add up
# to the 42 in_features of the first Linear layer.
learn.model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(10, 6)
    (1): Embedding(17, 9)
    (2): Embedding(8, 5)
    (3): Embedding(16, 9)
    (4): Embedding(7, 4)
    (5): Embedding(6, 4)
    (6): Embedding(3, 2)
  )
  (emb_drop): Dropout(p=0.0)
  (bn_cont): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=42, out_features=200, bias=True)
    (1): ReLU(inplace)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Linear(in_features=200, out_features=100, bias=True)
    (4): ReLU(inplace)
    (5): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Linear(in_features=100, out_features=2, bias=True)
  )
)

In [11]:
# Train for a single epoch with a learning rate of 1e-2.
learn.fit(epochs=1, lr=1e-2)

Total time: 00:04
epoch  train_loss  valid_loss  accuracy
1      0.366170    0.375010    0.820000  (00:04)



## Inference

In [12]:
# Take the first row of the raw DataFrame as a single example for inference.
row = df.iloc[0]

In [13]:
# Predict for the single raw row; the result is displayed as a 3-tuple.
# NOTE(review): presumably (predicted class, class index, class probabilities) -- confirm.
learn.predict(row)

(1, tensor(0), tensor([0.5740, 0.4260]))