# Accessing non-curated tabular datasets
Example of making a dataset that is not curated by fastai available for training a fastai deep learning application.

In this notebook we'll go through the steps in ingest the Kaggle house prices dataset: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data



In [114]:
# imports for notebook boilerplate
!pip install -Uqq fastbook
import fastbook
from fastbook import *
from fastai.tabular.all import *


In [115]:
# imports required for this notebook
from kaggle import api

In [116]:
# set up the notebook for fast.ai
fastbook.setup_book()

# Accessing a Kaggle dataset

The following cells assume that you have completed the following steps:
- Created a Kaggle ID, if you don't already have one: https://www.kaggle.com/account/login
- Log into your Kaggle ID and go through the steps to download your Kaggle API key file: kaggle.json
- Uploaded your kaggle.json file to the directory /root/.kaggle in your Gradient instance


In [117]:
# copy the contents of your kaggle.json file into creds
creds = '{"username":"ryanmark","key":"2f8e19a853dc7ba2ac0cefe77facdf09"}'

In [118]:
# define the kaggle credentials path
cred_path = Path('~/.kaggle/kaggle.json').expanduser()


In [119]:
# define a target path for this house price dataset
path = URLs.path('house_price')


In [120]:
# create the target path for the dataset and copy it into /storage/archive on Gradient
if not path.exists():
    path.mkdir()
    api.competition_download_cli('house-prices-advanced-regression-techniques', path=path)
    file_extract(path/'house-prices-advanced-regression-techniques.zip')
    
    


# list the directory structure of the newly created dataset
path.ls(file_type='text')




(#4) [Path('/storage/archive/house_price/sample_submission.csv'),Path('/storage/archive/house_price/data_description.txt'),Path('/storage/archive/house_price/train.csv'),Path('/storage/archive/house_price/test.csv')]

In [121]:
# ingest the dataset into a Pandas dataframe
df_train = pd.read_csv(path/'train.csv')

In [122]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [123]:
df_train.shape

(1460, 81)

In [124]:
df_test = pd.read_csv(path/'test.csv')
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,360,36,0,0,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0,82,0,0,144,0,,,,0,1,2010,WD,Normal


In [125]:
# note the shape of test - why does it have one less column than the train dataset?
df_test.shape

(1459, 80)

In [126]:
# define transforms to apply to the tabular dataset
procs = [FillMissing,Categorify]
# define the dependent variable (y value)
dep_var = 'SalePrice'
# define columns that are continuous / categorical
cont,cat = cont_cat_split(df_train, 1, dep_var=dep_var) 

In [127]:
# select a subset of columns to train the model on
cat_select = ['Neighborhood','HouseStyle','Exterior1st','CentralAir','KitchenQual']
cont_select = ['LotFrontage','LotArea','OverallCond','YearBuilt','GrLivArea','FullBath','HalfBath','BedroomAbvGr','GarageCars']

In [128]:
print("len cont is ",len(cont))
print("len cat is ",len(cat))

len cont is  37
len cat is  43


In [129]:
df_train['SalePrice'].value_counts()

140000    20
135000    17
145000    14
155000    14
190000    13
          ..
84900      1
424870     1
415298     1
62383      1
34900      1
Name: SalePrice, Length: 663, dtype: int64

# Set target
adjust target column for binary classification

In [130]:
# function to replace target values with value indicating whether the input is over or under the mean
def under_over(x,mean_x):
    if (x <= mean_x):
        returner = 0.0
    else:
        returner = 1.0
    return(returner)

In [131]:
# set target column
# df.loc[df.ID == 103, 'FirstName'] = "Matt"
mean_sp = int(df_train['SalePrice'].mean())
#df_train['SalePrice'] = df_train.loc[df_train.SalePrice <= mean_sp,'SalePrice'] = 0.0
#df_train['SalePrice'] = df_train.loc[df_train.SalePrice > mean_sp,'SalePrice'] = 1.0
# df['Date'] = df['Date'].apply(lambda x: int(str(x)[-4:]))
df_train['SalePrice'] = df_train['SalePrice'].apply(lambda x: under_over(x,mean_sp))
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,1.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,1.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,1.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,0.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,1.0


In [132]:
mean_sp

180921

In [133]:
df_train['SalePrice'].value_counts()

0.0    900
1.0    560
Name: SalePrice, dtype: int64

# Check for missing values

In [134]:
# df_train.isnull().sum() > 0
count = df_train.isna().sum()
df_train_missing = (pd.concat([count.rename('missing_count'),
                     count.div(len(df_train))
                          .rename('missing_ratio')],axis = 1)
             .loc[count.ne(0)])

In [135]:
df_train_missing

Unnamed: 0,missing_count,missing_ratio
LotFrontage,259,0.177397
Alley,1369,0.937671
MasVnrType,8,0.005479
MasVnrArea,8,0.005479
BsmtQual,37,0.025342
BsmtCond,37,0.025342
BsmtExposure,38,0.026027
BsmtFinType1,37,0.025342
BsmtFinType2,38,0.026027
Electrical,1,0.000685


In [136]:
df_train_missing.shape

(19, 2)

In [137]:
count2 = df_test.isna().sum()
df_test_missing = (pd.concat([count2.rename('missing_count'),
                     count2.div(len(df_test))
                          .rename('missing_ratio')],axis = 1)
             .loc[count2.ne(0)])

In [138]:
df_test_missing

Unnamed: 0,missing_count,missing_ratio
MSZoning,4,0.002742
LotFrontage,227,0.155586
Alley,1352,0.926662
Utilities,2,0.001371
Exterior1st,1,0.000685
Exterior2nd,1,0.000685
MasVnrType,16,0.010966
MasVnrArea,15,0.010281
BsmtQual,44,0.030158
BsmtCond,45,0.030843


In [139]:
# check to see missing value col count in test set
df_test_missing.shape

(33, 2)

# Replace missing values

In [140]:

# for categorical columns, replace missing values with the most column categorical value in that column
df_train[cat] = df_train[cat].fillna(df_train[cat].mode().iloc[0])
df_test[cat] = df_test[cat].fillna(df_test[cat].mode().iloc[0])
# for continuous columns, replace missing values with 0
df_train[cont] = df_train[cont].fillna(0.0)
df_test[cont] = df_test[cont].fillna(0.0)


# Confirm missing values dealt with

In [141]:
# check for missing values in df_train
count = df_train.isna().sum()
df_train_missing = (pd.concat([count.rename('missing_count'),
                     count.div(len(df_train))
                          .rename('missing_ratio')],axis = 1)
             .loc[count.ne(0)])

In [142]:
df_train_missing

Unnamed: 0,missing_count,missing_ratio


In [143]:
# check for missing values in df_test
count = df_test.isna().sum()
df_test_missing = (pd.concat([count.rename('missing_count'),
                     count.div(len(df_test))
                          .rename('missing_ratio')],axis = 1)
             .loc[count.ne(0)])

In [144]:
df_test_missing

Unnamed: 0,missing_count,missing_ratio


# define TabularDataLoaders

In [145]:
# define TabularDataLoaders object 
# valid_idx: the indices to use for the validation set
# what happens when we try to run this without dealing with missing values first
procs = [Categorify, Normalize]
dls_house=TabularDataLoaders.from_df(df_train,path,procs= procs, 
                               cat_names= cat, cont_names = cont, y_names = dep_var, valid_idx=list(range((df_train.shape[0]-100),df_train.shape[0])), bs=64)
                               

In [146]:
dls_house.valid.show_batch()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,SWISU,Feedr,Norm,1Fam,2Story,Gable,CompShg,MetalSd,Wd Sdng,,TA,TA,BrkTil,TA,Fa,No,Unf,Unf,GasA,Ex,Y,SBrkr,TA,Typ,Gd,BuiltIn,RFn,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal,1361.000017,70.0,51.0,9842.000033,5.0,6.0,1921.0,1998.0,-4e-06,5.079255e-08,-2e-06,611.999999,611.999995,990.000001,1610.999977,-1.084229e-07,2600.99996,2.498932e-09,1.579712e-09,3.0,1.0,4.0,1.0,8.0,-1.907239e-08,1998.000001,2.0,620.999994,182.999997,-2e-06,301.0,-6.484072e-08,-4.139662e-07,-4.358864e-09,-9.080406e-07,5.0,2008.0,1.0
1,RL,Pave,Grvl,IR1,Low,AllPub,Inside,Mod,StoneBr,Norm,Norm,1Fam,1Story,Hip,CompShg,VinylSd,VinylSd,Stone,Gd,TA,PConc,Ex,TA,Av,ALQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Gd,Attchd,Fin,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal,1362.000021,20.0,124.0,16157.999818,7.0,5.0,2005.000002,2005.0,16.0,1274.0,-2e-06,256.000004,1530.000021,1529.999994,1.2e-05,-1.084229e-07,1530.0,1.0,1.579712e-09,2.0,-8.687398e-09,3.0,1.0,7.0,1.0,2004.999998,2.0,429.999999,168.000002,36.0,3.581307e-07,-6.484072e-08,-4.139662e-07,-4.358864e-09,-9.080406e-07,6.0,2009.0,1.0
2,RL,Pave,Grvl,IR1,Lvl,AllPub,FR2,Gtl,NAmes,Feedr,Norm,1Fam,1.5Fin,Gable,CompShg,VinylSd,VinylSd,,TA,Gd,BrkTil,TA,Fa,No,Unf,Unf,GasA,Gd,Y,SBrkr,TA,Typ,Gd,Attchd,Unf,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal,1362.999978,50.0,9.840148e-07,12513.000003,4.0,4.0,1920.0,2007.0,-4e-06,5.079255e-08,-2e-06,714.999996,714.99999,1280.999998,456.999995,-1.084229e-07,1737.999994,2.498932e-09,1.579712e-09,2.0,-8.687398e-09,4.0,1.0,7.0,1.0,1920.0,1.0,368.000002,54.999999,-2e-06,3.581307e-07,-6.484072e-08,-4.139662e-07,-4.358864e-09,-9.080406e-07,6.0,2009.0,0.0
3,RL,Pave,Grvl,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,,TA,TA,PConc,Gd,TA,No,Unf,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Gd,BuiltIn,Fin,TA,TA,Y,Gd,MnPrv,Shed,New,Partial,1363.999983,60.0,73.0,8498.999933,6.0,5.0,2006.000002,2007.0,-4e-06,5.079255e-08,-2e-06,616.0,615.999987,616.000004,796.000004,-1.084229e-07,1412.000004,2.498932e-09,1.579712e-09,2.0,1.0,3.0,1.0,6.0,1.0,2007.000005,2.0,432.000001,-3e-06,36.0,3.581307e-07,-6.484072e-08,-4.139662e-07,-4.358864e-09,-9.080406e-07,3.0,2007.0,0.0
4,FV,Pave,Pave,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,Norm,TwnhsE,2Story,Gable,CompShg,MetalSd,MetalSd,,Gd,TA,PConc,Gd,TA,No,Unf,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Gd,Detchd,RFn,TA,TA,Y,Gd,MnPrv,Shed,WD,Abnorml,1364.999987,160.000003,30.0,3180.00021,7.0,5.0,2005.000002,2005.0,-4e-06,5.079255e-08,-2e-06,600.000001,600.00002,519.999979,600.00001,80.0,1199.999988,2.498932e-09,1.579712e-09,2.0,1.0,2.0,1.0,4.0,-1.907239e-08,2004.999998,2.0,480.0,-3e-06,165.999996,3.581307e-07,-6.484072e-08,-4.139662e-07,-4.358864e-09,-9.080406e-07,4.0,2006.0,0.0
5,FV,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Gd,Attchd,RFn,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal,1365.999991,60.0,9.840148e-07,7499.999856,7.0,5.0,2000.0,2000.0,-4e-06,533.0,-2e-06,281.000003,813.999993,813.999994,860.000015,-1.084229e-07,1673.999994,1.0,1.579712e-09,2.0,1.0,3.0,1.0,7.0,-1.907239e-08,1999.999994,2.0,663.000006,-3e-06,96.000002,3.581307e-07,-6.484072e-08,-4.139662e-07,-4.358864e-09,-9.080406e-07,1.0,2010.0,1.0
6,RL,Pave,Grvl,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Gd,Attchd,RFn,TA,TA,Y,Gd,MnPrv,Shed,WD,Abnorml,1366.999995,60.0,68.0,9179.000068,7.0,5.0,1999.0,1999.0,158.0,633.0,-2e-06,239.999988,873.000001,882.000002,908.000023,-1.084229e-07,1790.00001,1.0,1.579712e-09,2.0,1.0,3.0,1.0,7.0,-1.907239e-08,1999.000004,2.0,588.000001,-3e-06,88.000001,3.581307e-07,-6.484072e-08,-4.139662e-07,-4.358864e-09,-9.080406e-07,6.0,2008.0,1.0
7,RM,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,TwnhsE,2Story,Gable,CompShg,CemntBd,CmentBd,,TA,TA,PConc,TA,TA,No,ALQ,Rec,GasA,Ex,Y,SBrkr,TA,Typ,TA,Attchd,RFn,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal,1367.999999,160.000003,41.0,2664.999956,5.0,6.0,1977.0,1977.0,-4e-06,548.0,172.999998,36.000025,757.000007,925.000009,549.999995,-1.084229e-07,1474.999998,2.498932e-09,1.579712e-09,2.0,-8.687398e-09,4.0,1.0,6.0,1.0,1977.000002,1.0,335.999994,104.0,25.999999,3.581307e-07,-6.484072e-08,-4.139662e-07,-4.358864e-09,-9.080406e-07,7.0,2006.0,0.0
8,RM,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,TwnhsE,1Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,Av,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Gd,Attchd,Fin,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal,1369.000004,120.000002,9.840148e-07,4435.000005,6.0,5.0,2002.999998,2004.0,170.0,685.0,-2e-06,162.999992,848.000001,847.999998,1.2e-05,-1.084229e-07,847.999998,1.0,1.579712e-09,1.0,-8.687398e-09,1.0,1.0,4.0,-1.907239e-08,2003.000005,2.0,419.999999,139.999999,-2e-06,3.581307e-07,-6.484072e-08,-4.139662e-07,-4.358864e-09,-9.080406e-07,6.0,2009.0,0.0
9,RL,Pave,Grvl,IR2,Lvl,AllPub,FR2,Gtl,CollgCr,Norm,Norm,1Fam,1Story,Hip,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,Av,BLQ,GLQ,GasA,Ex,Y,SBrkr,Gd,Typ,TA,Attchd,Fin,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal,1370.000008,20.0,48.0,10634.999996,8.0,5.0,2002.999998,2003.0,171.000003,370.0,971.999988,315.00001,1657.000018,1668.000021,1.2e-05,-1.084229e-07,1667.999994,1.0,1.579712e-09,2.0,-8.687398e-09,3.0,1.0,8.0,1.0,2003.000005,2.0,502.000001,-3e-06,261.999996,3.581307e-07,-6.484072e-08,-4.139662e-07,-4.358864e-09,-9.080406e-07,5.0,2010.0,1.0


In [151]:
# define and fit the model
learn = tabular_learner(dls_house, layers=[200,100], metrics=accuracy)
learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,accuracy,time
0,0.170413,0.261378,0.61,00:00


# Apply trained model to the test dataset

In [152]:
# apply model to the test set
# details of test_dl here: https://docs.fast.ai/tutorial.tabular
dl = learn.dls.test_dl(df_test)

In [153]:
learn.get_preds(dl=dl)


(tensor([[-0.0106],
         [ 0.4776],
         [ 0.1401],
         ...,
         [ 0.0540],
         [-0.0338],
         [ 0.2124]]),
 None)

In [42]:
??tabular_learner

[0;31mSignature:[0m
[0mtabular_learner[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdls[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlayers[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0memb_szs[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mconfig[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_out[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0my_range[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mloss_func[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mopt_func[0m[0;34m=[0m[0;34m<[0m[0mfunction[0m [0mAdam[0m [0mat[0m [0;36m0x7ff836f7e820[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlr[0m[0;34m=[0m[0;36m0.001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msplitter[0m[0;34m=[0m[0;34m<[0m[0mfunction[0m [0mtrainable_params[0m [0mat[0m [0;36m0x7ff838c60ca0[0m[0;34m>[0m[0;34m,[0m[0;34