# Accessing non-curated tabular datasets
Example of making a dataset that is not curated by fastai available for training a fastai deep learning application.

In this notebook we'll go through the steps in ingest the Kaggle house prices dataset: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data



In [36]:
# imports for notebook boilerplate
!pip install -Uqq fastbook
import fastbook
from fastbook import *
from fastai.tabular.all import *


In [37]:
# imports required for this notebook
from kaggle import api

In [38]:
# set up the notebook for fast.ai
fastbook.setup_book()

# Accessing a Kaggle dataset

The following cells assume that you have completed the following steps:
- Created a Kaggle ID, if you don't already have one: https://www.kaggle.com/account/login
- Log into your Kaggle ID and go through the steps to download your Kaggle API key file: kaggle.json
- Uploaded your kaggle.json file to the directory /root/.kaggle in your Gradient instance


In [39]:
# copy the contents of your kaggle.json file into creds
creds = '{"username":"ryanmark","key":"2f8e19a853dc7ba2ac0cefe77facdf09"}'

In [40]:
# define the kaggle credentials path
cred_path = Path('~/.kaggle/kaggle.json').expanduser()


In [41]:
# define a target path for this house price dataset
path = URLs.path('house_price')


In [42]:
# create the target path for the dataset and copy it into /storage/archive on Gradient
if not path.exists():
    path.mkdir()
    api.competition_download_cli('house-prices-advanced-regression-techniques', path=path)
    file_extract(path/'house-prices-advanced-regression-techniques.zip')
    
    


# list the directory structure of the newly created dataset
path.ls(file_type='text')




(#4) [Path('/storage/archive/house_price/sample_submission.csv'),Path('/storage/archive/house_price/data_description.txt'),Path('/storage/archive/house_price/train.csv'),Path('/storage/archive/house_price/test.csv')]

In [43]:
# ingest the dataset into a Pandas dataframe
df_train = pd.read_csv(path/'train.csv')

In [44]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [45]:
df_train.shape

(1460, 81)

In [46]:
df_test = pd.read_csv(path/'test.csv')
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,360,36,0,0,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0,82,0,0,144,0,,,,0,1,2010,WD,Normal


In [47]:
# note the shape of test - why does it have one less column than the train dataset?
df_test.shape

(1459, 80)

In [48]:
# define transforms to apply to the tabular dataset
procs = [FillMissing,Categorify]
# define the dependent variable (y value)
dep_var = 'SalePrice'
# define columns that are continuous / categorical
cont,cat = cont_cat_split(df_train, 1, dep_var=dep_var) 

In [49]:
# select a subset of columns to train the model on
cat_select = ['Neighborhood','HouseStyle','Exterior1st','CentralAir','KitchenQual']
cont_select = ['LotFrontage','LotArea','OverallCond','YearBuilt','GrLivArea','FullBath','HalfBath','BedroomAbvGr','GarageCars']

In [50]:
print("len cont is ",len(cont))
print("len cat is ",len(cat))

len cont is  37
len cat is  43


# Set target
adjust target column for binary classification

In [51]:
# set target column
# df.loc[df.ID == 103, 'FirstName'] = "Matt"
mean_sp = df_train['SalePrice'].mean()
df_train['SalePrice'] = df_train.loc[df_train.SalePrice <= mean_sp,'SalePrice'] = 0
df_train['SalePrice'] = df_train.loc[df_train.SalePrice > mean_sp,'SalePrice'] = 1
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,1
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,1
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,1
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,1
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,1


In [52]:
mean_sp

180921.19589041095

# Check for missing values

In [53]:
# df_train.isnull().sum() > 0
count = df_train.isna().sum()
df_train_missing = (pd.concat([count.rename('missing_count'),
                     count.div(len(df_train))
                          .rename('missing_ratio')],axis = 1)
             .loc[count.ne(0)])

In [54]:
df_train_missing

Unnamed: 0,missing_count,missing_ratio
LotFrontage,259,0.177397
Alley,1369,0.937671
MasVnrType,8,0.005479
MasVnrArea,8,0.005479
BsmtQual,37,0.025342
BsmtCond,37,0.025342
BsmtExposure,38,0.026027
BsmtFinType1,37,0.025342
BsmtFinType2,38,0.026027
Electrical,1,0.000685


In [55]:
df_train_missing.shape

(19, 2)

In [56]:
count2 = df_test.isna().sum()
df_test_missing = (pd.concat([count2.rename('missing_count'),
                     count2.div(len(df_test))
                          .rename('missing_ratio')],axis = 1)
             .loc[count2.ne(0)])

In [57]:
df_test_missing

Unnamed: 0,missing_count,missing_ratio
MSZoning,4,0.002742
LotFrontage,227,0.155586
Alley,1352,0.926662
Utilities,2,0.001371
Exterior1st,1,0.000685
Exterior2nd,1,0.000685
MasVnrType,16,0.010966
MasVnrArea,15,0.010281
BsmtQual,44,0.030158
BsmtCond,45,0.030843


In [58]:
# check to see missing value col count in test set
df_test_missing.shape

(33, 2)

# Replace missing values

In [59]:

# for categorical columns, replace missing values with the most column categorical value in that column
df_train[cat] = df_train[cat].fillna(df_train[cat].mode().iloc[0])
df_test[cat] = df_test[cat].fillna(df_test[cat].mode().iloc[0])
# for continuous columns, replace missing values with 0
df_train[cont] = df_train[cont].fillna(0.0)
df_test[cont] = df_test[cont].fillna(0.0)


# Confirm missing values dealt with

In [60]:
# check for missing values in df_train
count = df_train.isna().sum()
df_train_missing = (pd.concat([count.rename('missing_count'),
                     count.div(len(df_train))
                          .rename('missing_ratio')],axis = 1)
             .loc[count.ne(0)])

In [61]:
df_train_missing

Unnamed: 0,missing_count,missing_ratio


In [62]:
# check for missing values in df_test
count = df_test.isna().sum()
df_test_missing = (pd.concat([count.rename('missing_count'),
                     count.div(len(df_test))
                          .rename('missing_ratio')],axis = 1)
             .loc[count.ne(0)])

In [63]:
df_test_missing

Unnamed: 0,missing_count,missing_ratio


# define TabularDataLoaders

In [64]:
# define TabularDataLoaders object 
# valid_idx: the indices to use for the validation set
# what happens when we try to run this without dealing with missing values first
procs = [Categorify]
dls_house=TabularDataLoaders.from_df(df_train,path,procs= procs, 
                               cat_names= cat, cont_names = cont, y_names = dep_var, valid_idx=list(range((df_train.shape[0]-100),df_train.shape[0])), bs=64)
                               

In [65]:
dls_house.valid.show_batch()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,SWISU,Feedr,Norm,1Fam,2Story,Gable,CompShg,MetalSd,Wd Sdng,,TA,TA,BrkTil,TA,Fa,No,Unf,Unf,GasA,Ex,Y,SBrkr,TA,Typ,Gd,BuiltIn,RFn,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal,1361.0,70.0,51.0,9842.0,5.0,6.0,1921.0,1998.0,0.0,0.0,0.0,612.0,612.0,990.0,1611.0,0.0,2601.0,0.0,0.0,3.0,1.0,4.0,1.0,8.0,0.0,1998.0,2.0,621.0,183.0,0.0,301.0,0.0,0.0,0.0,0.0,5.0,2008.0,1.0
1,RL,Pave,Grvl,IR1,Low,AllPub,Inside,Mod,StoneBr,Norm,Norm,1Fam,1Story,Hip,CompShg,VinylSd,VinylSd,Stone,Gd,TA,PConc,Ex,TA,Av,ALQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Gd,Attchd,Fin,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal,1362.0,20.0,124.0,16158.0,7.0,5.0,2005.0,2005.0,16.0,1274.0,0.0,256.0,1530.0,1530.0,0.0,0.0,1530.0,1.0,0.0,2.0,0.0,3.0,1.0,7.0,1.0,2005.0,2.0,430.0,168.0,36.0,0.0,0.0,0.0,0.0,0.0,6.0,2009.0,1.0
2,RL,Pave,Grvl,IR1,Lvl,AllPub,FR2,Gtl,NAmes,Feedr,Norm,1Fam,1.5Fin,Gable,CompShg,VinylSd,VinylSd,,TA,Gd,BrkTil,TA,Fa,No,Unf,Unf,GasA,Gd,Y,SBrkr,TA,Typ,Gd,Attchd,Unf,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal,1363.0,50.0,0.0,12513.0,4.0,4.0,1920.0,2007.0,0.0,0.0,0.0,715.0,715.0,1281.0,457.0,0.0,1738.0,0.0,0.0,2.0,0.0,4.0,1.0,7.0,1.0,1920.0,1.0,368.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,2009.0,1.0
3,RL,Pave,Grvl,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,,TA,TA,PConc,Gd,TA,No,Unf,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Gd,BuiltIn,Fin,TA,TA,Y,Gd,MnPrv,Shed,New,Partial,1364.0,60.0,73.0,8499.0,6.0,5.0,2006.0,2007.0,0.0,0.0,0.0,616.0,616.0,616.0,796.0,0.0,1412.0,0.0,0.0,2.0,1.0,3.0,1.0,6.0,1.0,2007.0,2.0,432.0,0.0,36.0,0.0,0.0,0.0,0.0,0.0,3.0,2007.0,1.0
4,FV,Pave,Pave,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,Norm,TwnhsE,2Story,Gable,CompShg,MetalSd,MetalSd,,Gd,TA,PConc,Gd,TA,No,Unf,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Gd,Detchd,RFn,TA,TA,Y,Gd,MnPrv,Shed,WD,Abnorml,1365.0,160.0,30.0,3180.0,7.0,5.0,2005.0,2005.0,0.0,0.0,0.0,600.0,600.0,520.0,600.0,80.0,1200.0,0.0,0.0,2.0,1.0,2.0,1.0,4.0,0.0,2005.0,2.0,480.0,0.0,166.0,0.0,0.0,0.0,0.0,0.0,4.0,2006.0,1.0
5,FV,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Gd,Attchd,RFn,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal,1366.0,60.0,0.0,7500.0,7.0,5.0,2000.0,2000.0,0.0,533.0,0.0,281.0,814.0,814.0,860.0,0.0,1674.0,1.0,0.0,2.0,1.0,3.0,1.0,7.0,0.0,2000.0,2.0,663.0,0.0,96.0,0.0,0.0,0.0,0.0,0.0,1.0,2010.0,1.0
6,RL,Pave,Grvl,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Gd,Attchd,RFn,TA,TA,Y,Gd,MnPrv,Shed,WD,Abnorml,1367.0,60.0,68.0,9179.0,7.0,5.0,1999.0,1999.0,158.0,633.0,0.0,240.0,873.0,882.0,908.0,0.0,1790.0,1.0,0.0,2.0,1.0,3.0,1.0,7.0,0.0,1999.0,2.0,588.0,0.0,88.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,1.0
7,RM,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,TwnhsE,2Story,Gable,CompShg,CemntBd,CmentBd,,TA,TA,PConc,TA,TA,No,ALQ,Rec,GasA,Ex,Y,SBrkr,TA,Typ,TA,Attchd,RFn,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal,1368.0,160.0,41.0,2665.0,5.0,6.0,1977.0,1977.0,0.0,548.0,173.0,36.0,757.0,925.0,550.0,0.0,1475.0,0.0,0.0,2.0,0.0,4.0,1.0,6.0,1.0,1977.0,1.0,336.0,104.0,26.0,0.0,0.0,0.0,0.0,0.0,7.0,2006.0,1.0
8,RM,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,TwnhsE,1Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,Av,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Gd,Attchd,Fin,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal,1369.0,120.0,0.0,4435.0,6.0,5.0,2003.0,2004.0,170.0,685.0,0.0,163.0,848.0,848.0,0.0,0.0,848.0,1.0,0.0,1.0,0.0,1.0,1.0,4.0,0.0,2003.0,2.0,420.0,140.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,2009.0,1.0
9,RL,Pave,Grvl,IR2,Lvl,AllPub,FR2,Gtl,CollgCr,Norm,Norm,1Fam,1Story,Hip,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,Av,BLQ,GLQ,GasA,Ex,Y,SBrkr,Gd,Typ,TA,Attchd,Fin,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal,1370.0,20.0,48.0,10635.0,8.0,5.0,2003.0,2003.0,171.0,370.0,972.0,315.0,1657.0,1668.0,0.0,0.0,1668.0,1.0,0.0,2.0,0.0,3.0,1.0,8.0,1.0,2003.0,2.0,502.0,0.0,262.0,0.0,0.0,0.0,0.0,0.0,5.0,2010.0,1.0


In [69]:
# define and fit the model
# initial attempt messed up because of problem with filling in missing values incorrectly
class_weights=torch.FloatTensor([0.5, 0.5]).cuda()
learn = tabular_learner(dls_house, layers=[200,100], metrics=accuracy)
# learn = tabular_learner(dls_house, layers=[200,100], metrics=accuracy)
learn.fit_one_cycle(50)

epoch,train_loss,valid_loss,accuracy,time
0,1.015591,0.626835,0.0,00:00
1,0.873722,0.570889,0.0,00:00
2,0.6745,0.72435,0.0,00:00
3,0.452713,0.203485,0.0,00:00
4,0.291382,0.038936,0.0,00:00
5,0.18949,0.043278,0.0,00:00
6,0.125374,0.042385,0.0,00:00
7,0.08416,0.065415,0.0,00:00
8,0.057141,0.082532,0.0,00:00
9,0.039756,0.050462,0.0,00:00


# Apply trained model to the test dataset

In [67]:
# apply model to the test set
# details of test_dl here: https://docs.fast.ai/tutorial.tabular
dl = learn.dls.test_dl(df_test)

In [68]:
learn.get_preds(dl=dl)


(tensor([[0.8443],
         [1.3035],
         [0.9543],
         ...,
         [0.8592],
         [0.7544],
         [0.7532]]),
 None)

In [42]:
??tabular_learner

[0;31mSignature:[0m
[0mtabular_learner[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdls[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlayers[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0memb_szs[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mconfig[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_out[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0my_range[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mloss_func[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mopt_func[0m[0;34m=[0m[0;34m<[0m[0mfunction[0m [0mAdam[0m [0mat[0m [0;36m0x7ff836f7e820[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlr[0m[0;34m=[0m[0;36m0.001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msplitter[0m[0;34m=[0m[0;34m<[0m[0mfunction[0m [0mtrainable_params[0m [0mat[0m [0;36m0x7ff838c60ca0[0m[0;34m>[0m[0;34m,[0m[0;34