# Accessing non-curated tabular datasets
Example of making a dataset that is not curated by fastai available for training a fastai deep learning application.

In this notebook we'll go through the steps to ingest the Kaggle house prices dataset: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data



In [19]:
# imports for notebook boilerplate
!pip install -Uqq fastbook
!pip install fastdownload
import fastbook
from fastbook import *
from fastai.tabular.all import *
from fastdownload import FastDownload




In [20]:
# imports required for this notebook
from kaggle import api

In [21]:
# set up the notebook for fast.ai
fastbook.setup_book()

# Accessing a Kaggle dataset

The following cells assume that you have completed the following steps:
- Created a Kaggle ID, if you don't already have one: https://www.kaggle.com/account/login
- Logged in to Kaggle with that ID and downloaded your Kaggle API key file, kaggle.json
- Uploaded your kaggle.json file to the directory /root/.kaggle in your Gradient instance


In [22]:
# copy the contents of your kaggle.json file into creds
# NOTE(review): `creds` is not read by any later cell visible in this notebook, and the
# placeholder below is not valid JSON until <YOUR ID> and <YOUR KEY> are replaced with
# quoted strings.
# WARNING: a real API key pasted here is stored in the notebook file itself - do not
# commit or share the notebook with the key filled in.
creds = '{"username":<YOUR ID>,"key":<YOUR KEY>}'

In [23]:
# define the kaggle credentials path: kaggle.json in ~/.kaggle, with ~ expanded to the
# current user's home directory
# NOTE(review): `cred_path` is not used by any later cell visible in this notebook
cred_path = Path('~/.kaggle/kaggle.json').expanduser()


In [24]:
# define a target path for this house price dataset
path = URLs.path('house_price')


In [25]:
# need an explicit definition of file_extract
def file_extract(fname, dest=None):
    """Extract the archive `fname` into `dest` using `tarfile` or `zipfile`.

    fname: path (str or Path) of the archive. Names ending in 'gz' are opened as
        gzip-compressed tar files, names ending in 'zip' as zip files.
    dest: directory to extract into; defaults to the directory containing `fname`.

    Raises Exception if the file name ends in neither 'gz' nor 'zip'.
    """
    if dest is None: dest = Path(fname).parent
    fname = str(fname)
    # Open the archives in `with` blocks so the file handles are closed as soon as
    # extraction finishes (or fails) rather than whenever the object is collected.
    # NOTE: extractall() trusts the member paths stored in the archive - only use
    # this on archives from a source you trust.
    if fname.endswith('gz'):
        with tarfile.open(fname, 'r:gz') as archive: archive.extractall(dest)
    elif fname.endswith('zip'):
        with zipfile.ZipFile(fname) as archive: archive.extractall(dest)
    else: raise Exception(f'Unrecognized archive: {fname}')

In [26]:
# Download and unpack the competition data the first time this cell runs.
# `path` comes from URLs.path('house_price') above; in the recorded run it resolved to
# /root/.fastai/archive/house_price.
if not path.exists():
    print('path does not exist')
    # parents=True also creates any missing parent directory instead of raising
    # FileNotFoundError when the archive directory has not been created yet
    path.mkdir(parents=True)
    api.competition_download_cli('house-prices-advanced-regression-techniques', path=path)
    file_extract(path/'house-prices-advanced-regression-techniques.zip')
# NOTE: the guard above only checks for the directory. If the download or the
# extraction fails part-way, delete the directory before re-running this cell.

# list the directory structure of the newly created dataset
path.ls(file_type='text')




path does not exist


100%|██████████| 199k/199k [00:00<00:00, 5.69MB/s]

Downloading house-prices-advanced-regression-techniques.zip to /root/.fastai/archive/house_price






(#4) [Path('/root/.fastai/archive/house_price/train.csv'),Path('/root/.fastai/archive/house_price/test.csv'),Path('/root/.fastai/archive/house_price/sample_submission.csv'),Path('/root/.fastai/archive/house_price/data_description.txt')]

# Ingest and explore the dataset
In this dataset the train and test subsets are in separate CSV files. Ingest each of them into its own dataframe and explore it.

In [None]:
# ingest the dataset into a Pandas dataframe
df_train = pd.read_csv(path/'train.csv')

In [None]:
df_train.head()

In [None]:
df_train.shape

In [None]:
df_test = pd.read_csv(path/'test.csv')
df_test.head()

In [None]:
# note the shape of test - why does it have one less column than the train dataset?
df_test.shape

# Set target
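Adjust the target column (SalePrice) for binary classification: replace each sale price with a label that says whether it is over the mean sale price ('1') or not ('0').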

In [None]:
# map a target value to a label saying which side of the mean it falls on
def under_over(x, mean_x):
    "Return the string '0' when `x <= mean_x`, otherwise the string '1'."
    # the test is written as `x <= mean_x` so that any value for which it is false
    # (including one that cannot be ordered, such as NaN) is labelled '1'
    return '0' if x <= mean_x else '1'

In [None]:
# Binarize the target once. The dtype guard keeps this cell safe to re-run: after the
# first run SalePrice holds the strings '0'/'1', so calling .mean() on it again would
# no longer give the mean sale price.
if pd.api.types.is_numeric_dtype(df_train['SalePrice']):
    # get the average of the values in the SalePrice column (truncated to an int)
    mean_sp = int(df_train['SalePrice'].mean())
    # use the under_over() function to replace the values in the SalePrice column with
    # indicators of whether the value was over or under the average for the column
    df_train['SalePrice'] = df_train['SalePrice'].apply(lambda x: under_over(x,mean_sp))
df_train.head()

In [None]:
mean_sp

In [None]:
df_train['SalePrice'].value_counts()

# Define target, categorical and continuous columns

In [None]:
# define the dependent variable (y value)
dep_var = 'SalePrice'
# define columns that are continuous / categorical
# the call returns two lists of column names: continuous first, categorical second;
# dep_var is passed so that the target column is handled separately from the features
# NOTE(review): the positional 1 is presumably fastai's max_card threshold - confirm
# against the fastai cont_cat_split documentation
cont,cat = cont_cat_split(df_train, 1, dep_var=dep_var) 

In [None]:
print("len cont is ",len(cont))
print("len cat is ",len(cat))

In [None]:
df_train[cat].nunique()

In [None]:
df_test[cat].nunique()

# Check for missing values

In [None]:
# Summarise the missing values in df_train: one row per column that contains at least
# one NaN, with the NaN count and the fraction of rows that count represents.
count = df_train.isna().sum()
df_train_missing = pd.DataFrame({
    'missing_count': count,
    'missing_ratio': count / len(df_train),
}).loc[count != 0]

In [None]:
df_train_missing.head()

In [None]:
df_train_missing.shape

In [None]:
# Summarise the missing values in df_test: one row per column that contains at least
# one NaN, with the NaN count and the fraction of rows that count represents.
count2 = df_test.isna().sum()
df_test_missing = pd.DataFrame({
    'missing_count': count2,
    'missing_ratio': count2 / len(df_test),
}).loc[count2 != 0]

In [None]:
df_test_missing.head()

In [None]:
# check to see missing value col count in test set
df_test_missing.shape

# Replace missing values

In [None]:

# For categorical columns, replace missing values with the most common value of that
# column. The modes are computed once from the training set and reused for the test
# set, so both sets are imputed identically and nothing is learned from the test data.
cat_modes = df_train[cat].mode().iloc[0]
df_train[cat] = df_train[cat].fillna(cat_modes)
df_test[cat] = df_test[cat].fillna(cat_modes)
# for continuous columns, replace missing values with 0
df_train[cont] = df_train[cont].fillna(0.0)
df_test[cont] = df_test[cont].fillna(0.0)


# Confirm missing values dealt with

In [None]:
# Re-check df_train for missing values after the replacement step: rebuild the
# per-column NaN count / ratio table, keeping only columns that still contain NaN.
count = df_train.isna().sum()
df_train_missing = pd.DataFrame({
    'missing_count': count,
    'missing_ratio': count / len(df_train),
}).loc[count != 0]

In [None]:
df_train_missing

In [None]:
# Re-check df_test for missing values after the replacement step: rebuild the
# per-column NaN count / ratio table, keeping only columns that still contain NaN.
count = df_test.isna().sum()
df_test_missing = pd.DataFrame({
    'missing_count': count,
    'missing_ratio': count / len(df_test),
}).loc[count != 0]

In [None]:
df_test_missing

# Define TabularDataLoaders

In [None]:
# Build the TabularDataLoaders object from df_train.
# procs: the preprocessing steps handed to from_df
# valid_idx: the row positions to use for the validation set - here the last 100 rows
# Exercise: what happens when we try to run this without dealing with missing values first?
procs = [Categorify, Normalize]
dls_house = TabularDataLoaders.from_df(
    df_train,
    path,
    procs=procs,
    cat_names=cat,
    cont_names=cont,
    y_names=dep_var,
    valid_idx=list(range(len(df_train) - 100, len(df_train))),
    bs=64,
)
                       

In [None]:
dls_house.valid.show_batch()

# Define and train model

In [None]:
# define and fit the model
# the learner is built from the dls_house dataloaders with layers=[200,100] and
# accuracy as the reported metric
learn = tabular_learner(dls_house, layers=[200,100], metrics=accuracy)
# train with fit_one_cycle; the 5 is presumably the number of epochs - confirm against
# the fastai documentation
learn.fit_one_cycle(5)

# Apply trained model to the test dataset

In [None]:
# apply model to the test set
# details of test_dl here: https://docs.fast.ai/tutorial.tabular
dl = learn.dls.test_dl(df_test)

In [None]:
learn.get_preds(dl=dl)


In [None]:
learn.show_results()

# Examine the structure of the trained model
Use the summary() function to see the structure of the trained model, including:
- the layers that make up the model
- total parameters
- loss function
- optimizer function
- callbacks

In [None]:
learn.summary()