# Assessing the suitability of a dataset for deep learning
This notebook shows two examples of applying a fastai deep learning model to a tabular dataset. The goal of the notebook is to show the contrast between an unsuccessful application and a successful application.

In [None]:
# imports for notebook boilerplate
!pip install -Uqq fastbook
import fastbook
from fastbook import *
from fastai.tabular.all import *

In [2]:
# Set up the notebook for fast.ai by calling fastbook's setup helper.
# NOTE(review): the side effects of setup_book() are not visible in this notebook — check the fastbook docs.
fastbook.setup_book()

In [3]:
# imports specifically needed for this notebook
! pip install pandas_datareader
import numpy as np
import pandas as pd
import os
import yaml
# For reading stock data from yahoo
from pandas_datareader.data import DataReader

# For time stamps
from datetime import datetime


Collecting pandas_datareader
  Downloading pandas_datareader-0.9.0-py3-none-any.whl (107 kB)
[K     |████████████████████████████████| 107 kB 21.0 MB/s eta 0:00:01
Collecting lxml
  Downloading lxml-4.6.2-cp38-cp38-manylinux1_x86_64.whl (5.4 MB)
[K     |████████████████████████████████| 5.4 MB 38.3 MB/s eta 0:00:01
Installing collected packages: lxml, pandas-datareader
Successfully installed lxml-4.6.2 pandas-datareader-0.9.0


# Ingest the dataset
Use the DataReader API to get a price dataset for a single stock.

In [4]:
# Set up start and end times for the data load: a one-year window ending now.
def one_year_before(moment):
    """Return midnight on the same calendar date one year before `moment`.

    The time of day is dropped (the result is always at 00:00).
    Feb 29 has no counterpart in the preceding year (which is never a leap
    year), so it is clamped to Feb 28 instead of raising ValueError.
    """
    day = moment.day
    if moment.month == 2 and day == 29:
        day = 28
    return datetime(moment.year - 1, moment.month, day)


end_time = datetime.now()
start_time = one_year_before(end_time)


In [5]:
# Pull daily prices for AZN (AstraZeneca) from the 'yahoo' source over the
# start_time..end_time window, then preview the first rows.
df = DataReader(name='AZN', data_source='yahoo', start=start_time, end=end_time)
df.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-02-06,50.099998,49.5,49.689999,50.029999,2531000.0,48.64201
2020-02-07,49.919998,49.369999,49.889999,49.400002,1995500.0,48.029491
2020-02-10,49.939999,49.349998,49.490002,49.91,1983200.0,48.525341
2020-02-11,49.98,49.540001,49.939999,49.779999,1848800.0,48.398945
2020-02-12,49.799999,49.360001,49.75,49.580002,2483000.0,48.204498


In [6]:
# (rows, columns) of the loaded price table
df.shape

(253, 6)

In [7]:
# Count missing values per column and express each count as a share of all rows;
# only columns with at least one missing value are kept in the summary.
count = df.isna().sum()
missing_count = count.rename('missing_count')
missing_ratio = (count / len(df)).rename('missing_ratio')
df_missing = pd.concat([missing_count, missing_ratio], axis=1).loc[count.ne(0)]

In [8]:
# show the missing-value summary; only columns with at least one missing value are listed
df_missing

Unnamed: 0,missing_count,missing_ratio


# Build and train the first model
For the first model, use fastai defaults for everything.

In [9]:
# the first model uses the 'Close' column as its dependent variable
dep_var = 'Close'
# let fastai split the remaining columns into continuous and categorical lists,
# passing 1 as the max_card cutoff
cont, cat = cont_cat_split(df, max_card=1, dep_var=dep_var)
print("continuous columns are: ", cont)
print("categorical columns are: ", cat)

continuous columns are:  ['High', 'Low', 'Open', 'Volume', 'Adj Close']
categorical columns are:  []


In [10]:
procs = [Normalize]
# hold out the last 50 rows of the frame (by position) as the validation set
n_rows = df.shape[0]
holdout_rows = list(range(n_rows - 50, n_rows))
dls = TabularDataLoaders.from_df(
    df,
    procs=procs,
    cat_names=cat,
    cont_names=cont,
    y_names=dep_var,
    valid_idx=holdout_rows,
    bs=64,
)

In [11]:
# preview a batch from the validation set
dls.valid.show_batch()

Unnamed: 0,High,Low,Open,Volume,Adj Close,Close
0,54.299999,53.520001,54.0,8341400.0,53.57,53.57
1,53.09,52.299999,52.34,11660900.0,52.599998,52.599998
2,52.77,51.439999,51.580002,14783200.0,52.610001,52.610001
3,53.419998,52.529999,53.299999,14725200.0,52.939999,52.939999
4,53.360001,52.599999,53.060001,13308200.0,52.98,52.98
5,53.599998,52.830002,52.889999,4953700.0,53.389999,53.389999
6,52.959999,52.25,52.869999,7917100.0,52.779999,52.779999
7,54.139999,53.419998,53.549999,4685400.0,53.740002,53.740002
8,54.490002,53.830002,54.380001,6582900.0,54.259998,54.259998
9,55.060001,53.619999,53.669998,6707600.0,54.720001,54.720001


In [12]:
# define the model with fastai defaults and fit it for 3 epochs
# NOTE(review): dep_var is the continuous 'Close' price at this point, while `accuracy`
# is a classification metric — the 0.0 accuracy in the recorded output is consistent
# with that mismatch. This looks intentional for the "unsuccessful" example; confirm.
learn = tabular_learner(dls, metrics=accuracy)
learn.fit_one_cycle(3)

epoch,train_loss,valid_loss,accuracy,time
0,2691.044678,2632.11084,0.0,00:01
1,2670.847656,2620.096436,0.0,00:00
2,2652.538086,2613.420654,0.0,00:00


In [20]:
# NOTE(review): leftover experiment with an out-of-order execution count; it has no effect while commented out.
# Uncommenting it would move start_time back ten years instead of one.
# start_time = datetime(end_time.year - 10, end_time.month, end_time.day)

# Build and train the second model
Revise the model:
- define a new target column to act as a categorical dependent variable (replacing the continuous dependent variable from the first model)
- explicitly select a subset of columns to train the model rather than taking the set provided by default by cont_cat_split()

In [13]:
def get_target(value, threshold):
    '''Label a value against a threshold.

    Returns the string "0" when value <= threshold, otherwise the string "1".
    '''
    # keep the `<=` test (rather than flipping it to `>`): any value for which
    # the comparison is False, NaN included, falls through to "1"
    return "0" if value <= threshold else "1"
    

In [14]:
# label each row from its closing price: "1" when Close is above the threshold, "0" otherwise
threshold = 50.0
df['target'] = df['Close'].apply(get_target, args=(threshold,))

In [16]:
# number of rows in each class of the new target column
df['target'].value_counts()

1    197
0     56
Name: target, dtype: int64

In [17]:
# the second model uses the categorical 'target' column as its dependent variable
dep_var = 'target'

In [20]:
# explicit feature list for the second model; unlike the list printed by cont_cat_split(), it leaves out 'Adj Close'
cont = ['High', 'Low', 'Open', 'Volume']

In [21]:
# rebuild the data loaders for the second model, again holding out the
# last 50 rows of the frame (by position) for validation
row_count = df.shape[0]
validation_rows = list(range(row_count - 50, row_count))
dls = TabularDataLoaders.from_df(
    df,
    procs=procs,
    cat_names=cat,
    cont_names=cont,
    y_names=dep_var,
    valid_idx=validation_rows,
    bs=64,
)
# fit a fresh learner for 30 epochs, tracking accuracy
learn = tabular_learner(dls, metrics=accuracy)
learn.fit_one_cycle(30)

epoch,train_loss,valid_loss,accuracy,time
0,0.77982,0.711409,0.42,00:00
1,0.758165,0.707407,0.44,00:00
2,0.723788,0.698016,0.56,00:00
3,0.67453,0.685289,0.58,00:00
4,0.618488,0.671475,0.58,00:00
5,0.561982,0.657754,0.58,00:00
6,0.510846,0.646289,0.58,00:00
7,0.464985,0.633201,0.58,00:00
8,0.422107,0.621735,0.64,00:00
9,0.383925,0.614889,0.66,00:00
