# Assessing the suitability of a dataset for deep learning
This notebook shows two examples of applying a fastai deep learning model to a tabular dataset. The goal of the notebook is to show a contrast between an unsuccessful application and a successful application.

In [36]:
# imports for notebook boilerplate
!pip install -Uqq fastbook
import fastbook
from fastbook import *
from fastai.tabular.all import *

In [37]:
# set up the notebook for fast.ai using the fastbook module imported in the cell above
fastbook.setup_book()

In [38]:
# imports specifically needed for this notebook
! pip install pandas_datareader
import numpy as np
import pandas as pd
import os
import yaml
# For reading stock data (this notebook pulls from the 'stooq' source)
from pandas_datareader.data import DataReader

# For time stamps
from datetime import datetime




# Ingest the dataset
Use the DataReader API to get a stock price dataset for a stock.

In [39]:
# Set up start and end times for the data load - a year previous
def years_before(moment, years=1):
    '''Return midnight on the calendar day `years` years before `moment`.

    Building the date directly raises ValueError when `moment` falls on
    Feb 29 and the target year is not a leap year; in that case Feb 28
    of the target year is returned instead.
    '''
    try:
        return datetime(moment.year - years, moment.month, moment.day)
    except ValueError:
        # day 28 exists in every month, so this succeeds for the Feb 29 case;
        # any other problem (e.g. year out of range) is raised again here
        return datetime(moment.year - years, moment.month, 28)


end_time = datetime.now()
start_time = years_before(end_time)


In [40]:
# Use the DataReader API to get the stock prices for AZN (AstraZeneca)
# NOTE(review): no start/end is passed here, so the start_time/end_time values
# computed in the previous cell are not applied to this request; the saved
# output below runs from 2016-07-12 to 2021-07-09 (1258 rows), newest row first.
df = DataReader('AZN', 'stooq')
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-07-09,59.12,59.91,59.07,59.63,6173577
2021-07-08,59.45,59.67,58.83,59.26,6894951
2021-07-07,59.76,59.82,59.295,59.76,6434879
2021-07-06,60.41,60.555,59.87,59.9,6893637
2021-07-02,60.26,60.875,60.065,60.79,5198428


In [41]:
# the saved output is ordered newest-first, so tail() shows the oldest rows
df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-07-18,25.531,25.658,25.503,25.591,6983834
2016-07-15,25.48,25.513,25.231,25.33,8774132
2016-07-14,25.531,25.55,25.263,25.415,11345005
2016-07-13,25.389,25.528,25.255,25.263,11485284
2016-07-12,25.021,25.389,24.987,25.299,13560891


In [42]:
# (rows, columns) of the loaded price data
df.shape

(1258, 5)

In [43]:
# check for missing values: one row per column of df that has at least one NaN
count = df.isna().sum()
df_missing = pd.DataFrame({'missing_count': count,
                           'missing_ratio': count / len(df)})
# keep only the columns whose missing count is non-zero
df_missing = df_missing[count != 0]

In [44]:
# show the missing-value summary; an empty table means no column has missing values
df_missing

Unnamed: 0,missing_count,missing_ratio


# Build and train the first model
For the first model, use fastai defaults for everything.

In [45]:
# the first model predicts the closing price directly
dep_var = 'Close'
# let fastai sort the remaining columns into continuous and categorical lists
cont, cat = cont_cat_split(df, max_card=1, dep_var=dep_var)
print("continuous columns are: ", cont)
print("categorical columns are: ", cat)

continuous columns are:  ['Open', 'High', 'Low', 'Volume']
categorical columns are:  []


In [46]:
# Normalize is the only preprocessing step applied
procs = [Normalize]
# hold out the last 50 rows of df as the validation set; batch size is 64
dls = TabularDataLoaders.from_df(
    df,
    procs=procs,
    cat_names=cat,
    cont_names=cont,
    y_names=dep_var,
    valid_idx=list(range(len(df) - 50, len(df))),
    bs=64,
)

In [47]:
# preview a batch from the validation split
dls.valid.show_batch()

Unnamed: 0,Open,High,Low,Volume,Close
0,28.834,28.903,28.503,4041644.0,28.554001
1,28.523,28.632,28.395001,3924227.0,28.443001
2,28.954001,28.979999,28.58,6074469.0,28.700001
3,28.165,28.699,28.003001,7098700.0,28.563999
4,27.927999,28.232999,27.859,8215522.0,28.087999
5,27.884001,27.991,27.563,6874591.0,27.783001
6,27.605,28.292,27.554001,4246458.0,28.207001
7,27.876,27.876,27.312,5263818.0,27.323
8,27.859,28.070999,27.85,7152325.0,28.039
9,28.063,28.180001,27.97,5288114.0,28.157


In [48]:
# define and fit the model
# NOTE(review): dep_var is the continuous 'Close' column here, and the accuracy
# column in the saved output stays at 0.0 for every epoch - this is the
# unsuccessful example the introduction refers to (presumably intentional).
learn = tabular_learner(dls, metrics=accuracy)
learn.fit_one_cycle(3)

epoch,train_loss,valid_loss,accuracy,time
0,1549.994263,716.452759,0.0,00:00
1,1347.144165,532.618103,0.0,00:00
2,1167.776978,466.563904,0.0,00:00


In [49]:
# start_time = datetime(end_time.year - 10, end_time.month, end_time.day)

# Build and train the second model
Revise the model:
- define a new target column to act as a categorical dependent variable (replacing the continuous dependent variable from the first model)
- explicitly select a subset of columns to train the model rather than taking the set provided by default by cont_cat_split()

In [50]:
def get_target(value,threshold):
    '''Return the string label "0" when value <= threshold, otherwise "1".'''
    return "0" if value <= threshold else "1"
    

In [51]:
# closing-price cut-off that separates the two target classes
threshold = 50.0
# add a string label column: "0" when Close <= threshold, otherwise "1"
df['target'] = df['Close'].map(lambda close: get_target(close, threshold))

In [52]:
# check how balanced the two target classes are
df['target'].value_counts()

0    1023
1     235
Name: target, dtype: int64

In [53]:
# the second model predicts the categorical target column instead of Close
dep_var = 'target'

In [54]:
# explicitly chosen continuous feature columns for the second model
cont = ['High', 'Low', 'Open', 'Volume']

In [55]:
# rebuild the dataloaders for the new target; procs and cat are reused from the
# cells that built the first model
dls = TabularDataLoaders.from_df(
    df,
    procs=procs,
    cat_names=cat,
    cont_names=cont,
    y_names=dep_var,
    valid_idx=list(range(len(df) - 50, len(df))),
    bs=64,
)
# NOTE(review): the held-out rows are the last 50 of df; the validation batch shown
# for the first model had Close values around 28, below the 50.0 threshold, so the
# validation set may contain only class "0" - read the accuracy with that in mind.
learn = tabular_learner(dls, metrics=accuracy)
learn.fit_one_cycle(30)

epoch,train_loss,valid_loss,accuracy,time
0,0.658906,0.665818,0.8,00:00
1,0.587707,0.363118,1.0,00:00
2,0.489015,0.115964,1.0,00:00
3,0.384771,0.026609,1.0,00:00
4,0.293015,0.007825,1.0,00:00
5,0.233168,0.004037,1.0,00:00
6,0.19394,0.002121,1.0,00:00
7,0.157648,0.001454,1.0,00:00
8,0.138205,0.000799,1.0,00:00
9,0.123684,0.000631,1.0,00:00
