# Assessing the suitability of a dataset for deep learning
This notebook shows two examples of applying a fastai deep learning model to a tabular dataset. The goal of the notebook is to show the contrast between an unsuccessful application and a successful application.

In [1]:
# imports for notebook boilerplate
!pip install -Uqq fastbook
import fastbook
from fastbook import *
from fastai.tabular.all import *

In [2]:
# set up the notebook for fast.ai
# (runs fastbook's own environment initialisation; what exactly it configures
# depends on the installed fastbook version - see the fastbook docs)
fastbook.setup_book()

In [3]:
# imports specifically needed for this notebook
! pip install pandas_datareader
import numpy as np
import pandas as pd
import os
import yaml
# For reading stock data from yahoo
from pandas_datareader.data import DataReader

# For time stamps
from datetime import datetime


Collecting pandas_datareader
  Downloading pandas_datareader-0.9.0-py3-none-any.whl (107 kB)
[K     |████████████████████████████████| 107 kB 20.3 MB/s eta 0:00:01
Collecting lxml
  Downloading lxml-4.6.3-cp38-cp38-manylinux2014_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 20.4 MB/s eta 0:00:01
Installing collected packages: lxml, pandas-datareader
Successfully installed lxml-4.6.3 pandas-datareader-0.9.0


# Ingest the dataset
Use the DataReader API to get a price dataset for a single stock.

In [4]:
# Set up start and end times for the data load - a year previous
end_time = datetime.now()
try:
    # Midnight on the same calendar day, one year earlier
    start_time = datetime(end_time.year - 1, end_time.month, end_time.day)
except ValueError:
    # Only reachable on Feb 29: the previous year is not a leap year, so that
    # date does not exist there. Fall back to Feb 28 instead of crashing.
    start_time = datetime(end_time.year - 1, end_time.month, 28)


In [8]:
# Use the DataReader API to get the stock prices for AZN (AstraZeneca) for the preceding year
# NOTE(review): the output saved with this cell is a RemoteDataError (Yahoo answered
# with an error page), so `df` was never created in that run and every later cell
# depends on it. The data source needs to be confirmed before re-running the notebook.
df = DataReader('AZN', 'yahoo', start_time, end_time)
# The same call spelled with keyword arguments, kept for reference:
# df = DataReader('AZN',data_source='yahoo', start=start_time, end=end_time)
# preview the first rows of the downloaded frame
df.head()

RemoteDataError: Unable to read URL: https://finance.yahoo.com/quote/AZN/history?period1=1593835200&period2=1625457599&interval=1d&frequency=1d&filter=history
Response Text:
b'<!DOCTYPE html>\n  <html lang="en-us"><head>\n  <meta http-equiv="content-type" content="text/html; charset=UTF-8">\n      <meta charset="utf-8">\n      <title>Yahoo</title>\n      <meta name="viewport" content="width=device-width,initial-scale=1,minimal-ui">\n      <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n      <style>\n  html {\n      height: 100%;\n  }\n  body {\n      background: #fafafc url(https://s.yimg.com/nn/img/sad-panda-201402200631.png) 50% 50%;\n      background-size: cover;\n      height: 100%;\n      text-align: center;\n      font: 300 18px "helvetica neue", helvetica, verdana, tahoma, arial, sans-serif;\n  }\n  table {\n      height: 100%;\n      width: 100%;\n      table-layout: fixed;\n      border-collapse: collapse;\n      border-spacing: 0;\n      border: none;\n  }\n  h1 {\n      font-size: 42px;\n      font-weight: 400;\n      color: #400090;\n  }\n  p {\n      color: #1A1A1A;\n  }\n  #message-1 {\n      font-weight: bold;\n      margin: 0;\n  }\n  #message-2 {\n      display: inline-block;\n      *display: inline;\n      zoom: 1;\n      max-width: 17em;\n      _width: 17em;\n  }\n      </style>\n  <script>\n    document.write(\'<img src="//geo.yahoo.com/b?s=1197757129&t=\'+new Date().getTime()+\'&src=aws&err_url=\'+encodeURIComponent(document.URL)+\'&err=%<pssc>&test=\'+encodeURIComponent(\'%<{Bucket}cqh[:200]>\')+\'" width="0px" height="0px"/>\');var beacon = new Image();beacon.src="//bcn.fp.yahoo.com/p?s=1197757129&t="+new Date().getTime()+"&src=aws&err_url="+encodeURIComponent(document.URL)+"&err=%<pssc>&test="+encodeURIComponent(\'%<{Bucket}cqh[:200]>\');\n  </script>\n  </head>\n  <body>\n  <!-- status code : 404 -->\n  <!-- Not Found on Server -->\n  <table>\n  <tbody><tr>\n      <td>\n      <img src="https://s.yimg.com/rz/p/yahoo_frontpage_en-US_s_f_p_205x58_frontpage.png" alt="Yahoo Logo">\n      <h1 style="margin-top:20px;">Will be right back...</h1>\n      <p id="message-1">Thank you for your 
patience.</p>\n      <p id="message-2">Our engineers are working quickly to resolve the issue.</p>\n      </td>\n  </tr>\n  </tbody></table>\n  </body></html>'

In [None]:
# (number of rows, number of columns) of the downloaded price frame
df.shape

In [None]:
# check for missing values: for every column, count the NaN entries and express
# that count as a fraction of the row count; keep only columns with at least one NaN
count = df.isna().sum()
df_missing = (count.rename('missing_count')
                   .to_frame()
                   .assign(missing_ratio=count.div(len(df)))
                   .loc[count.ne(0)])

In [None]:
# display the per-column missing-value summary (count and ratio);
# an empty frame means no column contains missing values
df_missing

# Build and train the first model
For the first model, use fastai defaults for everything.

In [None]:
# the first model tries to predict the closing price
dep_var = 'Close'
# define columns that are continuous / categorical
# NOTE(review): the positional 1 is presumably fastai's `max_card` cardinality
# cut-off for treating a column as categorical - confirm against the fastai docs.
cont,cat = cont_cat_split(df, 1, dep_var=dep_var) 
# report which columns landed in each group
print("continuous columns are: ",cont)
print("categorical columns are: ",cat)

In [None]:
# Normalize is the only preprocessing step handed to the dataloaders
procs = [Normalize]
# Build the dataloaders for the first model. The last 50 rows of df are held out
# as the validation set (assumes df has more than 50 rows); batch size is 64.
dls = TabularDataLoaders.from_df(df,procs= procs, 
                                 cat_names= cat, cont_names = cont, 
                                 y_names = dep_var, 
                                 valid_idx=list(range((df.shape[0]-50),df.shape[0])), bs=64)

In [None]:
# show a sample batch taken from the validation dataloader as a sanity check
dls.valid.show_batch()

In [None]:
# define and fit the model
# NOTE(review): the dependent variable here is the continuous 'Close' column, yet
# the metric is `accuracy` (a classification metric) - presumably part of the
# deliberately unsuccessful first example; confirm this is intended.
learn = tabular_learner(dls, metrics=accuracy)
# train with fastai's one-cycle policy; the argument 3 is the number of epochs
learn.fit_one_cycle(3)

In [None]:
# Optional experiment: uncomment to request ten years of history instead of one.
# The ingest cell would need to be re-run afterwards for the change to take effect.
# start_time = datetime(end_time.year - 10, end_time.month, end_time.day)

# Build and train the second model
Revise the model:
- define a new target column to act as a categorical dependent variable (replacing the continuous dependent variable from the first model)
- explicitly select a subset of columns to train the model rather than taking the set provided by default by cont_cat_split()

In [None]:
def get_target(value,threshold):
    '''Label a value relative to a cut-off.

    Returns the string "0" when value is less than or equal to threshold,
    and the string "1" in every other case.
    '''
    return "0" if value <= threshold else "1"
    

In [None]:
# Closing prices at or below this cut-off are labelled "0", anything above it "1"
threshold = 50.0
# get_target is applied element-wise; the cut-off travels as its second positional argument
df['target'] = df['Close'].apply(get_target, args=(threshold,))

In [None]:
# class balance of the new target column: how many rows fall in each label
df['target'].value_counts()

In [None]:
# the second model predicts the categorical 'target' column instead of 'Close'
dep_var = 'target'

In [None]:
# hand-picked continuous feature columns, replacing the list returned by cont_cat_split()
cont = ['High', 'Low', 'Open', 'Volume']

In [None]:
# Rebuild the dataloaders for the second model. `procs` and `cat` are reused from
# the first-model cells; `cont` and `dep_var` are the values redefined just above.
# As before, the last 50 rows of df form the validation set and the batch size is 64.
dls = TabularDataLoaders.from_df(df,procs= procs, 
                                 cat_names= cat, cont_names = cont, 
                                 y_names = dep_var, 
                                 valid_idx=list(range((df.shape[0]-50),df.shape[0])), bs=64)
# accuracy is reported against the categorical 'target' labels
learn = tabular_learner(dls, metrics=accuracy)
# train with the one-cycle policy; the argument 30 is the number of epochs
learn.fit_one_cycle(30)