# Dataset Procurement

1. Input: ticker symbols (the `tickers` list under Parameters)
2. Output: one `[symbol]_input_signals.csv` file per ticker

## Imports

In [1]:
import yfinance as yf
import datetime as dt
from typing import List
import talib
from talib import abstract
import pandas as pd
import numpy as np

## Parameters

In [2]:
# NOTE: the values below are used as default argument values of
# obtain_ohlc_data, so they are captured when that function is defined;
# after editing them, re-run the cell that defines obtain_ohlc_data.

# Symbols to download; obtain_ohlc_data passes each one to yf.download in turn.
tickers = ["NZDUSD=X", "AUDUSD=X"]
# Interval string forwarded to yf.download (presumably daily bars -- confirm
# against the yfinance documentation).
interval = '1d'
# Date range forwarded to yf.download as its start/end arguments.
start_date = dt.date(2006, 5, 20)
end_date = dt.date(2020, 12, 15)
# When True, obtain_ohlc_data runs all_technical_indicator_inputs on each frame.
include_indicators = True

## Utility Code

In [3]:
def all_technical_indicator_inputs(ohlc_data):
    """Add lagged price differences and technical-indicator columns to a price frame.

    The new columns are written onto ``ohlc_data`` itself (the caller's frame
    is mutated) and the same object is returned.

    Args:
        ohlc_data: pandas DataFrame providing "Adj Close", "High", "Low" and
            "Close" columns.

    Returns:
        ``ohlc_data`` with these columns appended, in order: "diff-day-1" ..
        "diff-day-5", "EMA-3", "EMA-7", "EMA-21", "EMA-50", "RSI", "MACD",
        "ADX", "STD", "bb_width", "bb_buy", "bb_sell". The first k rows of
        "diff-day-k" are NaN.
    """
    adj_close = ohlc_data["Adj Close"]

    # Price Data: k-day change in adjusted close, close[t] - close[t - k].
    # Series.diff NaN-pads the first k rows and, unlike manual slicing plus
    # np.insert, keeps the right length when the frame has fewer than k rows.
    # to_numpy() keeps the assignment positional, as in the original code.
    for k in range(1, 6):
        ohlc_data["diff-day-" + str(k)] = adj_close.diff(k).to_numpy()

    # Simple Indicators
    # Each EMA column stores the adjusted close minus its EMA, not the EMA itself.
    for period in (3, 7, 21, 50):
        ohlc_data["EMA-" + str(period)] = adj_close - talib.EMA(adj_close, period)
    # RSI and ADX are divided by 100 (presumably to rescale a 0-100 reading to
    # 0-1 -- confirm against the TA-Lib documentation).
    ohlc_data["RSI"] = talib.RSI(adj_close, 14) / 100
    # Third output of talib.MACD (presumably the MACD histogram -- confirm
    # against the TA-Lib documentation).
    ohlc_data["MACD"] = talib.MACD(adj_close, 12, 26, 9)[2]
    # NOTE(review): ADX is fed "Close" while every other indicator uses
    # "Adj Close"; left unchanged -- confirm this is intentional.
    ohlc_data["ADX"] = talib.ADX(ohlc_data["High"], ohlc_data["Low"], ohlc_data["Close"]) / 100
    ohlc_data["STD"] = talib.STDDEV(adj_close)

    # Complex Indicators
    # Uses the first and third BBANDS outputs (presumably the upper and lower
    # bands -- confirm against the TA-Lib documentation).
    bands = talib.BBANDS(adj_close)
    first_band, third_band = bands[0], bands[2]
    ohlc_data["bb_width"] = first_band - third_band
    ohlc_data["bb_buy"] = adj_close - first_band
    ohlc_data["bb_sell"] = third_band - adj_close

    return ohlc_data

In [4]:
def obtain_ohlc_data(
        tickers: List[str]=tickers,
        start: dt.datetime=start_date,
        end: dt.datetime=end_date,
        include_all_indicators=include_indicators,
        interval: str=interval
) -> List:
    """Download price data for each ticker, optionally adding indicator columns.

    All defaults come from the module-level parameters and are bound when this
    function is defined.

    Args:
        tickers: Symbols to fetch; each is downloaded with its own
            yf.download call.
        start: Start of the date range, forwarded to yf.download.
        end: End of the date range, forwarded to yf.download.
        include_all_indicators: When truthy, each downloaded frame is passed
            through all_technical_indicator_inputs.
        interval: Interval string forwarded to yf.download.

    Returns:
        A list with one entry per ticker, in the same order as ``tickers``.
    """
    def fetch_one(symbol):
        # One download per symbol, with the same keyword arguments every time.
        frame = yf.download(
            tickers=symbol,
            start=start,
            end=end,
            interval=interval
        )
        if not include_all_indicators:
            return frame
        return all_technical_indicator_inputs(frame)

    return [fetch_one(symbol) for symbol in tickers]

## Main Logic

In [5]:
# Fetch data using the default parameters; the result holds one entry per
# ticker, in the order of the tickers list.
tick_data = obtain_ohlc_data()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [6]:
# Bare expression: shows the downloaded frames as the notebook cell's output.
tick_data

[                Open      High       Low     Close  Adj Close  Volume  \
 Date                                                                    
 2006-05-19  0.621736  0.624103  0.615915  0.621891   0.621891       0   
 2006-05-22  0.620386  0.623791  0.614817  0.623014   0.623014       0   
 2006-05-23  0.622898  0.630795  0.620809  0.624103   0.624103       0   
 2006-05-24  0.624493  0.636294  0.623286  0.635405   0.635405       0   
 2006-05-25  0.635809  0.641643  0.633513  0.640902   0.640902       0   
 ...              ...       ...       ...       ...        ...     ...   
 2020-12-08  0.704002  0.705430  0.702500  0.704052   0.704052       0   
 2020-12-09  0.704072  0.709512  0.703700  0.704072   0.704072       0   
 2020-12-10  0.701740  0.708421  0.701538  0.701641   0.701641       0   
 2020-12-11  0.709280  0.711101  0.707559  0.709401   0.709401       0   
 2020-12-14  0.709900  0.712078  0.707769  0.709849   0.709849       0   
 
             diff-day-1  diff-day-2 

## Preprocessing

### Dropping NA

In [7]:
# Replace each frame, in place in the list, with a copy that has every row
# containing a missing value removed.
for i, frame in enumerate(tick_data):
    tick_data[i] = frame.dropna()

### Exporting

In [8]:
# Write one CSV per ticker into the current directory. index=False means the
# frame's row index is not written to the file; only the data columns and
# the header row are.
for idx, ticker in enumerate(tickers):
    output_path = r'./' + ticker + '_input_signals.csv'
    signals = pd.DataFrame(tick_data[idx])
    signals.to_csv(output_path, index=False, header=True)