# Prepare the Data for Training

In [1]:
!pip install git+https://github.com/AI4Finance-Foundation/FinRL.git

Collecting git+https://github.com/AI4Finance-Foundation/FinRL.git
  Cloning https://github.com/AI4Finance-Foundation/FinRL.git to /tmp/pip-req-build-v7rxlyl_
  Running command git clone --filter=blob:none --quiet https://github.com/AI4Finance-Foundation/FinRL.git /tmp/pip-req-build-v7rxlyl_
  Resolved https://github.com/AI4Finance-Foundation/FinRL.git to commit a8874cff9e1d2b87e6d8f5d1ec96c2f4bd1c06ce
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting alpaca-py<0.38,>=0.37 (from finrl==0.3.8)
  Downloading alpaca_py-0.37.0-py3-none-any.whl.metadata (13 kB)
Collecting alpaca-trade-api<4,>=3 (from finrl==0.3.8)
  Downloading alpaca_trade_api-3.2.0-py3-none-any.whl.metadata (29 kB)
Collecting ccxt<4,>=3 (from finrl==0.3.8)
  Downloading ccxt-3.1.60-py2.py3-none-any.whl.metadata (108 kB)
Collecting elegantrl@ git+https://github.com/AI4Finance-Foundation/Elegant

In [2]:
import pandas as pd
import yfinance as yf

from finrl.meta.preprocessor.yahoodownloader import YahooDownloader
from finrl.meta.preprocessor.preprocessors import FeatureEngineer, data_split
from finrl import config_tickers
from finrl.config import INDICATORS
from finrl.config import *
import itertools

In [3]:
# Fetch one month of AAPL prices straight from yfinance (appears to be a quick
# connectivity check before the full download).
# NOTE: these two dates are only meaningful for this cell; both names are reassigned
# with the full training/trading ranges in the next code cell.
TRAIN_START_DATE = '2020-01-01'
TRADE_END_DATE = '2020-01-31'

# auto_adjust is passed explicitly: yfinance changed its default to True and prints a
# notice whenever the argument is omitted. Spelling it out keeps the same (adjusted)
# prices as the recorded run and makes the choice visible to the reader.
aapl_df_yf = yf.download(
    tickers="aapl",
    start=TRAIN_START_DATE,
    end=TRADE_END_DATE,
    auto_adjust=True,
)

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


In [7]:
# Date boundaries for the two phases. The training range ends on the same calendar day
# on which the trading range starts.
TRAIN_START_DATE, TRAIN_END_DATE = '2009-01-01', '2020-07-01'
TRADE_START_DATE, TRADE_END_DATE = '2020-07-01', '2021-10-29'

# A single download covering both phases (training start through trading end) for every
# ticker in FinRL's DOW_30_TICKER list.
df_raw = YahooDownloader(
    start_date=TRAIN_START_DATE,
    end_date=TRADE_END_DATE,
    ticker_list=config_tickers.DOW_30_TICKER,
).fetch_data()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Shape of DataFrame:  (94301, 8)


In [8]:
# Preview the first five rows of the raw download.
df_raw.head(n=5)

Price,date,close,high,low,open,volume,tic,day
0,2009-01-02,2.727418,2.736135,2.559416,2.581055,746015200,AAPL,4
1,2009-01-02,40.791451,40.853685,39.933992,40.51485,6547900,AMGN,4
2,2009-01-02,14.891699,15.038074,14.175233,14.3062,10955700,AXP,4
3,2009-01-02,33.941097,34.173623,32.0884,32.103402,7010200,BA,4
4,2009-01-02,30.344683,30.389963,28.921568,29.050942,7117200,CAT,4


## Preprocess Data

In [9]:
# Configure the feature engineer: technical indicators taken from finrl.config.INDICATORS,
# with the VIX and turbulence options switched on and user-defined features switched off.
fe = FeatureEngineer(
    use_technical_indicator=True,
    tech_indicator_list=INDICATORS,
    use_vix=True,
    use_turbulence=True,
    user_defined_feature=False,
)

# Run the configured pipeline over the raw price table.
processed = fe.preprocess_data(df_raw)

Successfully added technical indicators


[*********************100%***********************]  1 of 1 completed


Shape of DataFrame:  (3228, 8)
Successfully added vix
Successfully added turbulence index


In [10]:
# Every ticker present in the processed data, and every calendar day (as a string)
# between its first and last date.
list_ticker = processed["tic"].unique().tolist()
list_date = (
    pd.date_range(start=processed["date"].min(), end=processed["date"].max())
    .astype(str)
    .tolist()
)

# Full (date, ticker) grid, date-major, so each ticker gets a row for each day.
combination = [(day, ticker) for day in list_date for ticker in list_ticker]

# Attach the processed features to the grid, keep only dates that actually occur in
# `processed`, order by date then ticker, and replace any remaining gaps
# (date/ticker pairs without data) with 0.
processed_full = (
    pd.DataFrame(combination, columns=["date", "tic"])
    .merge(processed, on=["date", "tic"], how="left")
    .loc[lambda grid: grid["date"].isin(processed["date"])]
    .sort_values(["date", "tic"])
    .fillna(0)
)

In [11]:
# Preview the first five rows of the gap-filled feature table.
processed_full.head(n=5)

Unnamed: 0,date,tic,close,high,low,open,volume,day,macd,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma,vix,turbulence
0,2009-01-02,AAPL,2.727418,2.736135,2.559416,2.581055,746015200.0,4.0,0.0,2.947757,2.622186,100.0,66.666667,100.0,2.727418,2.727418,39.189999,0.0
1,2009-01-02,AMGN,40.791451,40.853685,39.933992,40.51485,6547900.0,4.0,0.0,2.947757,2.622186,100.0,66.666667,100.0,40.791451,40.791451,39.189999,0.0
2,2009-01-02,AXP,14.891699,15.038074,14.175233,14.3062,10955700.0,4.0,0.0,2.947757,2.622186,100.0,66.666667,100.0,14.891699,14.891699,39.189999,0.0
3,2009-01-02,BA,33.941097,34.173623,32.0884,32.103402,7010200.0,4.0,0.0,2.947757,2.622186,100.0,66.666667,100.0,33.941097,33.941097,39.189999,0.0
4,2009-01-02,CAT,30.344683,30.389963,28.921568,29.050942,7117200.0,4.0,0.0,2.947757,2.622186,100.0,66.666667,100.0,30.344683,30.344683,39.189999,0.0


### Train test split

In [12]:
# Cut the feature table into the training window and the trading window.
# NOTE(review): TRAIN_END_DATE and TRADE_START_DATE are the same day; presumably
# data_split treats the end date as exclusive so the two sets do not overlap — confirm.
train = data_split(processed_full, TRAIN_START_DATE, TRAIN_END_DATE)
trade = data_split(processed_full, TRADE_START_DATE, TRADE_END_DATE)

# Row counts of each split, one per line.
print(len(train), len(trade), sep="\n")

83897
9715


## Save the data

In [13]:
# Persist both splits to the working directory. The index is written as the first CSV
# column (pandas' default, spelled out here), so readers of these files must account
# for that extra leading column.
train.to_csv('train_data.csv', index=True)
trade.to_csv('trade_data.csv', index=True)