# Import libraries

In [87]:
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
# from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from yahoo_fin import stock_info as si
from collections import deque

import numpy as np
import pandas as pd
import random
import os

In [88]:
# set seed to get stable results in training/testing run
np.random.seed(314)
# tf.random.set_seed(314)
random.seed(314)

Write a function to load and process a dataset with multiple features with the following
requirements:\
a. This function will allow you to specify the start date and the end date for the whole
dataset as inputs.\
    b. This function will allow you to deal with the NaN issue in the data.\
    c. This function will also allow you to use different methods to split the data into
train/test data; e.g. you can split it according to some specified ratio of train/test and
you can specify to split it by date or randomly.\
    d. This function will have the option to allow you to store the downloaded data on your
local machine for future uses and to load the data locally to save time.\
    e. This function will also allow you to have an option to scale your feature columns and
store the scalers in a data structure to allow future access to these scalers.

In [89]:
def shuffle_in_unison(a, b):
    # shuffle two arrays in the same way
    state = np.random.get_state()
    np.random.shuffle(a)
    np.random.set_state(state)
    np.random.shuffle(b)

def load_data(ticker, TEST_START='2020-01-01', TEST_END='2023-01-01', n_steps=50, scale=True, shuffle=True, lookup_step=1, dropNaN=True, split_by_date=True, test_size=0.2, feature_columns=["adjclose", "volume", "open", "high", "low"]):
    """
    Loads data from yahoo finance, then scaling, shuffle, and normalization.
    :param ticker: (str/pd.DataFrame), the ticker you want to load, like META, AAPL,...
    :param TEST_START: str, the start date of the test data (format: "YYYY-MM-DD")
    :param TEST_END: str, the end date of the test data (format: "YYYY-MM-DD")
    :param n_steps: int, the historical sequence length (i.e window size) use to predict, default is 50
    :param scale: whether to scale prices between 0 and 1, default is True
    :param shuffle: bool, whether to shuffle the dataset (both training and testing), default is True
    :param lookup_step: int, the future lookup step to predict, default is 1 (e.g next day)
    :param dropNaN: bool, whether to drop NaN values, default is True
    :param split_by_date: bool, whether to split the data into training and testing by date, default is True
    :param test_size: ratio of test data to train data, default is 0.2 (20% test data)
    :param feature_columns: the list of features to feed into the model, default is everything grabbed from yahoo_fin
    :return: 
    """
    
    # ------------------------------------------------------------------------------------------------#
    # Ticket to csv file, put it into folder dataset
    ticker_data_filename = os.path.join("dataset", f"{ticker}_{TEST_START}_{TEST_END}.csv") 
    ## a, see if ticker is already a loaded stock from yahoo finance
    if os.path.exists(ticker_data_filename):
        print(f'Loading data from {ticker_data_filename}')
        # read csv file, take date column as index
        df = pd.read_csv(ticker_data_filename, index_col=0)
    else:
        print(f'Downloading data for {ticker} from yahoo finance')
        if not os.path.exists("dataset"):
            os.makedirs("dataset")
        # download data from yahoo finance before assign into dataframe
        df = si.get_data(ticker, TEST_START, TEST_END)
         ## d, Store the download data locally for future use
        df.to_csv(ticker_data_filename)
    # ------------------------------------------------------------------------------------------------#






    # ------------------------------------------------------------------------------------------------#
    ## b, Allow you to deal with the NaN issue in the data
    # add the target column (label) by shifting by `lookup_step`
    df['future'] = df['adjclose'].shift(-lookup_step)

    # last `lookup_step` column contains NaN in the future column
    # get them before droping Nans
    last_sequence = np.array(df[feature_columns].tail(lookup_step))

    # drop NaNs or fill nan with median
    if dropNaN:
        df.dropna(inplace=True)
    else:
        df.fillna(df.mean())
    # ------------------------------------------------------------------------------------------------#




    # this will contain all the elements we want to return from this function
    result = {}

    # we will also return the original dataframe itself
    result['df'] = df.copy()




    # ------------------------------------------------------------------------------------------------#
    ## c, use different methods to split the data into train/test data; 
    # e.g. you can split it according to some specified ratio of train/test and you can specify to split it by date or randomly
    # make sure that the passed feature_columns exist in the dataframe
    for col in feature_columns:
        assert col in df.columns, f"'{col}' does not exist in the dataframe."
    # add date as a column
    if "date" not in df.columns:
        print(df.index)
        df['date'] = df.index

    # this will be the length of the sequences that we will use to predict future stock prices
    sequence_data = []
    # restrict the sequence maximum step that can be taken
    sequences = deque(maxlen=n_steps)
    # add the entry to the sequences, then check the sequence length before store target value into sequence_data
    for entry, target in zip(df[feature_columns + ["date"]].values, df['future'].values):
        sequences.append(entry)
        if len(sequences) == n_steps:
            sequence_data.append([np.array(sequences), target])
    # get the last sequence by appending the last `n_step` sequence with `lookup_step` sequence 
    # for instance, if n_steps=50 and lookup_step=10, last_sequence should be of 60 (that is 50+10) length
    # this last_sequence will be used to predict future stock prices that are not available in the dataset.
    last_sequence = list([s[:len(feature_columns)] for s in sequences]) + list(last_sequence)
    last_sequence = np.array(last_sequence).astype(np.float32)
    # add to result
    result['last_sequence'] = last_sequence
    
    # Construct the X's and y's
    X, y = [], []
    # iterate over the sequences and append to X and y
    for seq, target in sequence_data:
        X.append(seq)
        y.append(target)
    # convert to numpy arrays
    X = np.array(X)
    y = np.array(y)

    if split_by_date:
        # split the dataset into training & testing sets by date (not randomly splitting)
        train_samples = int((1 - test_size) * len(X))
        # :train_samples means from the beginning to train_samples, and train_samples: means from train_samples to the end
        result["X_train"] = X[:train_samples]
        result["y_train"] = y[:train_samples]
        result["X_test"] = X[train_samples:]
        result["y_test"] = y[train_samples:]
        if shuffle:
            # shuffle the datasets for training (if shuffle parameter is set)
            shuffle_in_unison(result["X_train"], result["y_train"])
            shuffle_in_unison(result["X_test"], result["y_test"])
    else:
        # split the dataset randomly
        result["X_train"], result["X_test"], result["y_train"], result["y_test"] = train_test_split(X, y, test_size=test_size, shuffle=shuffle)
    # ------------------------------------------------------------------------------------------------#



   
    # ------------------------------------------------------------------------------------------------#
    ## e, scale the feature columns and store the scalers in data structure 
    if scale:
        column_scaler = {}
        # scale the data (prices) from 0 to 1
        for column in feature_columns:
            scaler = preprocessing.MinMaxScaler()
            df[column] = scaler.fit_transform(np.expand_dims(df[column].values, axis=1))
            column_scaler[column] = scaler
        # add the MinMaxScaler instances to the result returned
        result["column_scaler"] = column_scaler
    # ------------------------------------------------------------------------------------------------#
    
    return result

In [90]:
# Amazon stock market
TICKER = "AMZN"

# Start and End date to read:
TEST_START = '2020-01-01'
TEST_END = '2023-01-01'

# Window size or the sequence length
N_STEPS = 50

# whether to scale feature columns & output price as well
SCALE = True

# whether to shuffle the dataset
SHUFFLE = True

# whether to split the training/testing set by date
SPLIT_BY_DATE = False

# test radio size, 0.2 is 20%
TEST_SIZE = 0.2

# features to use
FEATURE_COLUMNS = ["adjclose", "volume", "open", "high", "low"]

In [91]:
data = load_data(ticker=TICKER, TEST_START=TEST_START, TEST_END=TEST_END, n_steps=N_STEPS, scale=SCALE, lookup_step=1, dropNaN=True,
          shuffle=SHUFFLE, split_by_date=SPLIT_BY_DATE, test_size=TEST_SIZE, feature_columns=FEATURE_COLUMNS)

Downloading data for AMZN from yahoo finance
DatetimeIndex(['2020-01-02', '2020-01-03', '2020-01-06', '2020-01-07',
               '2020-01-08', '2020-01-09', '2020-01-10', '2020-01-13',
               '2020-01-14', '2020-01-15',
               ...
               '2022-12-15', '2022-12-16', '2022-12-19', '2022-12-20',
               '2022-12-21', '2022-12-22', '2022-12-23', '2022-12-27',
               '2022-12-28', '2022-12-29'],
              dtype='datetime64[ns]', length=755, freq=None)


In [92]:
data["df"].head()

Unnamed: 0,open,high,low,close,adjclose,volume,ticker,future
2020-01-02,93.75,94.900497,93.207497,94.900497,94.900497,80580000,AMZN,93.748497
2020-01-03,93.224998,94.309998,93.224998,93.748497,93.748497,75288000,AMZN,95.143997
2020-01-06,93.0,95.184502,93.0,95.143997,95.143997,81236000,AMZN,95.343002
2020-01-07,95.224998,95.694504,94.601997,95.343002,95.343002,80898000,AMZN,94.598503
2020-01-08,94.902,95.550003,94.321999,94.598503,94.598503,70160000,AMZN,95.052498
