In [1]:
import pandas as pd
import numpy as np

In [2]:
b = 15
f = 4
value_num = b + f

In [3]:
df = pd.read_csv('./clean_dataframe')
df.set_index('Unnamed: 0', inplace=True)
df.index.names = ['date']
df.head()

Unnamed: 0_level_0,DIS
date,Unnamed: 1_level_1
2008-10-01,26.028379
2008-10-02,25.417543
2008-10-03,25.061228
2008-10-06,23.975292
2008-10-07,22.541527


## Window Batching

In [4]:
batches_by_equity = []

for equity in df.columns:
    batches = []
    for i in range(len(df) - value_num):
        batches.append(df[equity][(0+i):(value_num+i)])
    np.random.shuffle(batches)
    batches_by_equity.append(batches)

###### Batch dataframe structure: `[equity_name][shuffled_interval_time_sequence_dataframe]`

In [5]:
# all_the_batches\[msft\]\[batch_number_243\]
batches_by_equity[0][243]

date
2015-03-04     98.392662
2015-03-05     97.889359
2015-03-06     96.761627
2015-03-09     98.103737
2015-03-10     96.081261
2015-03-11     95.894875
2015-03-12     99.883873
2015-03-13     99.203514
2015-03-16    100.070274
2015-03-17     99.688148
2015-03-18    100.629478
2015-03-19    100.070274
2015-03-20    101.058205
2015-03-23    100.862495
2015-03-24     99.827957
2015-03-25     97.861412
2015-03-26     98.085091
2015-03-27     98.308769
2015-03-30     98.905266
Name: DIS, dtype: float64

In [6]:
proccessed_batches_by_equity = []
for equity_batch_set in batches_by_equity:
    processed_batch_set = []
    for batch in equity_batch_set:     
        full_seq = batch.pct_change().dropna(inplace=False)
        df_future = full_seq[-f:]
        df_past = full_seq[:b]

        min_max = max(abs(np.min(df_past)), abs(np.max(df_past)))
        df_past = np.interp(df_past, (-min_max, min_max), (-1, 1))
        df_future = np.interp(df_future, (-min_max, min_max), (-1, 1)) # future note: mayb find diff bounds than b_seq min_max
        processed_batch_set.append([df_past, df_future])
    proccessed_batches_by_equity.append(processed_batch_set)

###### Proccessed batches structure: `[equity number][sample number][past or future]`

## Train test split

In [7]:
dataset = []
for equity in proccessed_batches_by_equity:
    train_dataset = equity[0: int(0.8 * len(equity))]
    test_dataset = equity[int(0.8 * len(equity)):]
    equity_dataset = [train_dataset, test_dataset]
    dataset.append(equity_dataset)

###### dataset structure: `[equity number][train or test][sample number][past or future]`

In [8]:
# MSFT, train dataset, 246th sample
dataset[0][0][245]

[array([ 0.56701509,  0.11481071, -0.11467115, -0.37574573,  0.36678036,
        -0.14615776, -0.36592675,  0.96560233,  0.14544839, -0.82984677,
         0.83721172, -0.39419055, -1.        , -0.08423274,  0.4742075 ]),
 array([ 0.4742075 , -0.31455417,  0.43132636,  0.19895961])]

In [9]:
dataset = np.array(dataset)

In [10]:
np.save('./dataset', dataset)