In [1]:
import os
import glob

from random import random
import numpy as np
import pandas as pd
import math

In [2]:
csv_file = os.path.join('Data','daily_sp500_1998-2013','table_goog.csv')
days_window = 252
scale = True

In [3]:
data = pd.read_csv(csv_file, header=None, names=['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume'],
                parse_dates=['Date'])
data.set_index('Date', inplace=True)
del data['Time']

print(len(data))
data.head()

2260


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-08-19,100.01,104.06,95.96,100.76,20925213
2004-08-20,101.19,109.08,100.5,108.35,11224400
2004-08-23,110.76,113.48,109.56,109.95,8787658
2004-08-24,111.24,111.6,103.57,105.0,7384914
2004-08-25,104.96,108.0,103.88,105.96,4456538


In [4]:
# clear nan and 0 volume
data = data[~np.isnan(data.Volume)]
data.Volume.replace(0, 1, inplace=True)

# Calculate change
data['Change'] = (data.Close - data.Close.shift()) / data.Close.shift()

print(len(data))
data.head()

2260


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-08-19,100.01,104.06,95.96,100.76,20925213,
2004-08-20,101.19,109.08,100.5,108.35,11224400,0.075328
2004-08-23,110.76,113.48,109.56,109.95,8787658,0.014767
2004-08-24,111.24,111.6,103.57,105.0,7384914,-0.04502
2004-08-25,104.96,108.0,103.88,105.96,4456538,0.009143


In [5]:
MinPercentileDays = 100

# pctrank = lambda x: pd.Series(x).rank(pct=True).iloc[-1]
def pctrank(data):
    return pd.Series(data).rank(pct=True).iloc[-1]
    

data['ClosePctl'] = data.Close.expanding(MinPercentileDays).apply(pctrank)
data['VolumePctl'] = data.Volume.expanding(MinPercentileDays).apply(pctrank)
data.dropna(axis=0, inplace=True)

print(len(data))
data.head()

2161


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change,ClosePctl,VolumePctl
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2005-01-10,194.54,198.1,191.83,194.89,7261378,0.006299,0.97,0.54
2005-01-11,195.54,197.71,193.18,193.66,6525067,-0.006311,0.930693,0.435644
2005-01-12,194.33,195.93,190.5,195.34,8018579,0.008675,0.970588,0.588235
2005-01-13,195.38,197.39,194.05,195.45,6351912,0.000563,0.970874,0.427184
2005-01-14,195.85,200.01,194.13,200.0,8227647,0.02328,0.990385,0.625


In [6]:
# Scale
Change = data.Change
if scale:
    mean_values = data.mean(axis=0)
    std_values = data.std(axis=0)
    data = (data - np.array(mean_values)) / np.array(std_values)
data['Change'] = Change

print(data)

                Open      High       Low     Close    Volume    Change  \
Date                                                                     
2005-01-10 -2.091139 -2.096571 -2.077672 -2.086718  0.622560  0.006299   
2005-01-11 -2.084515 -2.099147 -2.068700 -2.094865  0.434411 -0.006311   
2005-01-12 -2.092530 -2.110904 -2.086511 -2.083738  0.816046  0.008675   
2005-01-13 -2.085575 -2.101261 -2.062918 -2.083009  0.390165  0.000563   
2005-01-14 -2.082461 -2.083956 -2.062386 -2.052872  0.869468  0.023280   
2005-01-18 -2.049076 -2.050865 -2.032279 -2.028099  1.833863  0.018700   
2005-01-19 -2.024037 -2.049015 -2.045239 -2.070424  1.417181 -0.031364   
2005-01-20 -2.104122 -2.108791 -2.076542 -2.093408  0.965738 -0.017583   
2005-01-21 -2.091801 -2.114669 -2.097278 -2.126129  0.975394 -0.025480   
2005-01-24 -2.129889 -2.154497 -2.154169 -2.178720  2.226474 -0.042024   
2005-01-25 -2.172879 -2.201327 -2.180953 -2.204751  1.418492 -0.021713   
2005-01-26 -2.192288 -2.154365 -2.1619

In [7]:
min_values = data.min(axis=0)
max_values = data.max(axis=0)

print("Min ===================")
print(min_values)
print("Max ===================")
print(max_values)

Open         -2.218585
High         -2.226690
Low          -2.205676
Close        -2.220449
Volume       -1.010434
Change       -0.097671
ClosePctl    -3.372556
VolumePctl   -1.128832
dtype: float64
Open          2.763816
High          2.724438
Low           2.737641
Close         2.740433
Volume        8.951960
Change        0.198242
ClosePctl     0.923084
VolumePctl    2.917184
dtype: float64
