In [75]:
import os
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import tensorflow as tf
import numpy as np
import scipy as sp
import sklearn as sk

In [172]:
# --- Filesystem layout -------------------------------------------------
PROJECT_PATH = '/pine/scr/s/i/siyangj/DeepStock/FinalProject/'
DATA_PATH = os.path.join(PROJECT_PATH, 'Data/')

# CSV files inside DATA_PATH.
RAW_DATA = os.path.join(DATA_PATH, 'concat.csv')
NAME_DATA = os.path.join(DATA_PATH, 'named.csv')

# HDF5 files inside DATA_PATH, each paired with the key it is stored under.
NAME_HDF5 = os.path.join(DATA_PATH, 'named.h5')
NAME_HDF5_ID = 'NAME_DATA'
CLEAN_HDF5 = os.path.join(DATA_PATH, 'clean.h5')
CLEAN_HDF5_ID = 'CLEAN_DATA'

# --- Ticker symbols (50 entries, order matters) ------------------------
# The order is used when the column MultiIndex is built from this list.
STOCK_NAMES = (
    'AAPL MSFT AMZN GOOG BRKB FB JNJ JPM XOM V '
    'WMT BAC UNH PFE WFC VZ PG CVX T INTC '
    'CSCO HD MA KO BA MRK ORCL DIS CMCSA PEP '
    'C MCD PM DWDP ABBV NFLX MDT ABT NVDA AMGN '
    'NKE ADBE MO LLY MMM IBM HON UNP ACN UTX'
).split()

In [6]:
# Load the raw CSV and label its columns with a (ticker, field) MultiIndex.
#
# Everything happens in one cell and `df` is rebuilt from the file each time,
# so the cell can be re-run safely. The earlier step-by-step in-place edits
# could not: e.g. sorting by 'Unnamed: 0' fails once that column is renamed.

# Per-ticker fields, in the order they are laid out for every ticker.
stock_each = ['open','high','low','close','volume']

# One (ticker, field) pair per data column, ticker-major.
stock_tuples = [(s, e) for s in STOCK_NAMES for e in stock_each]
index = pd.MultiIndex.from_tuples(stock_tuples)

# 'Unnamed: 0' is the CSV's first, header-less column; it becomes the sorted
# 'Time' index (presumably timestamps -- confirm against the raw file).
df = (
    pd.read_csv(RAW_DATA)
    .sort_values('Unnamed: 0')
    .rename(columns={'Unnamed: 0': 'Time'})
    .set_index('Time')
)

# Drop every remaining header-less ('Unnamed...') column.
cols_1 = df.columns.values
df = df.drop([c for c in cols_1 if c.startswith('Unnamed')], axis=1)

# Labels are assigned by position: this assumes the remaining columns are
# ordered ticker-by-ticker as in STOCK_NAMES, with the five fields of
# `stock_each` per ticker (not checked here). pandas raises ValueError if the
# number of columns does not match len(index).
df.columns = index

In [164]:
# No cell shown in this notebook writes NAME_HDF5, so on a fresh run the read
# below would fail. Store the labelled frame the first time through; later
# runs reuse the stored copy.
if not os.path.exists(NAME_HDF5):
    df.to_hdf(NAME_HDF5, key=NAME_HDF5_ID)
df = pd.read_hdf(NAME_HDF5, key=NAME_HDF5_ID)

In [80]:
## Ref: https://stackoverflow.com/a/6520696
def nan_helper(y):
    """Locate the NaN entries of an array.

    Parameters
    ----------
    y : 1d numpy array that may contain NaNs.

    Returns
    -------
    nans : boolean array, True wherever ``y`` is NaN.
    index : callable turning a boolean mask into the integer positions of
        its True entries, i.e. ``index(mask)`` is ``mask.nonzero()[0]``.

    Example
    -------
    >>> # linear interpolation of NaNs
    >>> nans, x = nan_helper(y)
    >>> y[nans] = np.interp(x(nans), x(~nans), y[~nans])
    """

    def positions(mask):
        # First axis of nonzero(): the positions of the True entries.
        return mask.nonzero()[0]

    nan_mask = np.isnan(y)
    return nan_mask, positions

In [116]:
## Use cubic spline to interpolate/extrapolate missing values
def fill_na(y, kind='cubic'):
    """Fill the NaN entries of a 1d array by interpolation, in place.

    Valid samples are interpolated over their integer positions. NaNs before
    the first or after the last valid sample are extrapolated, so filled
    values are unconstrained and may come out non-positive.

    Parameters
    ----------
    y : 1d numpy array, modified in place.
    kind : interpolation kind passed to ``scipy.interpolate.interp1d``
        (default ``'cubic'``).

    Returns
    -------
    The same array object ``y`` with its NaNs replaced.

    Raises
    ------
    ValueError
        If every entry of ``y`` is NaN. ``interp1d`` itself also raises when
        there are too few valid samples for the requested ``kind``.
    """
    # Import the submodule explicitly: `import scipy as sp` alone does not
    # guarantee that `sp.interpolate` has been loaded.
    from scipy import interpolate

    nans, x = nan_helper(y)
    if not nans.any():
        # Nothing to fill; skip building a spline, which would fail on short arrays.
        return y
    if nans.all():
        raise ValueError("Cannot interpolate: all values are NaN")
    f = interpolate.interp1d(x(~nans), y[~nans],
                             kind=kind, bounds_error=False,
                             fill_value='extrapolate')
    y[nans] = f(x(nans))
    return y

In [158]:
def deal_with_open_close(y,minimum=1e-3,strict=True):
    """Fill NaNs in a price series and reject or clamp non-positive results.

    Parameters
    ----------
    y : 1d numpy array of prices; passed through ``fill_na`` first.
    minimum : value written over non-positive entries. Only used when
        ``strict`` is False.
    strict : if True (default), non-positive values after filling raise
        ValueError. If False, they are reported and replaced by ``minimum``.

    Returns
    -------
    The filled array as returned by ``fill_na``.

    Raises
    ------
    ValueError
        When ``strict`` is True and filling produced a value <= 0.
    """
    y = fill_na(y)
    m = y<=0
    if m.any():
        # Show the offending values before failing or clamping.
        print(y[m])
        if strict:
            raise ValueError("Error in interpolation: produced nonpositive!!")
        print("Error in op/cl interpolation: produced nonpositive!!")
        y[m]=minimum
    return y

In [159]:
def deal_with_volume(y):
    """Fill NaNs in a volume series and replace non-positive values.

    After ``fill_na``, every entry <= 0 is reported on stdout and replaced by
    the mean of the positive entries. The mean excludes the bad values so
    that they cannot skew the replacement or leave it non-positive.

    Parameters
    ----------
    y : 1d numpy array of volumes.

    Returns
    -------
    The filled array as returned by ``fill_na``, with all entries positive.

    Raises
    ------
    ValueError
        If no positive value is left to average.
    """
    y = fill_na(y)
    m = y<=0
    if m.any():
        print(y[m])
        print("Error in volume interpolation: produced nonpositive!!")
        if m.all():
            raise ValueError("Error in volume interpolation: no positive values to average")
        ## Set non-positive values to the mean of the positive values
        y[m]=np.mean(y[~m])
    return y

In [160]:
def deal_with_high_low(y,op,cl,ishigh,minimum=1e-3):
    """Clean a high or low series and make it consistent with open/close.

    ``y`` is first passed through ``deal_with_open_close``. The result is
    then combined element-wise with ``op`` and ``cl``: the maximum of the
    three when ``ishigh`` is true, the minimum otherwise.

    Parameters
    ----------
    y : 1d numpy array holding the high (or low) series.
    op, cl : already-cleaned open and close arrays.
    ishigh : True for the high series, False for the low series.
    minimum : forwarded to ``deal_with_open_close``.

    Returns
    -------
    A new array with the element-wise maximum or minimum.
    """
    cleaned = deal_with_open_close(y, minimum=minimum)
    # Pick the ufunc once; reduce() folds it across the three series.
    envelope = np.maximum if ishigh else np.minimum
    return envelope.reduce([cleaned, op, cl])

In [165]:
# open, high, low, close, volume
def df_fill_na(frame=None):
    """Fill missing values for every stock of a frame, in place.

    Columns are read by position in groups of five per stock, ordered
    open, high, low, close, volume. Open and close are cleaned first, then
    volume, then high and low, which are forced to bracket open and close.

    Parameters
    ----------
    frame : DataFrame to clean. Defaults to the notebook-level ``df``.

    Returns
    -------
    None. ``frame`` is modified in place.

    Raises
    ------
    ValueError
        Re-raised from the per-column helpers, after printing the position
        of the first column of the stock being processed.
    """
    if frame is None:
        frame = df
    # Derive the stock count from the frame instead of hard-coding 50.
    n_stocks = frame.shape[1] // 5
    for i in range(n_stocks):
        col = i*5

        def column(offset):
            # Hand the helpers a private, writable copy: they mutate their
            # input, and `.values` may alias the frame or be read-only.
            return frame.iloc[:,col+offset].values.copy()

        try:
            # first deal with open and close
            op = deal_with_open_close(column(0))
            cl = deal_with_open_close(column(3))
            # then deal with volume
            vo = deal_with_volume(column(4))
            # then deal with high and low
            hi = deal_with_high_low(column(1),op,cl,True)
            lo = deal_with_high_low(column(2),op,cl,False)
        except ValueError:
            print(col)
            raise
        # Write back only after the whole stock was cleaned successfully.
        frame.iloc[:,col] = op
        frame.iloc[:,col+1] = hi
        frame.iloc[:,col+2] = lo
        frame.iloc[:,col+3] = cl
        frame.iloc[:,col+4] = vo

In [166]:
# Clean the notebook-level `df` in place. The lines printed below come from
# deal_with_volume, which reports every column where filling left
# non-positive volume values before it replaces them.
df_fill_na()

[-213.92621137]
Error in volume interpolation: produced nonpositive!!
[0. 0.]
Error in volume interpolation: produced nonpositive!!
[0.]
Error in volume interpolation: produced nonpositive!!
[0.]
Error in volume interpolation: produced nonpositive!!
[0 0 0]
Error in volume interpolation: produced nonpositive!!
[-61967.4440437]
Error in volume interpolation: produced nonpositive!!
[-1789087.00053572]
Error in volume interpolation: produced nonpositive!!
[-1628.26772703]
Error in volume interpolation: produced nonpositive!!
[0.]
Error in volume interpolation: produced nonpositive!!
[-48771.87858439]
Error in volume interpolation: produced nonpositive!!
[0.]
Error in volume interpolation: produced nonpositive!!
[0.]
Error in volume interpolation: produced nonpositive!!
[0.]
Error in volume interpolation: produced nonpositive!!
[0. 0.]
Error in volume interpolation: produced nonpositive!!
[ -897.90790326 -2903.44845235 -1485.09416381 -1580.12039935
   -25.7925413   -168.10904142]
Error in 

In [173]:
# Persist the cleaned frame. `key` is passed by keyword because recent
# pandas releases deprecate passing it positionally to `to_hdf`.
df.to_hdf(CLEAN_HDF5, key=CLEAN_HDF5_ID)

In [174]:
# Read the cleaned frame back from disk, so the checks that follow run on
# what was actually stored.
df = pd.read_hdf(CLEAN_HDF5, key=CLEAN_HDF5_ID)

In [176]:
# Sanity check: (row, column) positions of any non-positive entry.
# Two empty arrays mean every value is strictly positive.
np.nonzero((df <= 0).values)

(array([], dtype=int64), array([], dtype=int64))

In [178]:
# Sanity check: (row, column) positions of any remaining missing value.
# Two empty arrays mean nothing is left to fill.
np.nonzero(df.isna().values)

(array([], dtype=int64), array([], dtype=int64))