# Using Kaggle Stock Data for NN Training

In [1]:
import numpy as np
import pandas as pd
import scipy.io as sio
import os

This notebook assumes the raw data has been extracted to `KaggleStocks/data_raw/`.

## Importing Daily Data

In [2]:
# Read prices-split-adjusted.csv into a DataFrame and preview its first five rows.
data_prices = pd.read_csv(filepath_or_buffer='../data_raw/prices-split-adjusted.csv')
data_prices.head()

Unnamed: 0,date,symbol,open,close,low,high,volume
0,2016-01-05,WLTW,123.43,125.839996,122.309998,126.25,2163600.0
1,2016-01-06,WLTW,125.239998,119.980003,119.940002,125.540001,2386400.0
2,2016-01-07,WLTW,116.379997,114.949997,114.93,119.739998,2489500.0
3,2016-01-08,WLTW,115.480003,116.620003,113.5,117.440002,2006300.0
4,2016-01-11,WLTW,117.010002,114.970001,114.089996,117.330002,1408600.0


In [3]:
def symbol_lookup(symbol):
    """Return the rows of ``data_prices`` whose 'symbol' column equals ``symbol``.

    The argument is compared against whatever the column currently holds, so
    once the notebook has replaced the ticker strings with integer codes the
    integer code has to be passed instead of the ticker.
    """
    matches_symbol = data_prices['symbol'] == symbol
    return data_prices.loc[matches_symbol]

def symbol_open(symbol):
    """Return the 'open' column of the ``data_prices`` rows matching ``symbol``."""
    return symbol_lookup(symbol)['open']

def symbol_close(symbol):
    """Return the 'close' column of the ``data_prices`` rows matching ``symbol``.

    Named ``symbol_close`` so that it no longer shadows ``symbol_open``, which
    returns the 'open' column.
    """
    return symbol_lookup(symbol)['close']

def date_lookup(date):
    """Return the rows of ``data_prices`` whose 'date' column equals ``date``.

    The argument is compared against whatever the column currently holds, so
    once the notebook has converted the dates to day offsets the offset has to
    be passed instead of a date string.
    """
    on_date = data_prices['date'] == date
    return data_prices.loc[on_date]


Convert each date to the number of days elapsed since the first date available.

In [4]:
# Parse the date strings, then express every date as the number of days
# elapsed since the earliest date in the table.
parsed_dates = pd.to_datetime(data_prices['date'])
min_date = parsed_dates.min()
data_prices['date'] = (parsed_dates - min_date) / np.timedelta64(1, 'D')

Creating a map from each symbol to its associated number (TensorFlow probably won't like strings).

In [5]:
# Give every distinct ticker a consecutive integer code, numbered in the order
# in which unique() returns the tickers.
symbol_map = {
    ticker: code
    for code, ticker in enumerate(data_prices['symbol'].unique())
}

In [6]:
# Swap each ticker in the 'symbol' column for its integer code from symbol_map.
data_prices['symbol'] = data_prices['symbol'].replace(to_replace=symbol_map)

Now we need to build our data structure: a 3D array indexed by (date, stock, daily value), rather than a dictionary with dates as keys and a 2D data array as values.

In [7]:
# Columns stored for every (date, stock) pair; the commented-out variant stores
# the symbol code instead of the date as the first value.
# indices_list = ['symbol', 'open', 'close', 'low', 'high', 'volume']
indices_list   = ['date', 'open', 'close', 'low', 'high', 'volume']

# Each (date, symbol) pair may occur only once, otherwise two rows would
# compete for the same slot of the array.
assert not data_prices.duplicated(subset=['date', 'symbol']).any()

# np.unique returns the dates sorted; date_index[k] is the position of row k's
# date within that sorted list, i.e. its index along the first axis.
unique_dates, date_index = np.unique(data_prices['date'].values, return_inverse=True)
# The 'symbol' column holds the integer codes from symbol_map, which are used
# directly as positions along the stock axis.
symbol_index = data_prices['symbol'].values.astype(int)

num_dates      = len(unique_dates)
num_stocks     = len(data_prices['symbol'].unique())
num_daily_data = len(indices_list)

# data_array[d, s, :] holds the indices_list values of stock s on the d-th
# date; (date, stock) pairs without a row in data_prices stay at zero.
# DataFrame.as_matrix was removed in pandas 1.0, so the values are taken with
# column selection + .values and written in a single vectorised assignment
# instead of one boolean filter per (date, stock) pair.
data_array = np.zeros((num_dates, num_stocks, num_daily_data))
data_array[date_index, symbol_index, :] = data_prices[indices_list].values
    

In [8]:
# Spot check: the values stored for the date at index 1761 and the stock with code 5.
data_array[1761, 5]


array([  2.55200000e+03,   7.93499980e+01,   7.81900020e+01,
         7.79599990e+01,   7.94899980e+01,   1.38750000e+06])

In [9]:
data_len = len(data_array)

# training data length: the first three quarters of the available dates
train_len = round(3 * data_len / 4)

# Each input is one full day of data; its target is the value at feature
# index 1 (the opening value) of every stock on the following day.
x_train = data_array[:train_len]
y_train = data_array[1:train_len + 1, :, 1]

# The test inputs begin one day after the last training target and stop one
# day before the end, so that every input still has a next-day target.
x_test = data_array[train_len + 1:-1]
y_test = data_array[train_len + 2:, :, 1]

# For now, flatten each day's (stock, feature) grid into a single vector.
x_train = x_train.reshape(train_len, -1)
x_test = x_test.reshape(len(y_test), -1)

In [38]:
# Cross-check against the raw table: the row of symbol code 100 at day offset 1449.
temp3 = data_prices.loc[data_prices['symbol'] == 100]
temp3.loc[temp3['date'] == 1449.0]

Unnamed: 0,date,symbol,open,close,low,high,volume
474115,1449.0,100,73.610001,73.07,72.93,73.610001,427700.0


In [39]:
# Row 1000 of the flattened inputs; column 602 = 100 * 6 + 2, i.e. stock code
# 100 and feature index 2 ('close') of the six values stored per stock.
x_train[1000][602]

73.069999999999993

In [31]:
# Write the full data array to a MATLAB .mat file under the variable name 'data'.
sio.savemat(file_name='../daily_data.mat', mdict={'data': data_array})

## Importing Yearly Fundamentals

In [None]:
# Read fundamentals.csv into a DataFrame and preview its first five rows.
data_fund = pd.read_csv(filepath_or_buffer='../data_raw/fundamentals.csv')
data_fund.head()

In [None]:
def fund_lookup(date):
    """Return the rows of ``data_fund`` whose 'Period Ending' column equals ``date``."""
    ends_on_date = data_fund['Period Ending'] == date
    return data_fund.loc[ends_on_date]

In [None]:
# List the column labels of the fundamentals table.
data_fund.columns

In [None]:
# Express each 'Period Ending' date as days elapsed since min_date (the
# earliest date of the daily price table), the same scale as the daily dates.
period_ending = pd.to_datetime(data_fund['Period Ending'])
data_fund['Period Ending'] = (period_ending - min_date) / np.timedelta64(1, 'D')

In [None]:
# Swap each ticker in 'Ticker Symbol' for its integer code from symbol_map;
# values that are not keys of symbol_map are left as they are.
data_fund['Ticker Symbol'] = data_fund['Ticker Symbol'].replace(to_replace=symbol_map)

In [None]:
# Every column label of the fundamentals table except 'Period Ending' and 'Unnamed: 0'.
blah = data_fund.columns.drop(labels=['Period Ending', 'Unnamed: 0'])

In [None]:
# Map each distinct 'Period Ending' value to a 2D array holding the rows
# reported for that period, restricted to the columns listed in blah.
# DataFrame.as_matrix was removed in pandas 1.0; selecting the columns and
# taking .values produces the same array on old and new pandas alike.
fund_dict = {
    date: fund_lookup(date)[blah].values
    for date in data_fund['Period Ending'].unique()
}

In [None]:
# Show the distinct 'Period Ending' values that fund_dict is keyed by.
fund_dict.keys()

In [None]:
# Inspect the array stored for the period ending 1818.0 days after min_date.
fund_dict[1818.0]