In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn import datasets, linear_model

In [31]:
def read_goog_sp500_data():
    googFile = 'data/GOOG.csv'
    spFile = 'data/GSPC.csv'
    
    goog = pd.read_csv(googFile, sep=",", usecols=[0,6], names=['Date', 'Goog'], header=0)
    sp = pd.read_csv(spFile, sep=",", usecols=[0,6], names=['Date', 'SP500'], header=0)
    goog['SP500'] = sp['SP500']
    
    # Format the date date object into a datetime
    goog['Date'] = pd.to_datetime(goog['Date'], format='%Y/%m/%d')
    
    returns = goog[
        [key for key in dict(goog.dtypes) if dict(goog.dtypes)[key] in ['float64', 'int64']]].pct_change()
    print(returns)
    xData = np.array(returns['SP500'][1:])
    yData = np.array(returns['Goog'][1:])
    return xData, yData

In [32]:
xData, yData = read_goog_sp500_data()
googModel = linear_model.LinearRegression()
googModel.fit(xData.reshape(-1, 1), yData.reshape(-1, 1))
# print(googModel.coef_)
# print(googModel.intercept_)

         Goog     SP500
0         NaN       NaN
1   -0.266038 -0.137519
2    0.157814 -0.139216
3   -0.237414 -0.095379
4    0.129524 -0.071716
..        ...       ...
116  0.231549  0.070092
117  0.101180  0.093222
118  0.924289  0.919525
119 -0.579488 -0.518566
120 -0.928284 -0.960255

[121 rows x 2 columns]


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [27]:
W = tf.Variable(tf.zeros([1,1]))
b = tf.Variable(tf.zeros([1]))

x = tf.placeholder(tf.float32, [None, 1])

Wx = tf.matmul(x, W)

y = Wx + b

y_ = tf.placeholder(tf.float32, [None, 1])
cost = tf.reduce_mean(tf.square(y_ -y))

# train_step_constant = tf.train.GradientDescentOptimizer(0.1).minimize(cost)
# train_step_constant = tf.train.AdagradOptimizer(1).minimize(cost)
train_step_constant = tf.train.FtrlOptimizer(1).minimize(cost)

def trainWithOnePointPerEpoch(steps, train_step):
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for i in range(steps):
            xs = np.array([[xData[i % len(yData)]]])
            ys = np.array([[yData[i % len(yData)]]])
            
            feed = {x: xs, y_: ys}
            # this is the elephant in the room
            sess.run(train_step, feed_dict=feed)
            
            if (i + 1) % 1000 == 0:
                print("After %d iteration:" % i)
                print("W: %f" % sess.run(W))
                print("b: %f" % sess.run(b))
                
                print("cost: %f" % sess.run(cost, feed_dict=feed))
                

dataset_size = len(xData)

def trainWithMultiplePointsPerEpoch(steps, train_step, batch_size):
    init = tf.global_variables_initializer()
    
    with tf.Session() as sess:
        sess.run(init)
        
        for i in range(steps):
            if dataset_size == batch_size:
                batch_start_idx = 0
            elif dataset_size < batch_size:
                raise ValueError("dataset_size: %d, must be greater than batch_size: %d" % (dataset_size, batch_size))
            else:
                batch_start_idx = (i * batch_size) % (dataset_size)
            
            batch_end_idx = batch_start_idx + batch_size
            
            batch_xs = xData[batch_start_idx : batch_end_idx]
            batch_ys = yData[batch_start_idx : batch_end_idx]
            
            feed = {x: batch_xs.reshape(-1, 1), y_:batch_ys.reshape(-1, 1)}
            
            sess.run(train_step, feed_dict=feed)
            
            if (i + 1) % 500 == 0:
                print("After %d iteration:" % i)
                print("W: %f" % sess.run(W))
                print("b: %f" % sess.run(b))
                
                print("cost: %f" % sess.run(cost, feed_dict=feed))
                

# trainWithOnePointPerEpoch(10000, train_step_constant)
trainWithMultiplePointsPerEpoch(5000, train_step_constant, len(xData))

After 499 iteration:
W: 1.035515
b: 0.015565
cost: 0.048299
After 999 iteration:
W: 1.035515
b: 0.015565
cost: 0.048299
After 1499 iteration:
W: 1.035515
b: 0.015565
cost: 0.048299
After 1999 iteration:
W: 1.035515
b: 0.015565
cost: 0.048299
After 2499 iteration:
W: 1.035515
b: 0.015565
cost: 0.048299
After 2999 iteration:
W: 1.035515
b: 0.015565
cost: 0.048299
After 3499 iteration:
W: 1.035515
b: 0.015565
cost: 0.048299
After 3999 iteration:
W: 1.035515
b: 0.015565
cost: 0.048299
After 4499 iteration:
W: 1.035515
b: 0.015565
cost: 0.048299
After 4999 iteration:
W: 1.035515
b: 0.015565
cost: 0.048299


In [None]:
# Multiple regression
def read_xom_oil_nasdaq_data():
    def readFile(filename):
        data = pd.read_csv(filename, sep=",", usecols=[0, 7], name=['Date', 'Price'], header=0)
        # sort the data in acscending orderof date
        data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')
        
        data = data.sort_values(['Date'], ascending=[True])
        returns = data[[key for key in dict(data.dtypes) if dict(data.dtypes)[key] in ['float64', 'int64']]].pct_changes()
        return np.array(returns['Price'])[1:]
    
    nasdaqData = readFile('data/GPSC.csv')
    oilData = readFile('data/USO.csv')
    xomData = readFile('data/XOM.csv')
    
    return (nasdaqData, oilData, xomData)