In [None]:
# This is just a preamble that sets a bunch of options up.

# render graphs inline
%matplotlib inline

import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Plot styling: ggplot theme and a wide default figure size.
matplotlib.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 5)

# pandas display options. The option is spelled out in full because
# set_option() requires the pattern to match exactly one option; the bare
# 'precision' pattern can match more than one option in newer pandas
# releases and raise OptionError.
pd.set_option('display.precision', 5)
# Floats are rendered with three decimals in printed/displayed frames.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [None]:
# Location of the historical quotes CSV used throughout the notebook.
# Set the HISTORICAL_QUOTES_CSV environment variable to point at your own
# copy of the file; when it is unset, the original hard-coded location is used.
import os

path = os.environ.get(
    'HISTORICAL_QUOTES_CSV',
    '/Users/terence/Downloads/HistoricalQuotes (4).csv',
)

In [None]:
# Print the raw CSV row by row with the standard-library csv module,
# to show what the unparsed file looks like.
import csv

# newline='' hands newline handling to the csv module, as its documentation
# requires; without it, newlines embedded in quoted fields may be misread.
with open(path, 'r', newline='') as fi:
    for row in csv.reader(fi):
        print(', '.join(row))

### Look how _pretty_ this is!

In [None]:
# Parse the same CSV file into a pandas DataFrame.
dataset = pd.read_csv(path)
# Preview the first rows as the cell output.
dataset.head()

In [None]:
# Column arithmetic works element-wise: subtract each row's low from its high
# and preview the first rows of the result.
daily_range = dataset['high'] - dataset['low']
daily_range.head()

In [None]:
# Index the quotes by date and sort them oldest-first.
# NOTE(review): the first data row is discarded here — presumably it is not a
# regular dated quote in this export; confirm against the CSV.
# .copy() makes the sliced frame independent of the frame it was cut from, so
# the column assignment below does not trigger SettingWithCopyWarning.
dataset = dataset.iloc[1:].copy()
# Dates are expected in YYYY/MM/DD form; anything else raises in to_datetime.
dataset['date'] = pd.to_datetime(dataset['date'], format='%Y/%m/%d')
dataset = dataset.set_index('date').sort_index()
dataset.head()

In [None]:
# Rescale volume: cast to float and multiply by 1e-6 so the column is
# expressed in millions. The column is overwritten in place, so re-running
# this cell scales it again.
volume_as_float = dataset['volume'].astype(float)
dataset['volume'] = volume_as_float * 1e-6

In [None]:
# What does the volume look like?
# Line plot of the (rescaled) volume column against the DataFrame index.
dataset['volume'].plot()

## Plot prices!

In [None]:
# Draw the four price columns (open, close, high, low) as lines on one chart.
dataset[['open', 'close', 'high', 'low']].plot()

# Let's predict closing prices with Machine Learning!

In [None]:
# Separate the columns into model inputs and the prediction target.
# X: explanatory variables (volume, open, high, low). Y: dependent variable (close).

X = dataset.loc[:, ['volume', 'open', 'high', 'low']]
Y = dataset.loc[:, 'close']

# Preview both objects; display() renders each argument in turn.
display(X.head(), Y.head())

In [None]:
# Hold-out split by position: the first 90% of rows (the earliest dates,
# since the frame is sorted by its date index) train the model and the
# remaining rows are kept for testing.

train_len = int(.9 * len(dataset))

X_train, X_test = X.iloc[:train_len], X.iloc[train_len:]
Y_train, Y_test = Y.iloc[:train_len], Y.iloc[train_len:]

print('Train size:', len(X_train))
print('Test size:', len(X_test))

In [None]:
# Import the algorithm!
# LinearRegression is scikit-learn's linear model estimator.

from sklearn.linear_model import LinearRegression
# Print the class's built-in documentation as the cell output.
help(LinearRegression)

In [None]:
# Steps 1 and 2: create the model and fit it on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.

model = LinearRegression().fit(X_train, Y_train)

# Step 3: test it! Print the model's score on the training split, then on
# the held-out test split, one per line.

print(
    model.score(X_train, Y_train),
    model.score(X_test, Y_test),
    sep='\n',
)


## How does the machine predict?

In [None]:
# Render the fitted model as a readable equation:
#   close = <coef> * <column> + ... + <intercept>
# Each coefficient is paired with the training column it belongs to, and the
# intercept is appended as the final term.

eq = "close = " + " + ".join(
    ["%.6f * %s" % (weight, column)
     for weight, column in zip(model.coef_, X_train.columns)]
    + ["%.6f" % model.intercept_]
)

print("Linear regression model:")
print(eq)

# Let's try a different feature set! How about past prices?

In [None]:
# Add one lagged column per look-back day: prev_close_<i> holds the closing
# price from i rows earlier. shift(i) leaves NaN in the first i rows of each
# new column because there is no earlier row to pull from.
days_to_look_back = 5

for i in range(1, days_to_look_back + 1):
    dataset['prev_close_%d' % i] = dataset['close'].shift(i)

dataset.head()

In [None]:
# Remove every row that contains a NaN in any column (this includes the
# leading rows whose lagged values are missing), then preview the result.

dataset = dataset.dropna()
dataset.head()

In [None]:
# Build the lagged-close feature set, split it into training and test sets,
# and fit a second linear regression on it.

from sklearn.linear_model import LinearRegression

X2 = dataset[['prev_close_%d' % i for i in range(1, days_to_look_back + 1)]]

Y2 = dataset['close']

# Recompute the 90/10 boundary from the current frame: rows with NaN were
# dropped after train_len was first computed, so reusing that value would
# shrink the test set instead of keeping a 90/10 split.
train_len2 = int(len(X2) * .9)

X2_train = X2[:train_len2]
Y2_train = Y2[:train_len2]
X2_test = X2[train_len2:]
Y2_test = Y2[train_len2:]

print('Train size:', len(X2_train))
print('Test size:', len(X2_test))

model2 = LinearRegression()
# fit() is the last expression, so the fitted estimator is the cell's output.
model2.fit(X2_train, Y2_train)

In [None]:
# Compare predicted and actual closing prices on the test split:
# predictions on the x-axis, actual values on the y-axis, drawn in red.

Y2_pred = model2.predict(X2_test)
ax = plt.gca()
ax.scatter(Y2_pred, Y2_test, c='r')
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Score of the lagged-close model on the training split, then on the test
# split, one per line.
print(
    model2.score(X2_train, Y2_train),
    model2.score(X2_test, Y2_test),
    sep='\n',
)

# Which days are great for stocks?

In [None]:
# Open-to-close return for each row: the change from open to close,
# expressed as a fraction of the opening price.
open_to_close = dataset['close'] - dataset['open']
dataset['return'] = open_to_close / dataset['open']

In [None]:
# Define previous-day open-to-close returns: prev_return_<i> holds the return
# from i rows earlier. shift(i) leaves NaN in the first i rows of each new
# column, so those rows are dropped afterwards.

days_to_look_back = 5

for i in range(1, days_to_look_back + 1):
    dataset['prev_return_%d' % i] = dataset['return'].shift(i)

dataset.dropna(inplace=True)
# Preview as the cell's last expression, matching the other preview cells.
dataset.head()

In [None]:
# Train a model for today's return based on previous-day returns.
# X holds the lagged return columns; Y is the same row's return.

X = dataset[['prev_return_%d' % i for i in range(1, days_to_look_back + 1)]]
Y = dataset['return']

# Recompute the 90/10 boundary from the current frame: rows have been
# dropped since train_len was last computed, so the old value no longer
# marks 90% of the data.
train_len = int(len(dataset) * .9)

X_train = X[:train_len]
Y_train = Y[:train_len]
X_test = X[train_len:]
Y_test = Y[train_len:]

print('Train size:', len(X_train))
print('Test size:', len(X_test))

model = LinearRegression()
# fit() is the last expression, so the fitted estimator is the cell's output.
model.fit(X_train, Y_train)

In [None]:
# Score of the return model on the training split, then on the test split,
# one per line.
print(
    model.score(X_train, Y_train),
    model.score(X_test, Y_test),
    sep='\n',
)

In [None]:
# Compare predicted and actual returns on the test split. The predictions are
# wrapped in a Series that shares the test targets' index, then plotted with
# predictions on the x-axis and actual values on the y-axis, in red.
Y_pred = pd.Series(model.predict(X_test), index=Y_test.index)
ax = plt.gca()
ax.scatter(Y_pred, Y_test, c='r')
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Line plot of the open-to-close return column against the DataFrame index.
dataset['return'].plot()

## What have we learned?