In [1]:
# Tensorflow / Keras
from tensorflow import keras # for building Neural Networks
print('Tensorflow/Keras: %s' % keras.__version__) # print version
from keras.models import Sequential # for creating a linear stack of layers for our Neural Network
from keras import Input # for instantiating a keras tensor
from keras.layers import Dense, SimpleRNN # for creating regular densely-connected NN layers and RNN layers

# Data manipulation
import pandas as pd # for data manipulation
print('pandas: %s' % pd.__version__) # print version
import numpy as np # for data manipulation
print('numpy: %s' % np.__version__) # print version
import math # to help with data reshaping of the data

# Sklearn
import sklearn # for model evaluation
print('sklearn: %s' % sklearn.__version__) # print version
from sklearn.model_selection import train_test_split # for splitting the data into train and test samples
from sklearn.metrics import mean_squared_error # for model evaluation metrics
from sklearn.preprocessing import MinMaxScaler # for feature scaling
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor



Tensorflow/Keras: 2.11.0
pandas: 1.3.5
numpy: 1.21.5
sklearn: 1.0.2


In [2]:
import csv
import os

import numpy as np
import quandl

# Take the Quandl API key from the QUANDL_API_KEY environment variable so the
# secret never has to be typed into (or committed with) the notebook.
# When the variable is unset this falls back to the empty string, which is the
# value the notebook assigned before.
quandl.ApiConfig.api_key = os.environ.get('QUANDL_API_KEY', '')


In [3]:
# Download weekly-collapsed, 'rdiff'-transformed WIKI/AAPL rows for three date
# windows (train, test, and the two combined) and turn each result into a
# list-of-lists table whose first row is a header.
# NOTE(review): no 'order' parameter is sent, so rows arrive in whatever order
# the service uses by default -- confirm it is oldest-first, because later cells
# treat the last row of every 5-row group as the value to predict.


def _fetch_weekly_rdiff(start_date, end_date):
    """Request the WIKI/AAPL data object for one date window."""
    return quandl.Dataset('WIKI/AAPL').data(params={
        'start_date': start_date,
        'end_date': end_date,
        'collapse': 'weekly',
        'transformation': 'rdiff',
        'rows': 1000,
    })


def _records_to_table(records):
    """Build [header, row, row, ...] from an iterable of indexable records.

    Each output row is the first 10 characters of str(record[0]) followed by
    fields 8..12 of the record, which the header labels adj_open .. adj_volume
    (presumably the adjusted-price columns -- confirm against the dataset schema).
    """
    table = [["date", 'adj_open', 'adj_high', 'adj_low', 'adj_close', 'adj_volume']]
    table.extend(
        [str(record[0])[:10]] + [record[position] for position in (8, 9, 10, 11, 12)]
        for record in records
    )
    return table


train_dataset_data = _fetch_weekly_rdiff('2010-01-01', '2017-01-01')
test_dataset_data = _fetch_weekly_rdiff('2017-01-01', '2018-01-01')
total_dataset_data = _fetch_weekly_rdiff('2010-01-01', '2018-01-01')

train_np = train_dataset_data.to_numpy()
test_np = test_dataset_data.to_numpy()
total_np = total_dataset_data.to_numpy()

train_data = _records_to_table(train_np)
test_data = _records_to_table(test_np)
total_data = _records_to_table(total_np)

In [4]:
#write data gathered into CSV files
# One file per table, written next to the notebook. Rows are terminated with
# '\n' rather than the csv module's default terminator.
# NOTE: the file names spell "APPL" while the dataset code is 'WIKI/AAPL'; the
# names are kept exactly as they were so anything reading these files still works.
for csv_path, table in (
    ("./APPL_train.csv", train_data),
    ("./APPL_test.csv", test_data),
    ("./APPL_total.csv", total_data),
):
    with open(csv_path, 'w') as handle:
        csv.writer(handle, lineterminator='\n').writerows(table)

In [None]:
#splits samples based off of months
# Row 0 of each table is the header, so the scan starts at row 1. Each data row
# is paired with the row after it; when the first 7 characters of their date
# strings differ (presumably the "YYYY-MM" part -- confirm the date format),
# the position of the earlier row is recorded. The final data row has no
# successor and is therefore never recorded.

train_samples_index = []
for position, (row, following_row) in enumerate(zip(train_data[1:], train_data[2:]), start=1):
    if row[0][:7] != following_row[0][:7]:
        train_samples_index.append(position)
        print(position)
    else:
        print("same")

test_samples_index = []
for position, (row, following_row) in enumerate(zip(test_data[1:], test_data[2:]), start=1):
    if row[0][:7] != following_row[0][:7]:
        test_samples_index.append(position)
        print(position)
    else:
        print("same")

In [6]:
#gets indexes for samples based off grouping of 4weeks input, 1 week output


def _group_end_indexes(rows, group_size=5):
    """Return the row index on which each complete group of data rows ends.

    `rows` is a table whose row 0 is the header, so data rows sit at indexes
    1 .. len(rows) - 1 and complete groups end at group_size, 2 * group_size, ...

    The scan runs up to and including the last data row. The earlier version
    iterated over range(len(rows) - 1), which stops one index short, so a final
    complete group ending exactly on the last row was dropped.

    Returns an empty list when there are fewer than `group_size` data rows.
    """
    return list(range(group_size, len(rows), group_size))


train_samples_index = _group_end_indexes(train_data)
test_samples_index = _group_end_indexes(test_data)
total_samples_index = _group_end_indexes(total_data)

In [7]:
#splits samples based on indexes 


def _slice_by_boundaries(rows, boundaries):
    """Cut `rows` into consecutive chunks, each ending on one of `boundaries`.

    The first chunk starts at row 1 (row 0 is the header); every later chunk
    starts on the row right after the previous boundary. Each boundary row is
    included in its own chunk. Rows after the last boundary are not returned.
    """
    stops = [boundary + 1 for boundary in boundaries]
    starts = [1] + stops[:-1]
    return [rows[lo:hi] for lo, hi in zip(starts, stops)]


train_samples = _slice_by_boundaries(train_data, train_samples_index)
total_samples = _slice_by_boundaries(total_data, total_samples_index)

print()




In [8]:
#make X and y data ~ X (stock data) ~ y (resulting week's open/close)
# Each sample is a list of rows shaped
# [date, adj_open, adj_high, adj_low, adj_close, adj_volume].
# X gets the per-column average of every row except the last one;
# y gets one value (open or close, chosen by `goal`) taken from the last row.

X_data = []
y_data = []
goal = "close_price"

# Row position of the target value for each supported goal.
_target_column_by_goal = {"open_price": 1, "close_price": 4}
if goal not in _target_column_by_goal:
    # Fail loudly: silently skipping the append would leave y_data shorter than X_data.
    raise ValueError("goal must be 'open_price' or 'close_price', got %r" % (goal,))
_target_column = _target_column_by_goal[goal]

for sample in total_samples:
    # All rows but the last are model input; the last row supplies the target.
    input_rows = sample[:-1]

    # Average columns 1..5 across the input rows (column 0, the date, is skipped).
    # A comprehension is used instead of lists named open/high/low/close/volume so
    # the builtin open() is not overwritten at notebook scope, which would break
    # any later `with open(...)` call.
    week_data = [
        np.average([row[column] for row in input_rows])
        for column in range(1, 6)
    ]
    X_data.append(week_data)
    y_data.append(float(sample[-1][_target_column]))

print()




In [10]:
#gets LOOCV indexes for train and test sets 
# One (train_indexes, held_out_index) pair per sample: the held-out index is the
# single test sample and train_indexes lists every other position in X_data.

_sample_count = len(X_data)
LOOCV_indexes = [
    ([other for other in range(_sample_count) if other != held_out], held_out)
    for held_out in range(_sample_count)
]
print()




In [12]:
#runs Gradient Boosted Regressor on every leave-one-out fold and records the
#relative error of the prediction for the held-out sample

total_diff = []    # relative error of each fold, in fold order
total_result = []  # [predicted, actual, relative error] for each fold
for train_indexes, held_out_index in LOOCV_indexes:
    # Training set: every sample except the held-out one.
    clf_train_X_data = [X_data[index] for index in train_indexes]
    clf_train_y_data = [y_data[index] for index in train_indexes]
    # Test set: just the held-out sample, wrapped in a list because predict()
    # is given a list of feature rows.
    clf_test_X_data = [X_data[held_out_index]]
    clf_test_y_data = [y_data[held_out_index]]

    # A fresh regressor is fitted for every fold; random_state=0 keeps runs
    # repeatable. (A GradientBoostingClassifier used to be constructed here and
    # was immediately overwritten without ever being fitted, so it was removed.)
    clf = GradientBoostingRegressor(random_state=0)
    clf.fit(clf_train_X_data, clf_train_y_data)

    predicted_value = clf.predict(clf_test_X_data)[0]
    actual_value = clf_test_y_data[0]

    # Signed error as a fraction of the actual value (not multiplied by 100).
    # NOTE(review): an actual_value of exactly 0 is not guarded against and
    # would make this ratio infinite or undefined.
    difference = predicted_value - actual_value
    difference_percentage = difference / actual_value
    total_diff.append(difference_percentage)
    total_result.append([predicted_value, actual_value, difference_percentage])
    print(difference_percentage)

0.1805379479950512
-0.0894484035942062
0.01656581365015734
0.025209665029444946
-0.03550209127690363
-0.04039172223737765
-0.05762964824017409
-0.09089549367657378
0.0461011242757215
-0.03590480972693856
0.10727371395201321
-0.006631084073566009
0.008103081328596672
-0.0361480788240542
0.09988416752080308
-0.10777070123854414
0.12327187704816564
-0.137426901339377
0.026590239616176094
-0.006188532692199492
0.02502570701246999
-0.09503409579532388
-0.22188222198557953
0.03713732940993011
-0.05631289199768337
-0.039775584516180275
-0.0772181194372331
-0.031348381403516326
0.07217445536688222
0.06550840435064144
0.05139309786046097
0.206361794272836
0.11358782444575295
0.04018284901818687
-0.07351612861084855
0.06780987997577227
0.04996126204263258
-0.11439347695029282
-0.013287478629031717
-0.08413525264325929
0.025510252343555265
0.06986504201059647
-0.01457081622758076
0.05641815382210048
-0.042571912162653824
-0.03402348367416721
-0.033367588669337825
0.0012010151916563932
-0.02096124

In [13]:
# Summarise the per-fold relative errors: overall mean, plus the mean of the
# over-estimates and of the under-estimates taken separately.
avg_diff = np.average(total_diff)

# Strictly positive errors are over-estimates; everything else (including an
# exact 0) is counted as an under-estimate.
overEstimate = [error for error in total_diff if error > 0]
underEstimate = [error for error in total_diff if not error > 0]

avg_over_diff = np.average(overEstimate)
avg_under_diff = np.average(underEstimate)

for label, value in (
    ("avg_diff: ", avg_diff),
    ("avg_over_estimate: ", avg_over_diff),
    ("avg_under_estimate: ", avg_under_diff),
):
    print(label + str(value))

avg_diff: 0.0017531120944778407
avg_over_estimate: 0.0568757621294449
avg_under_estimate: -0.05205709389203764
