# Traditional Stock Price Prediction 

In [241]:
import numpy as np
import pandas as pd
from pandas_datareader import data as pdr
import fix_yahoo_finance as yf
import datetime

yf.pdr_override()

In [242]:
def computeClassification(actual):
    if actual > 0:
        return 1
    else:
        return -1

In [243]:
start = datetime.datetime(2010,1,1)
end = datetime.datetime(2019,5,31)
df = pdr.get_data_yahoo('AMZN', start=start, end=end)

[*********************100%***********************]  1 of 1 downloaded


In [244]:
#Calculate daily returns
df['returns'] = np.log(df['Close'] / df['Close'].shift(1))
df['returns'].fillna(0)
# calculate daily returns
df['returns'] = np.log(df['Close'] / df['Close'].shift(1))
df['returns'].fillna(0)
df.iloc[:, len(df.columns) - 1] = df.iloc[:, len(df.columns) - 1].apply(computeClassification)

In [245]:
#Compute the last column (Y) -1 = down, 1 = up by applying the defined classifier above to the 'returns_final' dataframe
df.iloc[:, len(df.columns) - 1] = df.iloc[:, len(df.columns) - 1].apply(computeClassification)

In [246]:
#Now that we have a complete dataset with a predictable value, the last column “Return” which is either -1 or 1, create the train and test dataset.
#Convert float to int so you can slice the dataframe
testData = df[-int((len(df) * 0.10)):]
#2nd half is forward tested on
trainData = df[:-int((len(df) * 0.90))]
#1st half is trained on


In [247]:
#Replace all inf with nan
testData_1 = testData.replace([np.inf, -np.inf], np.nan)
trainData_1 = trainData.replace([np.inf, -np.inf], np.nan)
#Replace all nans with 0
testData_2 = testData_1.fillna(0)
trainData_2 = trainData_1.fillna(0)

In [248]:
#Replace all inf with nan
testData_1 = testData.replace([np.inf, -np.inf], np.nan)
trainData_1 = trainData.replace([np.inf, -np.inf], np.nan)
#Replace all nans with 0
testData_2 = testData_1.fillna(0)
trainData_2 = trainData_1.fillna(0)

In [249]:
#X is the list of features 
data_X_train = trainData_2.iloc[:, 0:len(trainData_2.columns) - 1]
#Y is the 1 or -1 value to be predicted (as we added this for the last column above using the apply.(computeClassification) function
data_Y_train = trainData_2.iloc[:, len(trainData_2.columns) - 1]

In [250]:
#Test dataset
data_X_test = testData_2.iloc[:, 0:len(testData_2.columns) - 1]
data_Y_test = testData_2.iloc[:, len(testData_2.columns) - 1]

In [251]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
def print_score(clf, data_X_train, data_y_train, data_X_test, data_y_test, train=True):
    if train == True:
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(data_y_train, clf.predict(data_X_train))))
        print("Classification Report: \n {}\n".format(classification_report(data_y_train, clf.predict(data_X_train))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(data_y_train, clf.predict(data_X_train))))
        res = cross_val_score(clf, data_X_train, data_y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
    else:
        print("Test Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(data_y_test, clf.predict(data_X_test))))
        print("Classification Report: \n {}\n".format(classification_report(data_y_test,clf.predict(data_X_test))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(data_y_test, clf.predict(data_X_test))))

In [252]:
#Logistic regression 
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(data_X_train, data_Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [253]:
#Predictions is an array containing the predicted values (-1 or 1) for the features in data_X_test.
#You can see the prediction accuracy using the method accuracy_score which compares the predicted values versus the expected ones.

from sklearn.metrics import accuracy_score

y_predictions = clf.predict(data_X_test)
#Predict y based on x_test
print("Accuracy Score Employing Machine Learning: " + str(accuracy_score(data_Y_test, y_predictions)))


Accuracy Score Employing Machine Learning: 0.5536480686695279


In [254]:
#Training Results
print(print_score(clf,data_X_train, data_Y_train, data_X_test, data_Y_test, True))

Train Result:

accuracy score: 0.5342

Classification Report: 
               precision    recall  f1-score   support

          -1       0.54      0.22      0.31       113
           1       0.53      0.83      0.65       121

   micro avg       0.53      0.53      0.53       234
   macro avg       0.54      0.52      0.48       234
weighted avg       0.54      0.53      0.49       234


Confusion Matrix: 
 [[ 25  88]
 [ 21 100]]

Average Accuracy: 	 0.5137
Accuracy SD: 		 0.0668
None


In [255]:
#Testing Results
print(print_score(clf,data_X_train, data_Y_train, data_X_test, data_Y_test, False))

Test Result:

accuracy score: 0.5536

Classification Report: 
               precision    recall  f1-score   support

          -1       0.00      0.00      0.00       104
           1       0.55      1.00      0.71       129

   micro avg       0.55      0.55      0.55       233
   macro avg       0.28      0.50      0.36       233
weighted avg       0.31      0.55      0.39       233


Confusion Matrix: 
 [[  0 104]
 [  0 129]]

None
