In [1]:
import pandas as pd
from sklearn import preprocessing
import numpy as np

In [2]:
# Number of consecutive rows in each input window that csv_to_dataset slices
# out of the normalized data; the row right after a window is its target.
history_days = 50

In [50]:
def csv_to_dataset(csv_file_path, window_size=None):
    """Load an OHLCV CSV and turn it into sliding-window training arrays.

    The file is read with pandas, its row order is reversed, the 'date'
    column and the row labelled 0 are dropped, and every remaining column is
    min-max scaled with a scaler fitted on the whole file. Each sample is a
    window of consecutive scaled rows; its target is the first column (the
    open price, per the column-order note below) of the row right after it.

    Parameters
    ----------
    csv_file_path : str or path-like
        CSV file containing a 'date' column followed by the value columns.
        # NOTE(review): the reversal assumes the file is sorted newest-first
        # -- confirm against the data source.
    window_size : int, optional
        Number of rows per input window. When omitted, the notebook-level
        ``history_days`` constant is used, which is the original behavior.

    Returns
    -------
    ohlcv_histories_normalised : numpy.ndarray
        Shape (n_windows, window, n_columns); scaled input windows.
    next_day_open_values_normalised : numpy.ndarray
        Shape (n_windows, 1); scaled first-column value following each window.
    next_day_open_values : numpy.ndarray
        Shape (n_windows, 1); the same targets taken from the unscaled data.
    y_normaliser : sklearn.preprocessing.MinMaxScaler
        Scaler fitted on the full unscaled first column, so that its
        ``inverse_transform`` maps scaled targets back to unscaled values.

    Raises
    ------
    ValueError
        If the window size is smaller than 1, or if the file does not hold
        more rows than one window (no sample could be built).
    """
    window = history_days if window_size is None else window_size
    if window < 1:
        raise ValueError(f"window size must be at least 1, got {window}")

    file_data = pd.read_csv(csv_file_path)
    # Reverse order, most recent values last: we want to predict into the
    # future, not the past.
    file_data = file_data.iloc[::-1]
    file_data = file_data.drop('date', axis=1)
    # Label 0 is the file's first data row, which sits last after the
    # reversal. Presumably dropped because the newest row may be incomplete
    # -- TODO confirm.
    file_data = file_data.drop(0, axis=0)
    print("File data DataFrame:", file_data.shape)
    print(file_data.head())
    file_data = file_data.values

    n_rows = len(file_data)
    if n_rows <= window:
        # Fail early with a clear message instead of letting the scaler
        # reject an empty target array further down.
        raise ValueError(
            f"need more than {window} rows to build at least one window, got {n_rows}"
        )
    n_windows = n_rows - window

    normalizing_scaler = preprocessing.MinMaxScaler()
    normalized_data = normalizing_scaler.fit_transform(file_data)
    print()
    print("Normalized data")
    print(normalized_data[0:5,:])

    # Data is in order of: Open stock value, high value, low, close, and volume - ohlcv
    # Creates an array of (window x n_columns) windows, each one a training
    # input for the model. np.array copies the slices, so the result does not
    # alias normalized_data.
    ohlcv_histories_normalised = np.array(
        [normalized_data[start : start + window] for start in range(n_windows)]
    )
    print()
    print("Normalized inputs")
    print(ohlcv_histories_normalised.shape)
    print(ohlcv_histories_normalised[0:2,0:5])

    # Scaled open value of the row following each window: what the model predicts.
    next_day_open_values_normalised = normalized_data[window:, 0:1].copy()
    print()
    print("Next day open values scaled:", next_day_open_values_normalised.shape)

    # The same targets, taken from the unscaled file data.
    next_day_open_values = file_data[window:, 0:1].copy()
    print("Next day open values unscaled:", next_day_open_values.shape)

    # Fit on the FULL open column (not only the targets) so this scaler uses
    # the same min/max as column 0 of normalizing_scaler. Fitting on the
    # targets alone skips the first `window` rows and yields a different
    # mapping whenever the column's extreme values fall inside them.
    y_normaliser = preprocessing.MinMaxScaler()
    y_normaliser.fit(file_data[:, 0:1])

    assert ohlcv_histories_normalised.shape[0] == next_day_open_values_normalised.shape[0]
    return ohlcv_histories_normalised, next_day_open_values_normalised, next_day_open_values, y_normaliser

In [54]:
# Build the dataset from the MSFT CSV: scaled input windows, scaled targets,
# unscaled targets, and the scaler fitted for the target column.
# NOTE(review): the saved output of this cell looks like it came from an earlier
# version of csv_to_dataset (the shape line has no "File data DataFrame:" label
# and the frame index starts at 1 rather than at the highest label) -- re-run to refresh.
ohlcv_histories_scaled, next_day_open_values_scaled, open_values_unscaled, y_normaliser = csv_to_dataset('./MSFT_daily.csv')

(5254, 5)
   1. open  2. high  3. low  4. close   5. volume
1   200.05   204.33  199.96    202.91  34011257.0
2   210.62   210.65  204.64    205.05  26372464.0
3   208.42   209.78  206.93    208.78  21823942.0
4   204.24   209.20  204.03    205.41  30375768.0
5   207.20   208.63  201.24    204.03  33620073.0

Normalized data
[[0.86350259 0.86867059 0.87109375 0.86725173 0.04812097]
 [0.91287896 0.89776284 0.89311935 0.87713626 0.03506769]
 [0.90260195 0.89375806 0.90389684 0.8943649  0.02729512]
 [0.88307563 0.8910882  0.89024849 0.87879908 0.04190859]
 [0.89690288 0.88846437 0.87711785 0.87242494 0.04745251]]

Normalized inputs
(5204, 50, 5)
[[[0.86350259 0.86867059 0.87109375 0.86725173 0.04812097]
  [0.91287896 0.89776284 0.89311935 0.87713626 0.03506769]
  [0.90260195 0.89375806 0.90389684 0.8943649  0.02729512]
  [0.88307563 0.8910882  0.89024849 0.87879908 0.04190859]
  [0.89690288 0.88846437 0.87711785 0.87242494 0.04745251]]

 [[0.91287896 0.89776284 0.89311935 0.87713626 0.035