In [27]:
# ---- LIBRARIES -----
import os
import csv
import json
import cryptocompare
import numpy as np
import pandas as pd
import yfinance as yf
import tensorflow as tf
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

with open(os.getcwd()+'\\parameters\\Parameters.json', 'r') as json_file:
    Parameters = json.load(json_file)

In [28]:
class Dataset():

    def __init__(self, Parameters):
        self.mesh = Parameters['Mesh']
        self.hist = pd.DataFrame([])
        self.hist_path = Parameters['hist_path']
        self.ML_dataset_path = Parameters['ML_dataset_path']
        self.trend_length = Parameters['trend_length']
        self.ML_trend_length = Parameters['ML_trend_length']
        self.parameters = Parameters
        self.date_name = ''
        self.companies_list_path = Parameters['Companies_list_path']
        self.companies_list = pd.read_csv(os.getcwd() +Parameters['Companies_list_path'])['Companies'].to_list()
        self.study_length = Parameters['study_length']
        self.LSTM_model_path = Parameters['LSTM_model_path']

    def load(self):
        
        if self.mesh == '1m':
            self.path = os.getcwd() + '\\resources\\full_NASDAQ_history_1m.csv'
            self.date_name = 'Datetime'
            self.hist = pd.read_csv(self.path, usecols=['Close', 'Company', 'Datetime', 'Dividends', 'High', 'Low', 'Open', 'Stock Splits', 'Volume'])

        else:
            self.path = os.getcwd() + self.hist_path
            self.date_name = 'Date'
            self.hist = pd.read_csv(self.path, usecols=['Close', 'Company', 'Date', 'Dividends', 'High', 'Low', 'Open', 'Stock Splits', 'Volume'])
        
        # Date formating
        return(self.hist)

    def update(self, update_period='max'):
        # Initialization
        list_of_df_to_merge = [self.hist]

        for company in self.companies_list:
            # Get history Ignore timezone
            hist = yf.Ticker(company).history(period=update_period, interval=self.mesh)

            if hist.empty != True:
                # Reset index, add company name and format date
                hist = hist.reset_index()
                hist['Company'] = [company]*len(hist)

                if self.date_name == '1m':                
                    hist[self.date_name] = hist[self.date_name].dt.strftime('%Y-%m-%d %H:%M:%S')
                    hist[self.date_name] = hist[self.date_name].dt.floor('min')
                else:
                    hist[self.date_name] = hist[self.date_name].astype('datetime64[ns]')
                
                hist[self.date_name] = pd.to_datetime(hist[self.date_name])
                
                # Add hist to the list of dict to merge
                list_of_df_to_merge.append(hist)

            else:
                # We remove companies whose data was not available
                self.companies_list.remove(company)

        # Concat and remove duplicates
        new_hist = pd.concat(list_of_df_to_merge)[self.hist.columns]
        new_hist = new_hist.drop_duplicates(subset=[self.date_name, 'Company'], keep='last')

        # reset index for the new dataframe
        new_hist = new_hist.reset_index(drop=True)
        new_hist

        # Update Company list
        with open(os.getcwd() + self.companies_list_path, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(zip(['Companies']+self.companies_list))

        self.hist = new_hist
        self.save()

        return(self.hist)

    def update_crypto(self, cryptoname):
        # Fetch info
        btc_hist = cryptocompare.get_historical_price_day(cryptoname, curr='USD', limit=2000)

        # Load
        df_hist = pd.DataFrame(btc_hist)

        # Time format
        df_hist['time'] = pd.to_datetime(df_hist['time'], unit='s')

        # Rename
        df_hist = df_hist.rename(columns={"time": "Date", "open": "Open", "high": "High", "low": "Low", "close": "Close", "volumeto": "Volume"})

        # Reorder columns
        df_hist = df_hist[['Date','Open','High','Low','Close','Volume']]

        # Completion with fake values
        df_hist['Dividends'] = len(df_hist) * [0]
        df_hist['Stock Splits'] = len(df_hist) * [0]
        df_hist['Company'] = len(df_hist) * [cryptoname]

        # Concat and remove duplicates
        new_hist = pd.concat([self.hist, df_hist])[self.hist.columns]
        new_hist = new_hist.drop_duplicates(subset=[self.date_name, 'Company'], keep='last')

        # reset index for the new dataframe
        new_hist = new_hist.reset_index(drop=True)
        new_hist

        # Update Company list
        with open(os.getcwd() + self.companies_list_path, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(zip(['Companies']+self.companies_list))

        self.hist = new_hist
        self.save()

        return(self.hist)

    def save(self):
        self.hist.to_csv(self.path)

    def new_format(self, study_length):
        # TCD to set date in columns, have a sum of the companies
        TCD = pd.pivot_table(self.hist, 'Open', index=['Company'], columns=[self.date_name], aggfunc=np.sum, margins=True, margins_name='NASDAQ').fillna(method='ffill', axis=1)
        
        # Keeping only the NASDAQ row
        TCD = TCD.drop(columns=['NASDAQ'])

        # Sorting columns
        TCD = TCD.reindex(TCD.columns.tolist().sort(), axis=1)
        
        # Replacing remaining NaN by 0 
        #TCD.fillna(0)

        # Reshaping
        TCD.columns.name = None
        TCD = TCD.reset_index().rename_axis(None, axis=1).set_index('Company')

        # Resizing
        dataset = TCD[TCD.columns[-study_length:]]

        return(dataset)

    def create_ML_dataset(self, dataset):
        datasets_list = []
        # Reducing the dataset only to the companies in the list
        dataset = dataset[dataset.index.isin(self.companies_list)]
        dataset = dataset.reset_index()

        # Dataset enrichment
        #supplement_df = self.enrich_symbol(['sector', 'country', 'shortName'])
        #supplement_df = supplement_df.reset_index()

        # Columns creation
        columns = ['Company']
        for i in range(self.ML_trend_length):
            columns.append('Day_'+str(i+1))
        columns.append('prediction')

        # Let's not take the period we study for training :)
        for day in range(self.ML_trend_length+1, len(dataset.columns.to_list())-self.study_length):
            # Reinitialization
            BS_dict_list = []
            prediction_dict_list = []

            # Reducing the dataset to the trend period studied and the companies in the companies list TODO add string integration for ML 
            small_dataset = dataset[['Company']+dataset.columns[day-self.ML_trend_length:day+1].to_list()]
            #small_dataset = dataset[+dataset.columns[day-self.ML_trend_length:day+1].to_list()]

            # Rename columns
            small_dataset.columns = columns

            # One Hot Encoding to add companies as feature
            Company_features = pd.get_dummies(small_dataset.Company, prefix='Company')
            small_dataset = pd.concat([Company_features, small_dataset], axis=1)

            datasets_list.append(small_dataset)

        # Add columns TODO add string integration for ML
        #columns += ['sector', 'country', 'shortName']

        # Enrich the data frame with feature
        ML_dataset = pd.concat(datasets_list).dropna()
        #ML_dataset = ML_dataset.join(supplement_df.set_index('index'), on='Company')
        
        #Remove the 'Company Column'
        ML_dataset.pop('Company')
        
        # Keep all Company columns out to avoid losing the 1
        temp_ML_Dataset = ML_dataset[ML_dataset.columns.to_list()[0:len(self.companies_list)]].copy()
        ML_dataset = ML_dataset.drop(ML_dataset.columns.to_list()[0:len(self.companies_list)], axis=1)
        # Remove row with a 0 as value
        ML_dataset = ML_dataset.loc[ML_dataset.Day_1 > 0]
        # Normalize lines
        ML_dataset = ML_dataset.div(ML_dataset.max(axis=1)*2, axis=0)
        # Concatenate it again
        ML_dataset = pd.concat([temp_ML_Dataset, ML_dataset], axis=1).dropna()
        
        # normalize the dataset
        #scaler = MinMaxScaler(feature_range=(0, 1))
        #dataset = scaler.fit_transform(dataset)
        
        # Save dataframe 
        ML_dataset.to_csv(os.getcwd() + self.ML_dataset_path)

        return(ML_dataset)
    

# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    return(np.array(dataX), np.array(dataY))

In [32]:
# Test lines, executed only when the file is executed as main
full_hist = Dataset(Parameters)
full_hist.load()

if full_hist.date_name != '1m':
    full_hist.hist[full_hist.date_name] = full_hist.hist[full_hist.date_name].astype('datetime64[ns]')
else:
    full_hist.hist[full_hist.date_name] = full_hist.hist[full_hist.date_name].dt.floor('min')

full_hist.update_crypto('ETH')
#full_hist.save()
#print(full_hist.hist)
print(len(full_hist.hist))
dataset = full_hist.new_format(1000)

195956


In [34]:
# Reducing the dataset only to the companies in the list
dataset = dataset[dataset.index.isin(full_hist.companies_list)]

for company in full_hist.companies_list:
    values_list = dataset.loc[company,:].values.astype('float32').reshape(-1, 1)
    
    # normalize the dataset
    scaler = MinMaxScaler(feature_range=(0, 1))
    values_list = scaler.fit_transform(values_list)
    
    # split into train and test sets
    train_size = int(len(values_list) * 0.67)
    test_size = len(values_list) - train_size
    train, test = values_list[0:train_size,:], values_list[train_size:len(values_list),:]
    #print(len(train), len(test))
    
    # reshape into X=t and Y=t+1
    look_back = 1
    trainX, trainY = create_dataset(train, look_back)
    testX, testY = create_dataset(test, look_back)
    print(testX)
    
    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
    print(testX.shape)
    
    # create and fit the LSTM network
    model = Sequential()
    model.add(LSTM(4, input_shape=(1, look_back)))
    model.add(Dense(1))
    model.compile(loss=tf.losses.MeanSquaredError(), optimizer='adam', metrics=[tf.metrics.MeanAbsoluteError()])
    model.fit(trainX, trainY, epochs=50, batch_size=1, verbose=2)

    # Save the model
    model.save(os.getcwd()+full_hist.LSTM_model_path+'_'+str(company))
    
    # make predictions
    trainPredict = model.predict(trainX)
    testPredict = model.predict(testX)
    # invert predictions
    trainPredict = scaler.inverse_transform(trainPredict)
    trainY = scaler.inverse_transform([trainY])
    testPredict = scaler.inverse_transform(testPredict)
    testY = scaler.inverse_transform([testY])
    # calculate root mean squared error
    trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
    print(str(company)+ '_Train Score: %.2f RMSE' % (trainScore))
    testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
    print(str(company)+ '_Test Score: %.2f RMSE' % (testScore))

[[0.41978145]
 [0.4309855 ]
 [0.4373361 ]
 [0.4276533 ]
 [0.4276533 ]
 [0.4276533 ]
 [0.37607622]
 [0.38498628]
 [0.3501668 ]
 [0.33705533]
 [0.27948982]
 [0.27948982]
 [0.27948982]
 [0.33990455]
 [0.39155412]
 [0.3740962 ]
 [0.37187457]
 [0.33922845]
 [0.33922845]
 [0.33922845]
 [0.2951609 ]
 [0.32749325]
 [0.32809693]
 [0.27630246]
 [0.29791367]
 [0.29791367]
 [0.29791367]
 [0.2425214 ]
 [0.25594687]
 [0.23725748]
 [0.25565714]
 [0.25515002]
 [0.25515002]
 [0.25515002]
 [0.20903009]
 [0.22902346]
 [0.2637704 ]
 [0.25355637]
 [0.2685997 ]
 [0.2685997 ]
 [0.2685997 ]
 [0.2637462 ]
 [0.27548146]
 [0.25350803]
 [0.23863375]
 [0.24457389]
 [0.24457389]
 [0.24457389]
 [0.2641325 ]
 [0.31218433]
 [0.2927221 ]
 [0.3071136 ]
 [0.3071136 ]
 [0.3071136 ]
 [0.3071136 ]
 [0.30617172]
 [0.33439904]
 [0.34019434]
 [0.35221928]
 [0.34572393]
 [0.34572393]
 [0.34572393]
 [0.32944906]
 [0.32541662]
 [0.31896943]
 [0.32442665]
 [0.32763815]
 [0.32763815]
 [0.32763815]
 [0.33874553]
 [0.34666544]
 [0.34

Epoch 50/50
668/668 - 1s - loss: 5.5593e-05 - mean_absolute_error: 0.0053




INFO:tensorflow:Assets written to: C:\Users\louis.poux\OneDrive - BearingPoint GmbH\5 - Python\Codes\Lupin2_perso\models\LSTM_AAPL\assets


INFO:tensorflow:Assets written to: C:\Users\louis.poux\OneDrive - BearingPoint GmbH\5 - Python\Codes\Lupin2_perso\models\LSTM_AAPL\assets


NameError: name 'math' is not defined