In [151]:
import pandas as pd
import numpy as np
import os
import typing
import matplotlib.pyplot as plt

from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error

from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.losses import MeanSquaredError
from keras.optimizers import Adam

In [152]:
DATA_PATH = 'data_cars/'
# all_files = os.listdir(DATA_PATH)
all_files = ['K120.csv', 'K140.csv', 'K159.csv', 'K405.csv', 'K406.csv', 'K701.csv', 'K703.csv', 'K709.csv']
print(all_files)

['K120.csv', 'K140.csv', 'K159.csv', 'K405.csv', 'K406.csv', 'K701.csv', 'K703.csv', 'K709.csv']


In [153]:
all_dataframes = []
for index, file in enumerate(all_files):
    print(f"Reading file: {file}")
    file_name = file.split('.')[0]
    df = pd.read_csv(DATA_PATH + file, sep=';')

    df['date'] = pd.to_datetime(df[file_name], format='%Y-%m-%d %H:%M')
    df = df.drop(columns=[file_name])

    df = df.set_index('date')
    df.columns = [f"{file_name}_{col}" for col in df.columns if col != 'date']
    all_dataframes.append(df)
    # print(f"Finished reading file: {file}, shape = {df.shape}")

combined_df = pd.concat(all_dataframes, axis=1)
# combined_df.fillna(method='ffill', inplace=True)
combined_df.interpolate(method='linear', inplace=True, limit=3)
combined_df['hour'] = combined_df.index.hour
combined_df['day_of_week'] = combined_df.index.dayofweek

# combined_df = combined_df[:]
print(combined_df)
print(combined_df.columns.size)
print(combined_df.isnull().sum().sum())

data = np.array(combined_df, dtype=float)[:, :]
# data = data[:,2]
scaler = StandardScaler()
# scaler = MinMaxScaler(feature_range=(0, 1))
#Don't transform the time labels -> this way the scaler also works inversely on prediction data because shapes are different otherwise
data = scaler.fit_transform(data)
# data = scaler.fit_transform(data.reshape(-1, 1))




Reading file: K120.csv
Reading file: K140.csv
Reading file: K159.csv
Reading file: K405.csv
Reading file: K406.csv
Reading file: K701.csv
Reading file: K703.csv
Reading file: K709.csv
                     K120_022  K120_023  K120_051  K120_081  K120_111   
date                                                                    
2019-11-01 00:00:00       3.0       3.0       9.0       5.0      19.0  \
2019-11-01 00:15:00       4.0       5.0       6.0       6.0      10.0   
2019-11-01 00:30:00       0.0       3.0       1.0       2.0      15.0   
2019-11-01 00:45:00       0.0       4.0       5.0       4.0       6.0   
2019-11-01 01:00:00       1.0       1.0       2.0       6.0      13.0   
...                       ...       ...       ...       ...       ...   
2019-11-30 22:45:00       6.0       9.0      27.0      15.0      60.0   
2019-11-30 23:00:00      12.0      11.0      27.0       8.0      49.0   
2019-11-30 23:15:00       4.0       6.0      21.0       6.0      57.0   
2019-11-30 23

In [154]:
timestamps = combined_df.index

In [155]:
dataset = combined_df.to_numpy()

In [156]:
def splitSequence(seq, n_steps):

    #Declare X and y as empty list
    X = []
    y = []

    for i in range(len(seq)):
        #get the last index
        lastIndex = i + n_steps

        #if lastIndex is greater than length of sequence then break
        if lastIndex > len(seq) - 1:
            break

        # Create input and output sequence
        # Last 2 columns are time of day and day of week
        seq_X, seq_y = seq[i:lastIndex], seq[lastIndex]

        #append seq_X, seq_y in X and y list
        X.append(seq_X)
        y.append(seq_y)
        #Convert X and y into numpy array
    X = np.array(X)
    y = np.array(y)

    return X,y

In [157]:
num_of_steps = data.shape[0]
train_size = 0.6
val_size = 0.15
shuffle = True
look_back = 80

num_train = int(num_of_steps * train_size)
num_val = int(num_of_steps * val_size)

# train = data[:num_train]
# val = data[num_train:num_train + num_val]
# test = data[num_train + num_val:]

x, y = splitSequence(data, look_back)

if shuffle:
    idx = np.random.permutation(len(x))
    x,y = x[idx], y[idx]



x_train, y_train = x[:num_train], y[:num_train]
x_val, y_val = x[num_train:num_train + num_val], y[num_train:num_train + num_val]
x_test, y_test = x[num_train + num_val:], y[num_train + num_val:]

print(x_train.shape)
print(x_val.shape)
print(x_test.shape)

(1728, 80, 1)
(432, 80, 1)
(640, 80, 1)


In [158]:
removeData = False
removeAmount = 0.6

if removeData:
    print(x_train.shape)
    print(y_train.shape)

    print(x_train[0][0])
    #Replace random values by 0
    for i in range(int(x_train.shape[0] * removeAmount)):
        x_train[i] = np.zeros(x_train[i].shape)
        y_train[i] = np.zeros(y_train[i].shape)
        # print(x_train[i].shape)

    print(x_train[0][0])

    if shuffle:
        idx = np.random.permutation(len(x))
        x,y = x[idx], y[idx]


In [159]:
rmses = []
val_rmses = []
trainscores = []
testscores = []
runs = 10
epochs = 500


for i in range(runs):
    #Add params to do optimizing at the top
    # input_dim = 1
    input_dim = data.shape[1]
    units = 60
    # output_size = 1
    output_size = y_train.shape[1]

    input = keras.Input((look_back, input_dim))
    #return sequences is necessary for sequential LSTM layers
    lstm1 = LSTM(units, return_sequences=True)(input)
    lstm2 = LSTM(units)(lstm1)
    out = Dense(output_size)(lstm2)
    model = keras.models.Model(inputs=input, outputs=out)
    model.summary()

    model.compile(
        loss=MeanSquaredError(),
        optimizer=Adam(learning_rate=0.001),
        metrics=[keras.metrics.RootMeanSquaredError()],
    )

    cback = [keras.callbacks.EarlyStopping(patience=5)]
    # if runs == 1:
    #     cback = [keras.callbacks.EarlyStopping(patience=10)]


    history = model.fit(
        x=x_train,
        y=y_train,
        validation_data=(x_val, y_val),
        epochs=epochs,
        #makes the training stop early if it notices no improvements on the validation set 10 times in a row, to prevent overfitting
        callbacks=cback,
    )

    # save data to calculate the learning curve
    rmses.append(history.history['root_mean_squared_error'])
    val_rmses.append(history.history['val_root_mean_squared_error'])

    # make predictions
    trainPredict = model.predict(x_train)
    testPredict = model.predict(x_test)
    # invert predictions
    trainPredict = scaler.inverse_transform(trainPredict)
    trainY = scaler.inverse_transform(y_train)
    testPredict = scaler.inverse_transform(testPredict)
    testY = scaler.inverse_transform(y_test)
    # calculate root mean squared error
    trainScore = np.sqrt(mean_squared_error(trainY, trainPredict))
    print(f'Train Score: {trainScore:.2f} RMSE')
    testScore = np.sqrt(mean_squared_error(testY, testPredict))
    print(f'Test Score: {testScore:.2f} RMSE')
    trainscores.append(trainScore)
    testscores.append(testScore)

for i in range(runs):
    print(f'Run {i+1}:')
    print(f'  Train Score: {trainscores[i]:.2f} RMSE')
    print(f'  Test Score: {testscores[i]:.2f} RMSE')

Model: "model_58"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_59 (InputLayer)       [(None, 80, 1)]           0         
                                                                 
 lstm_116 (LSTM)             (None, 80, 60)            14880     
                                                                 
 lstm_117 (LSTM)             (None, 60)                29040     
                                                                 
 dense_58 (Dense)            (None, 1)                 61        
                                                                 
Total params: 43,981
Trainable params: 43,981
Non-trainable params: 0
_________________________________________________________________
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500


In [160]:
# rmses = scaler.inverse_transform(rmses)
# val_rmses = scaler.inverse_transform(val_rmses)

print("trainscores: " + str(trainscores))
print("testscores: " + str(testscores))

rmses = np.matrix(rmses)
val_rmses = np.matrix(val_rmses)

print(rmses.shape)

rmse_avg = np.mean(rmses, axis=0).transpose()
val_rmse_avg = np.mean(val_rmses, axis=0).transpose()

print(rmse_avg.shape)

rmse_std = np.std(rmses, axis=0).transpose()
val_rmse_std = np.std(val_rmses, axis=0).transpose()

sigma = 1
skip = 3

rmse_std_high = rmse_avg + rmse_std * sigma
rmse_std_low = rmse_avg - rmse_std * sigma
val_rmse_std_high = val_rmse_avg + val_rmse_std * sigma
val_rmse_std_low = val_rmse_avg - val_rmse_std * sigma


plt.plot(rmse_avg[skip:], label='train', color='orange')
plt.plot(val_rmse_avg[skip:], label='validation', color='green')
plt.plot(rmse_std_high[skip:], label='train std', linestyle='dashed', color='orange')
plt.plot(rmse_std_low[skip:], label='_nolegend_', linestyle='dashed', color='orange')
plt.plot(val_rmse_std_high[skip:], label='validation std', linestyle='dashed', color='green')
plt.plot(val_rmse_std_low[skip:], label='_nolegend_', linestyle='dashed', color='green')
plt.title("learning curve")
plt.xlabel('epoch')
plt.ylabel('loss (RMSE)')
plt.legend(['train', 'validation', 'train_std', 'validation_std'], loc='upper right')
plt.show()


trainscores: [5.750163067039128, 5.823281466985095, 5.808611991713464, 6.095367730070231, 5.90805554537215, 5.623101054294955, 5.901341613796622, 5.813693011400218, 5.797063243798349, 5.834219990716134]
testscores: [6.4555772829633895, 6.569485446647466, 6.45978205721166, 6.622722594013811, 6.531599712040477, 6.458144069506753, 6.611279765145696, 6.5062721229630744, 6.524153557037169, 6.469869516070491]


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (10,) + inhomogeneous part.

In [None]:
print(np.mean([8.14,7.87,7.86,7.96,7.93]))
