In [1]:
# knjiznice
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.layers import Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score
import matplotlib.pyplot as plt

In [2]:
# spremenljivke za testiranje
st_izbranih_znacilnic = 10
st_zaporednih = 504
st_napovedi = 1

In [3]:
# branje podatkov iz datoteke
df = pd.read_excel('iir_podatkovna_zbirka.xlsx')

In [4]:
# pretvorimo time v datetime, sort by time da imamo zaporedje in drop time da ne ovira
df['time'] = pd.to_datetime(df['time'])
df.sort_values(by='time', inplace=True)
df = df.drop(['time'], axis=1)

In [5]:
# zapolnimo missing data 
def zapolni_podatke(df):
    imp = IterativeImputer()
    imp.fit(df)
    temp_df = imp.transform(df)
    df = pd.DataFrame(temp_df, columns=df.columns)
    return df

In [6]:
df = zapolni_podatke(df)
df

Unnamed: 0.1,Unnamed: 0,global energy,diffusive energy,mean T,min T,max T,T,precipitation,mean pressure,min pressure,max pressure,mean rel. hum.,min rel. hum.,max rel. hum.,rel. hum.,wind speed,wind direction,max gust
0,288911.0,0.000000,0.000000,-0.6,-0.7,-0.5,-0.7,0.000000,948.0,948.0,948.0,34.0,34.0,35.0,35.0,2.6,6.0,2.9
1,125241.0,47.814247,-3.283533,-0.7,-0.8,-0.6,-0.8,-0.231041,948.0,948.0,948.0,35.0,34.0,35.0,35.0,2.6,8.0,2.9
2,15657.0,53.940140,0.228771,-1.0,-1.0,-0.8,-1.0,-0.214562,948.0,948.0,948.0,36.0,35.0,36.0,36.0,2.4,14.0,2.7
3,66656.0,0.000000,0.000000,-0.8,-1.0,-0.7,-0.7,0.000000,948.0,948.0,948.0,35.0,35.0,36.0,35.0,2.0,24.0,2.4
4,182713.0,113.510726,13.932471,-0.7,-0.7,-0.6,-0.7,-0.235367,948.0,948.0,948.0,34.0,33.0,35.0,33.0,1.9,23.0,2.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311035,3154.0,-57.196301,6.752438,-0.1,-0.1,-0.1,-0.1,0.104461,941.0,941.0,941.0,97.0,96.0,98.0,97.0,5.3,29.0,7.8
311036,109931.0,-71.929606,0.029726,-0.1,-0.2,-0.1,-0.2,0.116828,941.0,941.0,941.0,97.0,97.0,98.0,97.0,5.1,25.0,7.6
311037,31904.0,0.000000,0.000000,-0.2,-0.2,-0.2,-0.2,0.200000,941.0,941.0,941.0,97.0,96.0,98.0,97.0,5.7,29.0,8.0
311038,154759.0,-59.744355,7.282170,-0.2,-0.2,-0.2,-0.2,0.121172,941.0,941.0,941.0,97.0,96.0,98.0,97.0,5.1,29.0,7.8


In [7]:
df.isnull().sum()

Unnamed: 0          0
global energy       0
diffusive energy    0
mean T              0
min T               0
max T               0
T                   0
precipitation       0
mean pressure       0
min pressure        0
max pressure        0
mean rel. hum.      0
min rel. hum.       0
max rel. hum.       0
rel. hum.           0
wind speed          0
wind direction      0
max gust            0
dtype: int64

In [8]:
# locimo na podatke in vrednosti za napoved
X = df.drop(['global energy'], axis=1)
Y = df['global energy'].to_numpy()

In [9]:
def izbira_znacilnic(X,Y,st_izbranih_znacilnic):
    # izracunamo katere znacilnice vzeti
    mutual_info = mutual_info_regression(X, Y)
    izbrane = X.columns[np.argsort(mutual_info)[-st_izbranih_znacilnic:]]
    return izbrane

In [10]:
izbira = izbira_znacilnic(X,Y,st_izbranih_znacilnic)
print(izbira)
X = X[izbira]

Index(['min T', 'mean T', 'T', 'max T', 'max rel. hum.', 'mean rel. hum.',
       'rel. hum.', 'min rel. hum.', 'precipitation', 'diffusive energy'],
      dtype='object')


In [11]:
# split data na train in test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=4032, shuffle=False)

In [15]:
X_train

array([[-1.29993339, -1.29920453, -1.31117411, ..., -2.08988789,
        -0.23980037, -0.87063837],
       [-1.31203589, -1.31125567, -1.32322445, ..., -2.08988789,
        -1.0109885 , -0.92202998],
       [-1.33624087, -1.34740908, -1.34732513, ..., -2.03655648,
        -0.9559821 , -0.8670578 ],
       ...,
       [ 0.26128832,  0.25539215,  0.25537026, ...,  0.57668249,
        -0.23980037, -0.87063837],
       [ 0.27339081,  0.25539215,  0.25537026, ...,  0.6300139 ,
        -0.05199383, -0.45998183],
       [ 0.26128832,  0.25539215,  0.24331992, ...,  0.6833453 ,
        -0.03982277, -0.5962654 ]])

In [16]:
X_test

array([[ 0.24918583,  0.24334101,  0.23126958, ...,  0.6833453 ,
        -0.23980037, -0.87063837],
       [ 0.24918583,  0.23128988,  0.23126958, ...,  0.6833453 ,
         0.02554322, -0.40651853],
       [ 0.24918583,  0.24334101,  0.24331992, ...,  0.6833453 ,
         0.13107045, -0.48160751],
       ...,
       [-1.23942092, -1.25099998, -1.2509224 , ...,  1.21665938,
         0.42777528, -0.87063837],
       [-1.23942092, -1.25099998, -1.2509224 , ...,  1.21665938,
         0.16465647, -0.75666285],
       [-1.23942092, -1.25099998, -1.2509224 , ...,  1.21665938,
         0.18945283, -0.75189928]])

In [12]:
def standardizacija_podatkov(data1, data2):
    scaler = StandardScaler()
    return scaler.fit_transform(data1), scaler.transform(data2)

In [13]:
X_train, X_test = standardizacija_podatkov(X_train, X_test)

In [17]:
X_train

array([[-1.29993339, -1.29920453, -1.31117411, ..., -2.08988789,
        -0.23980037, -0.87063837],
       [-1.31203589, -1.31125567, -1.32322445, ..., -2.08988789,
        -1.0109885 , -0.92202998],
       [-1.33624087, -1.34740908, -1.34732513, ..., -2.03655648,
        -0.9559821 , -0.8670578 ],
       ...,
       [ 0.26128832,  0.25539215,  0.25537026, ...,  0.57668249,
        -0.23980037, -0.87063837],
       [ 0.27339081,  0.25539215,  0.25537026, ...,  0.6300139 ,
        -0.05199383, -0.45998183],
       [ 0.26128832,  0.25539215,  0.24331992, ...,  0.6833453 ,
        -0.03982277, -0.5962654 ]])

In [18]:
X_test

array([[ 0.24918583,  0.24334101,  0.23126958, ...,  0.6833453 ,
        -0.23980037, -0.87063837],
       [ 0.24918583,  0.23128988,  0.23126958, ...,  0.6833453 ,
         0.02554322, -0.40651853],
       [ 0.24918583,  0.24334101,  0.24331992, ...,  0.6833453 ,
         0.13107045, -0.48160751],
       ...,
       [-1.23942092, -1.25099998, -1.2509224 , ...,  1.21665938,
         0.42777528, -0.87063837],
       [-1.23942092, -1.25099998, -1.2509224 , ...,  1.21665938,
         0.16465647, -0.75666285],
       [-1.23942092, -1.25099998, -1.2509224 , ...,  1.21665938,
         0.18945283, -0.75189928]])

In [19]:
X_train.shape

(307008, 10)

In [20]:
Y_train.shape

(307008,)

In [21]:
def pripravi_timeseries(X, Y, st_zaporednih, st_napovedi):
    dolzina = len(X) # x_train ali x_test

    temp_X = []
    temp_Y = []

    for i in range(st_zaporednih, dolzina - st_napovedi + 1):
        temp_X.append(X[i - st_zaporednih:i, :])
        temp_Y.append(Y[i + st_napovedi - 1:i + st_napovedi])

    return np.array(temp_X), np.array(temp_Y)


In [22]:
X_train, Y_train = pripravi_timeseries(X_train, Y_train, st_zaporednih, st_napovedi)
X_test, Y_test = pripravi_timeseries(X_test, Y_test, st_zaporednih, st_napovedi)

In [23]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(306504, 504, 10)
(306504, 1)
(3528, 504, 10)
(3528, 1)


In [24]:
# pripravimo model
model = Sequential()
model.add(LSTM(128, input_shape=(st_zaporednih, st_izbranih_znacilnic), return_sequences=True))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(Y_train.shape[1]))

cp = ModelCheckpoint('model/', save_best_only=True)
model.compile(optimizer='adam', loss='mse')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 504, 128)          71168     
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 120,641
Trainable params: 120,641
Non-trainable params: 0
_________________________________________________________________


In [None]:
# fit model nad pripravljenimi podatki
model.fit(X_train, Y_train, epochs=20, steps_per_epoch=500, validation_data=(X_test, Y_test), batch_size=64,callbacks=[cp])

# si nalozimo najboljsi model ki je bil v 20 epochs
model = load_model('model/')

# si shranimo model v datoteko da ga lahko kasneje nalozimo in uporabimo
model.save('model.h5')

In [None]:
# validacija modela
pred = model.predict(X_test)
res = model.evaluate(X_test, Y_test, verbose=0)
print("test loss: ")
print(res)

# povprečna absolutna napaka
pan = mean_absolute_error(Y_test, pred)
print("povprecna abs napaka")
print(pan)

# povprečna kvadratna napaka
pkn = mean_squared_error(Y_test, pred)
print("povprecna kvadratna napaka")
print(pkn)

# vrednost razložene variance
vrv = explained_variance_score(Y_test, pred)
print("vrednost razlozene variance")
print(vrv)

# prikaz grafa
plt.plot(Y_test, label='realne vrednosti')
plt.plot(pred, label='napovedi')
plt.legend()
plt.show()
