In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px
import keras
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, InputLayer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_poisson_deviance
from sklearn.metrics import mean_gamma_deviance
from sklearn.metrics import mean_tweedie_deviance
from sklearn.metrics import mean_squared_log_error

import statsmodels.api as sm

In [15]:
data = pd.read_csv('../NormalizedWeatherDataS.csv', sep=';')
data.describe()


Unnamed: 0,Date,Type de tendance barométrique,Vitesse du vent moyen 10 mn,Humidité,Variation de pression en 24 heures,Rafale sur les 10 dernières minutes,Précipitations dans les 24 dernières heures,Température (°C),Latitude,Longitude,Altitude,ID OMM station
count,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0
mean,181.758325,0.540945,0.151609,0.727043,0.480349,0.158559,0.014287,0.489141,0.541388,0.492306,0.204036,7379.135497
std,105.30195,0.338915,0.100956,0.195087,0.098096,0.101743,0.039848,0.146888,0.293842,0.24028,0.22657,233.636534
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7020.0
25%,90.375,0.25,0.08155,0.59596,0.42278,0.08403,0.00087,0.38674,0.25513,0.31747,0.04813,7168.0
50%,181.25,0.5,0.12876,0.76768,0.47876,0.13725,0.00087,0.4825,0.59892,0.48728,0.13237,7335.0
75%,273.125,0.875,0.20172,0.88889,0.53475,0.21008,0.00606,0.58564,0.83592,0.68286,0.28039,7607.0
max,364.875,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7790.0


In [16]:
quadrillage = []
quadri = []
p = 0
q = 0
for i in range(0, 4):
    for j in range(0, 4):
        quadrillage.append([p, p+0.25, q, q+0.25])
        p = p + 0.25
    p = 0
    q = q + 0.25
# round to 2 decimals
quadrillage = np.around(quadrillage, decimals=3)
print(quadrillage)


for (i, j, k, l) in quadrillage:
    quadri.append(data.loc[(data['Latitude'] >= i) & (data['Latitude'] < j) & (data['Longitude'] >= k) & (data['Longitude'] < l)])

print(len(quadri))
#remove empty dataframes
quadri = [x for x in quadri if not x.empty]
print(len(quadri))

#remove dataframes with less than 500 rows
quadri = [x for x in quadri if len(x) > 500]
print(len(quadri))

#fix indexes
for i in range(0, len(quadri)):
    quadri[i] = quadri[i].reset_index(drop=True)

#print all shapes
for i in range(0, len(quadri)):
    # print(quadri[i].describe())
    pass
    
print(quadri[0].head())

[[0.   0.25 0.   0.25]
 [0.25 0.5  0.   0.25]
 [0.5  0.75 0.   0.25]
 [0.75 1.   0.   0.25]
 [0.   0.25 0.25 0.5 ]
 [0.25 0.5  0.25 0.5 ]
 [0.5  0.75 0.25 0.5 ]
 [0.75 1.   0.25 0.5 ]
 [0.   0.25 0.5  0.75]
 [0.25 0.5  0.5  0.75]
 [0.5  0.75 0.5  0.75]
 [0.75 1.   0.5  0.75]
 [0.   0.25 0.75 1.  ]
 [0.25 0.5  0.75 1.  ]
 [0.5  0.75 0.75 1.  ]
 [0.75 1.   0.75 1.  ]]
16
13
13
    Date  Type de tendance barométrique  Vitesse du vent moyen 10 mn  \
0  0.000                          1.000                      0.15451   
1  0.125                          0.750                      0.08155   
2  0.250                          0.750                      0.13305   
3  0.250                          0.750                      0.19313   
4  0.375                          0.375                      0.16738   

   Humidité  Variation de pression en 24 heures  \
0   0.97980                             0.55598   
1   0.95960                             0.53475   
2   0.97980                         

In [17]:
#count the number of rows in each dataframe
for i in range(0, len(quadri)):
    print(len(quadri[i]))

5844
6827
8325
7937
11170
11389
7639
14065
2809
8477
5704
2806
2886


In [18]:
allstats = []
#count the stations per dataframe
for i in range(0, len(quadri)):
    stats = (quadri[i]['ID OMM station'].nunique())
    allstats.append(stats)
    #count max consecutive date duplicates
    dups = (quadri[i].groupby('Date').size().max())
    # pr
    # int(stats)
    if(dups > stats):
        print("Error : too much duplicates")

#predictions

In [42]:
#prepare data for LSTM : 5 consectutive dates * the number of stations to predict the next

def decimal(x):
    return x - int(x)

def df_to_X_y_consec(df, window_size=5, hour=0):
  #assuming data is sorted by station then by date
  #todo convert hour to data format or vice versa
  X = []
  y = []
  j =0
  # process : find the first date with the hour, then take the preceding <window_size> dates, repeat
  y_found = False
  for index, row in df.iterrows():
    if(decimal(row['Date']) == hour):
     
        y.append(row['Température (°C)'])
        for i in range(1, window_size+1):
            X.append(df.iloc[index-i].values)

   

  return np.array(X), np.array(y)
#todo manage previous days

# def df_to_X_y(df, window_size=5):
#   df_as_np = df.to_numpy()
#   X = []
#   y = []
#   for i in range(len(df_as_np)-window_size):
#     row = [[a] for a in df_as_np[i:i+window_size]]
#     X.append(row)
#     label = df_as_np[i+window_size]
#     y.append(label)
#   print(X,"   ", y)
#   return np.array(X), np.array(y)


In [44]:
Y = data['Température (°C)']
X1, y1 = df_to_X_y_consec(data, 15,0)
print(X1.shape, y1.shape)
print( y1[0])
print(X1[0])

model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(15, 10)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# fit model
model.fit(X1, y1, epochs=20, verbose=0)

train_predictions = model.predict(X_train1).flatten()
#     train_results = pd.DataFrame({'Actual': y_train1.flatten(), 'Predicted': train_predictions})
#     print(train_results)

#     #get r2 score
#     print('r2 score : ', r2_score(y_train1, train_predictions))
# demonstrate prediction
x_input = np.array(quadri[0].iloc[0:15].values)
x_input = x_input.reshape((1, 15, 10))
yhat = model.predict(x_input, verbose=0)
print(yhat)

#print prediction vs real




# for i in range(0, len(quadri)):

(76035, 12) (5069,)
0.40516
[8.7500e-01 7.5000e-01 4.8927e-01 8.7879e-01 3.6680e-01 4.9580e-01
 2.8570e-02 4.6961e-01 1.0000e+00 1.7789e-01 4.8100e-03 7.0200e+03]


In [None]:
 X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=0)

     model = Sequential()
    model.add(InputLayer(input_shape=(n_steps_in,1)))
    model.add(LSTM(64))
    model.add(Dense(8,'relu'))
    model.add(Dense(1,'linear'))

    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    #edit adam learning rate ?
    #default adam learning rate is 0.001
    #edit adam learning rate to 0.01 ?
    

    cp  = ModelCheckpoint(filepath='model.h5', save_best_only=True, verbose=0)
    model.fit(X_train1, y_train1, epochs=15,  validation_data=(X_test1, y_test1), callbacks=[cp])
    model = keras.models.load_model('model.h5')

    train_predictions = model.predict(X_train1).flatten()
    train_results = pd.DataFrame({'Actual': y_train1.flatten(), 'Predicted': train_predictions})
    print(train_results)

    #get r2 score
    print('r2 score : ', r2_score(y_train1, train_predictions))

#     #plot predictions and actual value depending on the time

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=data['Date'][:60000], y=train_results['Actual'], mode='lines', name='Actual'))
    fig.add_trace(go.Scatter(x=data['Date'][:60000], y=train_results['Predicted'], mode='lines', name='Predicted'))
    fig.update_layout(title='Actual and Predicted values', xaxis_title='Date', yaxis_title='Temperature')
    fig.show()

In [None]:
for i in range(0, len(quadri)):
    X, y = df_to_X_y_consec(quadri[i], 15,0)
    print(X.shape, y.shape)
    model.fit(X, y, epochs=20, verbose=0)

# demonstrate prediction

    x_input = np.array(quadri[i].iloc[0:15].values)
    x_input = x_input.reshape((1, 15, 10))
    yhat = model.predict(x_input, verbose=0)
    print(yhat)

    

In [39]:
#sort data by ID OMM station and Date
data = data.sort_values(by=['ID OMM station', 'Date'])
data = data.reset_index(drop=True)
(data.head(50))

Unnamed: 0,Date,Type de tendance barométrique,Vitesse du vent moyen 10 mn,Humidité,Variation de pression en 24 heures,Rafale sur les 10 dernières minutes,Précipitations dans les 24 dernières heures,Température (°C),Latitude,Longitude,Altitude,ID OMM station
0,0.25,1.0,0.4721,0.82828,0.51351,0.44258,0.0026,0.46225,1.0,0.17789,0.00481,7020
1,0.375,0.375,0.46352,0.77778,0.48649,0.43137,0.00087,0.46225,1.0,0.17789,0.00481,7020
2,0.5,1.0,0.41631,0.79798,0.45367,0.40336,0.00087,0.46777,1.0,0.17789,0.00481,7020
3,0.625,0.75,0.36052,0.82828,0.42471,0.33613,0.00087,0.47698,1.0,0.17789,0.00481,7020
4,0.75,1.0,0.28326,0.89899,0.39961,0.2605,0.00606,0.46041,1.0,0.17789,0.00481,7020
5,0.875,0.75,0.48927,0.87879,0.3668,0.4958,0.02857,0.46961,1.0,0.17789,0.00481,7020
6,1.0,1.0,0.27468,0.78788,0.60039,0.2521,0.0026,0.40516,1.0,0.17789,0.00481,7020
7,1.125,0.875,0.32618,0.82828,0.55212,0.29972,0.0026,0.41436,1.0,0.17789,0.00481,7020
8,1.25,1.0,0.40773,0.83838,0.50772,0.37535,0.0026,0.41989,1.0,0.17789,0.00481,7020
9,1.5,0.25,0.38627,0.86869,0.44595,0.36695,0.0026,0.43462,1.0,0.17789,0.00481,7020


In [None]:
stations = []
nbstations = data['ID OMM station'].nunique()
data.sort_values(by=['ID OMM station'])
data.reset_index(drop=True)

cpt = -1
laststation = 0
for i in range(0, len(data)):
    if (data['ID OMM station'][i] != laststation):
        cpt = cpt + 1
        laststation = data['ID OMM station'][i]
    stations.append(data['ID OMM station'][i])


In [None]:
for index,row in quadri[0].iterrows():
   #prédiction a une heure h pour l'heure h+3
    #on prend 5 valeurs pour prédire la 6eme
    #l'heure est la partie flottante de la date
    #entrainement sur 80% des données
    #test sur 20% des données
    #5 valeurs dans le neurone d'entrée
    #1 valeur dans le neurone de sortie
    #on fait 12 epochs
    #batch size de 1
    #prédictions sur toutes les stations
    

#    #   
#     X = data.drop(['Température (°C)'], axis=1)
#     Y = data['Température (°C)']
    
#     # split into samples
#     n_steps_in= 5*allstats[j]
#     print('steps : 5*', allstats[j], ' = ', n_steps_in)

#     ###################check that there are exactly stats consecutive duplicates per date : TODO

#     X1, y1 = df_to_X_y(Y, n_steps_in)
#     print(X1.shape, y1.shape)

# # split into train and test sets
#     X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=0)

#     model = Sequential()
#     model.add(InputLayer(input_shape=(n_steps_in,1)))
#     model.add(LSTM(64))
#     model.add(Dense(8,'relu'))
#     model.add(Dense(1,'linear'))

#     model.compile(optimizer='adam', loss='mse', metrics=['mae'])
#     #edit adam learning rate ?
#     #default adam learning rate is 0.001
#     #edit adam learning rate to 0.01 ?
    

#     cp  = ModelCheckpoint(filepath='model.h5', save_best_only=True, verbose=0)
#     model.fit(X_train1, y_train1, epochs=15,  validation_data=(X_test1, y_test1), callbacks=[cp])
#     model = keras.models.load_model('model.h5')

#     train_predictions = model.predict(X_train1).flatten()
#     train_results = pd.DataFrame({'Actual': y_train1.flatten(), 'Predicted': train_predictions})
#     print(train_results)

#     #get r2 score
#     print('r2 score : ', r2_score(y_train1, train_predictions))

#     #plot predictions and actual value depending on the time

#     fig = go.Figure()
#     fig.add_trace(go.Scatter(x=data['Date'][:60000], y=train_results['Actual'], mode='lines', name='Actual'))
#     fig.add_trace(go.Scatter(x=data['Date'][:60000], y=train_results['Predicted'], mode='lines', name='Predicted'))
#     fig.update_layout(title='Actual and Predicted values', xaxis_title='Date', yaxis_title='Temperature')
#     fig.show()

IndentationError: expected an indented block (1951313344.py, line 12)