In [11]:
import math
import sys 
import numpy as np # linear algebra
from scipy.stats import randint
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv), data manipulation as in SQL
import matplotlib.pyplot as plt # this is used for the plot the graph 
import matplotlib.dates as mdates
import seaborn as sns # used for plot interactive graph. 
from sklearn.model_selection import train_test_split # to split the data into two parts
#from sklearn.cross_validation import KFold # use for cross validation
from sklearn.preprocessing import StandardScaler # for normalization
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline # pipeline making
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics # for the check the error and accuracy of the model
from sklearn.metrics import mean_squared_error,r2_score
from datetime import timedelta

## for Deep-learing:
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.optimizers import SGD 
from keras.callbacks import EarlyStopping
#from keras.utils import np_utils
import itertools
from keras.layers import LSTM
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import Dropout
from custom_transforms.transforms import *
from models.LSTM1 import LSTM1

In [12]:
df = pd.read_csv('../input/teresopolis-2016-2023-precos-medios.csv', sep=',', 
                 parse_dates={'dt' : ['Ano', 'Mês', 'Dia']},
                 low_memory=False, na_values=['nan','?'], index_col='dt')

filtered_alface_crespa_roca = df[df['Produto'] == 'Alface Crespa - Roça'][['Preco_unitario']]
filtered_alface_crespa_roca = filtered_alface_crespa_roca.resample('W').mean()
filtered_alface_crespa_roca = filtered_alface_crespa_roca.interpolate(method='linear')
filtered_alface_crespa_roca.index = filtered_alface_crespa_roca.index.tz_localize("UTC")
filtered_alface_crespa_roca.sort_values(by=['dt'], inplace=True)
filtered_alface_crespa_roca["first_day_week"] = filtered_alface_crespa_roca.index - filtered_alface_crespa_roca.index.weekday * timedelta(days=1)

In [13]:
weather_df = pd.read_csv('../processed_data/weather_2016_2023.csv', index_col=0)
weather_df.index = pd.to_datetime(weather_df.index, utc=True)
for col in weather_df.columns:
    weather_df[col] = weather_df[col].replace(-9999, None)
weather_df = weather_df[(weather_df.index >= '2016-04-24') & (weather_df.index <= '2023-09-24')]
weather_weekly_df = pd.DataFrame(weather_df[[ 'PRECIPITAÇÃO TOTAL, HORÁRIO (mm)', 'RADIACAO GLOBAL (Kj/m²)', 'TEMPERATURA DO PONTO DE ORVALHO (°C)']].resample('W').mean())



weather_weekly_df["first_day_week"] = (weather_weekly_df.index - weather_weekly_df.index.weekday * timedelta(days=1))

In [14]:
#weather_weekly_df['RADIAÇÃO GLOBAL (Kj/m²)'] = weather_df['RADIACAO GLOBAL (Kj/m²)'].resample('W').max()
weather_weekly_df['RADIACAO GLOBAL (Kj/m²)']=weather_df['RADIACAO GLOBAL (Kj/m²)'].resample('W').max()

In [15]:
price_weather_df = pd.merge(filtered_alface_crespa_roca, weather_weekly_df, on='first_day_week', how='left')
price_weather_df.index = filtered_alface_crespa_roca.index

In [21]:
model = LSTM1()
model.run(price_weather_df, ['Preco_unitario', 'TEMPERATURA DO PONTO DE ORVALHO (°C)'], 8, 8, 8)

0.41633084064984616

In [38]:
model.rmse_by_timestep['RMSE'].tolist()


[0.13698000327555349,
 0.24849908920310249,
 0.3432056398115358,
 0.4103404104194954,
 0.45075296993221325,
 0.4973852026728478,
 0.5233395316728121,
 0.5413337603302846]

In [18]:
all_rmses = []
rmses_by_in_size = []
for i in range(1, 8):
    model.run(price_weather_df, ['Preco_unitario', 'TEMPERATURA DO PONTO DE ORVALHO (°C)'], i, 8, 8)
    rmses_by_in_size.append(model.rmse)
    all_rmses.append(model.rmse_by_timestep)



train_size: 272
validation_size: 38
test_size: 78
values_train: (272, 5)
values_validation: (38, 5)
values_test: (78, 5)
train_percentage: 0.7010309278350515
validation_percentage: 0.0979381443298969
test_percentage: 0.20103092783505155
(264, 1, 2) (264, 8) (70, 1, 2) (70, 8)
Epoch 1/100
4/4 - 4s - loss: 0.1031 - val_loss: 0.4414 - 4s/epoch - 1s/step
Epoch 2/100
4/4 - 0s - loss: 0.0923 - val_loss: 0.4180 - 60ms/epoch - 15ms/step
Epoch 3/100
4/4 - 0s - loss: 0.0817 - val_loss: 0.3941 - 62ms/epoch - 15ms/step
Epoch 4/100
4/4 - 0s - loss: 0.0716 - val_loss: 0.3695 - 60ms/epoch - 15ms/step
Epoch 5/100
4/4 - 0s - loss: 0.0618 - val_loss: 0.3439 - 64ms/epoch - 16ms/step
Epoch 6/100
4/4 - 0s - loss: 0.0523 - val_loss: 0.3177 - 60ms/epoch - 15ms/step
Epoch 7/100
4/4 - 0s - loss: 0.0437 - val_loss: 0.2913 - 60ms/epoch - 15ms/step
Epoch 8/100
4/4 - 0s - loss: 0.0361 - val_loss: 0.2656 - 60ms/epoch - 15ms/step
Epoch 9/100
4/4 - 0s - loss: 0.0300 - val_loss: 0.2417 - 60ms/epoch - 15ms/step
Epoch 1

In [19]:
all_rmses

[       RMSE
 1  0.070117
 2  0.188933
 3  0.277104
 4  0.331244
 5  0.376718
 6  0.420944
 7  0.457033
 8  0.484810,
        RMSE
 1  0.098415
 2  0.210520
 3  0.289930
 4  0.348359
 5  0.394979
 6  0.437756
 7  0.473359
 8  0.503674,
        RMSE
 1  0.102179
 2  0.212098
 3  0.295032
 4  0.357815
 5  0.400907
 6  0.449950
 7  0.491304
 8  0.518125,
        RMSE
 1  0.114436
 2  0.224508
 3  0.302203
 4  0.364609
 5  0.412979
 6  0.467372
 7  0.506060
 8  0.531732,
        RMSE
 1  0.119644
 2  0.227321
 3  0.316652
 4  0.370983
 5  0.432093
 6  0.479271
 7  0.512614
 8  0.532264,
        RMSE
 1  0.133501
 2  0.238803
 3  0.331629
 4  0.396830
 5  0.445450
 6  0.489547
 7  0.519186
 8  0.543240,
        RMSE
 1  0.136980
 2  0.248499
 3  0.343206
 4  0.410340
 5  0.450753
 6  0.497385
 7  0.523340
 8  0.541334]

In [20]:
rmses_by_in_size

[0.35712518841503593,
 0.37061634583151726,
 0.3781944256003429,
 0.3887481562459601,
 0.39701484879233406,
 0.40919766192935386,
 0.41633084064984616]