In [1]:
!pip install pmdarima
!pip install streamlit

Collecting pmdarima
  Downloading pmdarima-2.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (7.8 kB)
Downloading pmdarima-2.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pmdarima
Successfully installed pmdarima-2.0.4
Collecting streamlit
  Downloading streamlit-1.42.1-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.42.1-py2.py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━

In [14]:
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from pmdarima import auto_arima
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from sklearn.preprocessing import MinMaxScaler
import math

# Função para calcular o erro dos modelos
def ml_error(model_name, y, yhat):
    mae = mean_absolute_error(y, yhat)
    mape = mean_absolute_percentage_error(y, yhat)
    rmse = np.sqrt(mean_squared_error(y, yhat))
    return pd.DataFrame({'Model Name': model_name, 'MAE': mae, 'MAPE': mape, 'RMSE': rmse}, index=[0])

# Carregar os dados
@st.cache_data
def load_data():
    df = pd.read_excel('preco_petroleo_ipea_base_2015_2025.xlsx')
    df.set_index('Data', inplace=True)
    df.index = pd.to_datetime(df.index)
    df = df.asfreq('D')  # Definir a frequência do índice de datas como diária
    return df

df = load_data()

# Exibir os dados
st.title('Análise de Preços do Petróleo')
st.write(df.head())

# Gráfico de distribuição de preços
st.subheader('Distribuição de Preços')
fig, ax = plt.subplots()
sns.histplot(df['Preco'], kde=False, ax=ax)
st.pyplot(fig)

# Engenharia de recursos
df['year'] = df.index.year
df['month'] = df.index.month
df['day'] = df.index.day
df['week_of_year'] = df.index.isocalendar().week
df['day_of_week'] = df.index.dayofweek

# Gráfico de linha dos preços
st.subheader('Preço do Petróleo por Mês')
fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(x=df.index, y='Preco', data=df, ax=ax)
st.pyplot(fig)

# Separar dados em treino e teste
df = df['Preco'].reset_index()
df.rename(columns={'Preco': 'y', 'Data': 'ds'}, inplace=True)
df['ds'] = pd.to_datetime(df['ds'])  # Garantir que a coluna 'ds' seja do tipo datetime
# Create features for linear regression based on date components
df['year'] = df['ds'].dt.year
df['month'] = df['ds'].dt.month
df['day'] = df['ds'].dt.day
df['week_of_year'] = df['ds'].dt.isocalendar().week
df['day_of_week'] = df['ds'].dt.dayofweek
# Split the data
X_train_lr = df[df['ds'] <= '2024-08-10'].drop(columns=['y', 'ds'])
y_train_lr = df[df['ds'] <= '2024-08-10']['y']
X_test_lr = df[df['ds'] > '2024-08-10'].drop(columns=['y', 'ds'])
y_test_lr = df[df['ds'] > '2024-08-10']['y']

#------------------------------------------------------------

# ... (previous code) ...

# Modelo ARIMA
st.subheader('Modelo ARIMA')
arima_model = ARIMA(X_train.set_index('ds')['y'], order=(2, 1, 24))
arima_fit = arima_model.fit()
forecast = arima_fit.get_forecast(steps=len(X_test))
forecast_values = forecast.predicted_mean

# Instead of dropping NaNs and aligning, fill NaNs with previous values:
forecast_values = forecast_values.fillna(method='ffill')

# Create a DataFrame for forecast values using the original X_test dates
forecast_df = pd.DataFrame({
    'ds': X_test['ds'],
    'forecast_values': forecast_values
})

# Reset index of both y_test and forecast_df to avoid duplicate index issues
y_test = y_test.reset_index(drop=True)
forecast_df = forecast_df.reset_index(drop=True)

# Ensure y_test and forecast_values have the same length and remove NaNs
# Remove rows with NaNs in either y_test or forecast_values
# This creates a boolean mask where True indicates a valid (non-NaN) value
valid_mask = y_test.notna() & forecast_df['forecast_values'].notna()

# If valid_mask is all False, it means there's no valid data to compare
# In this case, skip the error calculation and print a warning
if not valid_mask.any():
    st.warning("No valid data points to compare. Skipping ARIMA error calculation.")
else:
    # Apply the mask to both y_test and forecast_values
    y_test = y_test[valid_mask]
    forecast_values = forecast_df['forecast_values'][valid_mask].values

    # Calculate the error
    arima_result = ml_error('Arima Model', y_test, forecast_values)
    st.write(arima_result)

#------------------------------------------------------------

# Gráfico de previsão ARIMA
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x='ds', y='y', data=X_test, label='Real', marker='o', color='blue', linewidth=2, ax=ax)
sns.lineplot(x='ds', y='forecast_values', data=forecast_df, label='Predict', marker='o', color='orange', linewidth=2, ax=ax)
plt.ylim(50, 100)
plt.xlabel('Data')
plt.ylabel('Valor')
plt.title('Performance ARIMA - Dados Reais e Preditos')
plt.legend()
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
st.pyplot(fig)

# Modelo de Regressão Linear
st.subheader('Modelo de Regressão Linear')

# Ensure 'ds' is a datetime object
df['ds'] = pd.to_datetime(df['ds'])

# Filter data for training and testing based on date
X_train_lr = df[df['ds'] <= '2024-08-10'].drop(columns=['y', 'ds'])
y_train_lr = df[df['ds'] <= '2024-08-10']['y']
X_test_lr = df[df['ds'] > '2024-08-10'].drop(columns=['y', 'ds'])
y_test_lr = df[df['ds'] > '2024-08-10']['y']


# Drop rows with NaN values in y_train_lr or X_train_lr
train_mask = y_train_lr.notna() & X_train_lr.notna().all(axis=1)
X_train_lr = X_train_lr[train_mask]
y_train_lr = y_train_lr[train_mask]

# Drop rows with NaN values in y_test_lr or X_test_lr
test_mask = y_test_lr.notna() & X_test_lr.notna().all(axis=1)
X_test_lr = X_test_lr[test_mask]
y_test_lr = y_test_lr[test_mask]


modelo = LinearRegression()
modelo.fit(X_train_lr, y_train_lr)  # Now fit with cleaned data
y_pred = modelo.predict(X_test_lr)
lr_result = ml_error('Linear Regression', y_test_lr, y_pred)
st.write(lr_result)

# Gráfico de previsão Regressão Linear
forecast_lr_df = pd.DataFrame({
    'ds': X_test_lr.index,  # Use the index of X_test_lr for dates
    'y': y_test_lr,        # Use y_test_lr which has the same length as y_pred
    'y_pred': y_pred
})
forecast_lr_df = forecast_lr_df.reset_index() # Reset index to align 'ds' as a column
forecast_lr_df['ds'] = pd.to_datetime(forecast_lr_df['ds'])  # Convert 'ds' to datetime if needed

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x='ds', y='y', data=forecast_lr_df, label='Real', marker='o', color='blue', linewidth=2, ax=ax)
sns.lineplot(x='ds', y='y_pred', data=forecast_lr_df, label='Predict', marker='o', color='orange', linewidth=2, ax=ax)
plt.ylim(50, 100)
plt.xlabel('Data')
plt.ylabel('Valor')
plt.title('Performance Modelo Linear Regression - Dados Reais e Preditos')
plt.legend()
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
st.pyplot(fig)

# fim

2025-02-18 20:45:32.223 No runtime found, using MemoryCacheStorageManager
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  forecast_values = forecast_values.fillna(method='ffill')


DeltaGenerator()