In [1]:
# Reload edited project modules automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2

In [2]:
import os

# Move up one level to the project root, but only when we are not already
# there: re-running this cell must not climb a further directory each time.
# The root is recognised by the `src` package that the imports below rely on.
if not os.path.isdir("src"):
    os.chdir("..")

In [3]:
import pandas as pd
import plotly.express as px
from src.time_series.preprocessing import PrePro
from src.time_series.validation import Validation
from src.time_series.analysis import Analysis
from statsmodels.tsa.stattools import acf
import plotly.graph_objects as go
from statsmodels.tsa.seasonal import STL
import statsmodels.api as sm
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.linear_model import LinearRegression

In [4]:
# Sanity check: show the working directory, expected to be the project root.
print(os.getcwd())

/Users/omare/Desktop/personal_project/time-series


In [42]:
# Locations of the train and test CSV files, relative to the current working directory.
file_path_train = "./data/DailyDelhiClimateTrain.csv"
file_path_test = "./data/DailyDelhiClimateTest.csv"

In [43]:
# Load both CSV files into DataFrames (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (file_path_train, file_path_test)
)

In [44]:
# One instance of each project helper: preprocessing, validation/plotting, analysis.
prepro, validation, analysis = PrePro(), Validation(), Analysis()

In [45]:
# Parse the string `date` column of both frames. Judging by the `info()` output
# further down, the call adds a `date_datetime` column to the frame it is given
# rather than returning a new one — TODO confirm in `PrePro.to_datetime`.
prepro.to_datetime(df=train_data, name_column="date")
prepro.to_datetime(df=test_data, name_column="date")

In [46]:
# Summarise the columns, non-null counts and dtypes of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           1462 non-null   object        
 1   meantemp       1462 non-null   float64       
 2   humidity       1462 non-null   float64       
 3   wind_speed     1462 non-null   float64       
 4   meanpressure   1462 non-null   float64       
 5   date_datetime  1462 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(4), object(1)
memory usage: 68.7+ KB


In [47]:
# Plot the raw mean-temperature series against the parsed datetime column so the
# x axis is a true time axis (the original `date` column holds plain strings).
validation.plot_time_series(df=train_data, x="date_datetime", y="meantemp", title="time series temperature")

In [48]:
# Stationarity check on the raw series; keeps the rolling-window plot and the
# ADF test result returned by the helper.
rolling_window_plot, adfuller_test = analysis.check_stationarity(
    train_data,
    name="meantemp",
    window_size=12,
)

Approximate stationarity 

 
 Mean 1: 24.88719668898111, 
 Mean 2: 26.103844622542415 

 
 Variance 1: 56.3065336564004, 
 Variance 2: 51.01552894002579


In [49]:
# Autocorrelation of the raw series over every lag it can support. The third
# argument is presumably the number of lags; the largest meaningful value is
# the number of observations minus one, so derive it from the data instead of
# hard-coding a figure that can exceed the series length.
max_lag = len(train_data) - 1
analysis.acf_plot(train_data, "meantemp", max_lag)

In [50]:
# Seasonal period, taken as the lag distance between two peaks of the autocorrelation.
seasonal_period = 362
fig, result = analysis.decomposition_time_series(
    train_data, "meantemp", seasonal_period, "Mean Temp"
)

In [51]:
# Display the figure returned by the decomposition.
fig

In [52]:
# Wrap each decomposition component in its own frame, paired with the training
# timestamps. All three share the column names `meantemp` and `date_datetime`.
residuals, trend, seasonality = (
    pd.DataFrame({"meantemp": component, "date_datetime": train_data["date_datetime"]})
    for component in (result.resid, result.trend, result.seasonal)
)

In [53]:
# Plot the residual component of the decomposition; the title names it so the
# figure is not mistaken for the raw temperature series.
validation.plot_time_series(df=residuals, x="date_datetime", y="meantemp", title="time series temperature residuals")

In [54]:
# Same stationarity check, now on the residual component. This overwrites the
# plot and test result obtained earlier for the raw series.
rolling_window_plot, adfuller_test = analysis.check_stationarity(
    residuals,
    name="meantemp",
    window_size=12,
)

Approximate stationarity 

 
 Mean 1: 0.025347510193403883, 
 Mean 2: -0.10292529207279326 

 
 Variance 1: 1.8227845827830578, 
 Variance 2: 1.7930337979031556


In [55]:
# Display the rolling-window plot produced by the stationarity check on the residuals.
rolling_window_plot

In [56]:
# Show the test result for the residuals: presumably (test statistic, p-value, verdict message).
adfuller_test

(-8.925036534829674,
 1.0167130843554541e-14,
 'Reject the null hypothesis. The time series is likely stationary.')

In [57]:
analysis.acf_plot(residuals, "meantemp", 40)
# It looks autoregressive to me. I am not sure whether there is also a moving-average component.

In [58]:
# Partial autocorrelation of the residuals (third argument is presumably the maximum lag).
analysis.pacf_plot(residuals, "meantemp", 40)

In [59]:
# Trend component as an (n, 1) column vector. Defined here so that the cell runs
# on a fresh kernel; the name was otherwise only available as leftover state.
# NOTE(review): reconstructed from the decomposed trend — confirm this matches
# the definition that was originally used.
train_trend_values = trend["meantemp"].to_numpy().reshape(-1, 1)
train_trend_values

array([[25.0804723 ],
       [25.07904093],
       [25.07761134],
       ...,
       [28.08549546],
       [28.09068027],
       [28.09586505]])

In [60]:
def _epoch_seconds(timestamps):
    """Convert a tz-naive datetime64 Series to float seconds since the Unix epoch."""
    return (timestamps - pd.Timestamp("1970-01-01")).dt.total_seconds()


# The regression uses time, expressed in epoch seconds, as its single feature.
# The reshape turns each series into an (n_samples, 1) column.
train_trend_timestamps = _epoch_seconds(trend["date_datetime"]).to_numpy().reshape(-1, 1)
test_trend_timestamps = _epoch_seconds(test_data["date_datetime"]).to_numpy().reshape(-1, 1)

# Create and fit the linear regression model on the decomposed training trend.
linear_model = LinearRegression()
linear_model.fit(train_trend_timestamps, trend["meantemp"])

# Predict the trend values for the test_data set.
forecast_trend_values = linear_model.predict(test_trend_timestamps)
forecast_trend = pd.DataFrame(
    {"date_datetime": test_data["date_datetime"], "forecast": forecast_trend_values}
)

In [61]:
# Forecast the residual component over the test period. (1, 0, 1) is presumably
# the ARIMA (p, d, q) order — confirm against `Analysis.forecast_component`.
# The trend is not forecast this way; it is extrapolated with the linear regression.
residual_order = (1, 0, 1)
forecast_residuals = analysis.forecast_component(residuals, test_data, residual_order).reset_index()

In [62]:
# First and last timestamps covered by the test set.
test_dates = test_data["date_datetime"]
start_date, end_date = test_dates.iloc[0], test_dates.iloc[-1]
print(start_date, end_date)


2017-01-01 00:00:00 2017-04-24 00:00:00


In [63]:
# Reuse the seasonal component of 1 January – 24 April 2013 as the seasonal
# forecast for the test window.
# NOTE(review): the window is hard-coded; it has to be kept in sync with the
# dates (and the number of rows) of the test set.
start_date_seasonal = pd.to_datetime("2013-01-01")
end_date_seasonal = pd.to_datetime("2013-04-24")

# Both bounds are inclusive.
in_window = seasonality["date_datetime"].between(start_date_seasonal, end_date_seasonal)

# Renumber the selected rows from 0; the previous index is kept as an `index` column.
forecast_seasonal = seasonality[in_window].reset_index()

In [64]:
# Recombine the three forecast components into the final temperature forecast.
# The addition aligns rows on the index of each component.
combined_forecast = (
    forecast_trend["forecast"]
    + forecast_seasonal["meantemp"]
    + forecast_residuals["forecast"]
)
forecast = pd.DataFrame({"meantemp": combined_forecast})
forecast["date_datetime"] = forecast_trend["date_datetime"]

In [68]:
# Score the combined forecast against the observed test temperatures and report
# both the squared error and its square root, rounded to two decimals.
mse = validation.mse_error(forecast, test_data, "meantemp")
rmse = np.sqrt(mse)
print(f"Mean squared error: {round(mse,2)}  \n Root Mean Squared error: {round(rmse,2)}")

Mean squared error: 14.2
 Root Mean Squared error: 3.77


In [66]:
# Plot the combined forecast against the observed test values of `meantemp`.
validation.evaluate_forecast_plot(forecast, test_data, "meantemp", "Mean Temperature" )