# Forecasting Weather (Wind Speed) using Gradient Boosting (XGBoost)

## PRANAV THIAGARAJAN UMAPATHY 
## Student ID- 220366757

### Importing required packages

In [None]:
import math
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import scipy.io
import seaborn as sns
sns.set_style("darkgrid")
from statsmodels.tsa.stattools import adfuller
import warnings
warnings.filterwarnings('ignore')
from xgboost import XGBRegressor
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Load the raw weather observations from disk into a pandas DataFrame
weather_data = pd.read_csv(filepath_or_buffer="weather_dt.csv")

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts)
weather_data.info()

In [None]:
# Parse the 'datetime' column into pandas timestamps so it can be compared and sorted
weather_data["datetime"] = weather_data["datetime"].pipe(pd.to_datetime)

In [None]:
# Keep only the timestamp and the wind-speed reading, as an independent frame
wind_df = weather_data.loc[:, ['datetime', 'windspeed']].copy()

# Preview the top rows as a quick sanity check
preview = wind_df.head()
print(preview)

In [None]:
# Restrict the data to the study window (both end dates inclusive)
start_date = pd.to_datetime("2021-10-01")
end_date = pd.to_datetime("2023-09-30")
in_window = (wind_df["datetime"] >= start_date) & (wind_df["datetime"] <= end_date)

# Take an explicit copy so that downstream cells can modify the filtered frame
# without writing to a slice of wind_df (pandas' SettingWithCopyWarning).
filtered_data = wind_df.loc[in_window].copy()

In [None]:
# Make sure 'datetime' holds timestamps and put the rows in chronological order
# BEFORE splitting: with shuffle=False the split is purely positional, so the
# test set is only "the future" when the rows are already sorted by time.
# A stable sort keeps rows that share a timestamp in their original order.
filtered_data = filtered_data.assign(
    datetime=pd.to_datetime(filtered_data['datetime'], format='%Y-%m-%d')
).sort_values(by='datetime', kind='stable')

# 'windspeed' is the target variable; 'datetime' is kept only to index the series
X = filtered_data[['datetime']]
y = filtered_data['windspeed']

# Chronological hold-out: first 80% of the rows for training, last 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Time-indexed series of the 'windspeed' target for both partitions
train_series = pd.Series(y_train.values, index=X_train['datetime'])
test_series = pd.Series(y_test.values, index=X_test['datetime'])

### Forecast

In [None]:
# Feature engineering: create lag features
def create_lag_features(data, lag):
    """Return a copy of *data* extended with lagged 'windspeed' columns.

    For every i in 1..lag a column ``lag_i`` is added that holds the
    'windspeed' value from i rows earlier; the first i rows of ``lag_i``
    are NaN because no earlier observation exists for them.

    Args:
        data: DataFrame with a 'windspeed' column. Rows are assumed to be
            in time order, since the shift is purely positional.
        lag: Number of lag columns to create.

    Returns:
        A new DataFrame with the original columns plus ``lag_1`` ..
        ``lag_<lag>``. The frame passed in is left unmodified.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    lagged = data.copy()
    for i in range(1, lag + 1):
        lagged[f'lag_{i}'] = lagged['windspeed'].shift(i)
    return lagged

# Number of previous observations used as predictors
lag = 5

# Training matrix: rows with an incomplete lag window (NaN) are dropped
X_train = create_lag_features(pd.DataFrame({'windspeed': y_train}), lag).dropna()

# Test matrix: same treatment, but keep the row mask so that the dates of the
# surviving rows can be recovered for plotting (equivalent to .dropna()).
test_lagged = create_lag_features(pd.DataFrame({'windspeed': y_test}), lag)
test_rows_kept = test_lagged.notna().all(axis=1).to_numpy()
X_test = test_lagged[test_rows_kept]

# Define features and target variable for training
X_train_features = X_train.drop(['windspeed'], axis=1)
y_train_features = X_train['windspeed']

# Define features and target variable for testing
X_test_features = X_test.drop(['windspeed'], axis=1)
y_test_features = X_test['windspeed']

# Initialize and fit the XGBoost model
model = XGBRegressor(objective='reg:squarederror', n_estimators=100)
model.fit(X_train_features, y_train_features)

# Make predictions on the test set
y_pred = model.predict(X_test_features)

# Dates of the rows that were actually predicted. test_series is positionally
# aligned with y_test, so the same mask selects the matching timestamps.
# Taking the first len(y_test_features) dates instead would draw every point
# `lag` steps too early, because the dropped rows are at the start.
test_dates = test_series.index[test_rows_kept]

# Plot the actual vs. predicted values
plt.figure(figsize=(12, 6))
plt.plot(test_dates, y_test_features, label='Actual', color='blue')
plt.plot(test_dates, y_pred, label='Predicted', linestyle='dashed', color='green')
plt.title('Gradient Boost Forecasting')
plt.xlabel('Date')
plt.ylabel('Windspeed')
plt.legend()
plt.show()


### Model Evaluation

In [None]:
# Score the forecasts against the held-out observations
mse = mean_squared_error(y_test_features, y_pred)
mae = mean_absolute_error(y_test_features, y_pred)
# RMSE is the square root of the MSE computed above
rmse = sqrt(mse)

# Report the metrics: RMSE first, then MSE, then MAE
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Mean Absolute Error (MAE): {mae}')

# Gradient boosting method

Split the dataset into features (X) and the target variable (y).

Create lag features: Incorporate lagged values of the target variable as features to capture temporal dependencies, such as $y_{t-1}$ and $y_{t-2}$

## Initializing the model

$F_0(x) = \underset{\gamma}{\arg\min} \sum_{i=1}^{n} L(y_{i},\gamma)$

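Here, $F_0(x)$ is the initial estimate: the constant $\gamma$ that minimizes the total loss. For squared-error loss this is the mean of the windspeed values; for absolute-error loss it is the median.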

L is the loss function, which measures the difference between the predicted and actual values.

## Compute the residuals

$r_{t} = y_{t} - F_{m-1} (x_{t})$

The residual at time t is computed by subtracting the current model's prediction from the actual target value at time t.

## Training with a weak learner model (Decision tree)

Train a weak learner $h_m$ on the features X to predict the residuals $r_t$ of the current ensemble model.

If $F_{m-1}(x_{t})$ is the current ensemble model at iteration m-1, the residual $r_{t}$ is computed by

$r_t= y_t - F_{m-1}(x_{t})$

$r_t$ is the residual at time t, which represents the difference between the actual target variable $y_t$ and the prediction from the current ensemble model $F_{m-1}(x_{t})$

$y_t$ is the actual target variable at time t

$F_{m-1}(x_t)$ is the prediction from the current ensemble model at iteration m-1 for the features $x_t$ at time t



## Updating the model

The prediction from the weak learner $h_m(x_t)$, scaled by the learning rate $\gamma_m$, is added to the current model:

$F_m(x_t) = F_{m-1}(x_t) + \gamma_m h_m(x_t)$

The updated model $F_m(x_t)$ is now the ensemble model at iteration m that better captures the target variable's patterns.


$F_m(x_t)$ The updated ensemble model at iteration m for the features $x_t$ at time t

$F_{m-1}(x_t)$ The ensemble model from the previous iteration (m−1) for the features $x_t$ at time t.

$\gamma_m$ The learning rate for the weak learner at iteration m

$h_m(x_t)$ The prediction from the weak learner at iteration m for the features $x_t$ at time t



## Final model

After M iterations, where M is the predefined number of weak learners, the final model is the initial estimate plus the sum of all the weak learners' scaled contributions

$F(x_t) = F_0(x_t) + \gamma_1 h_1(x_t) + \gamma_2 h_2(x_t) + \dots + \gamma_M h_M(x_t)$

here, $F(x_t)$ represents the final ensemble model's prediction at time t

$F_0(x_t)$ is the initial model or the starting point (mean of the windspeed)

$\gamma_1, \gamma_2, \dots, \gamma_M$ are the step sizes for each weak learner

$h_1(x_t), h_2(x_t), \dots, h_M(x_t)$ are the predictions from each weak learner.