In [61]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from torch.utils.data import Dataset, DataLoader
import xgboost as xgb
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [62]:
# Load dataset
df = pd.read_csv("GlobalWeatherRepository.csv")  # Adjust filename accordingly
# print(df.head())

# Initialize LabelEncoder
label_encoder = LabelEncoder()
# Encode location_name column
df['location_name_encoded'] = label_encoder.fit_transform(df['location_name'])
df['moon_phase_encoded'] = label_encoder.fit_transform(df['moon_phase'])
# Drop the original location_name column
df = df.drop(columns=['location_name'])
df = df.drop(columns=['moon_phase'])

df['last_updated'] = pd.to_datetime(df['last_updated'])
df = df.sort_values('last_updated')

# Extract date-based features from 'last_updated'
df['hour'] = df['last_updated'].dt.hour
df['month'] = df['last_updated'].dt.month
df['dayofweek'] = df['last_updated'].dt.dayofweek

# Extract moonrise, moonset, sunrise, and sunset times in minutes from midnight
df['moonrise'] = pd.to_datetime(df['moonrise'], errors='coerce').dt.hour * 60 + pd.to_datetime(df['moonrise'], errors='coerce').dt.minute
df['moonset'] = pd.to_datetime(df['moonset'], errors='coerce').dt.hour * 60 + pd.to_datetime(df['moonset'], errors='coerce').dt.minute
df['sunrise'] = pd.to_datetime(df['sunrise'], errors='coerce').dt.hour * 60 + pd.to_datetime(df['sunrise'], errors='coerce').dt.minute
df['sunset'] = pd.to_datetime(df['sunset'], errors='coerce').dt.hour * 60 + pd.to_datetime(df['sunset'], errors='coerce').dt.minute

# Create lag features (lag of 1 day and 7 days for temperature)
df['temp_lag_1'] = df['temperature_celsius'].shift(1)
df['temp_lag_7'] = df['temperature_celsius'].shift(7)

# Select relevant features
features = ['temperature_celsius', 'humidity', 'pressure_mb', 'wind_kph', 'uv_index',
            'air_quality_Carbon_Monoxide', 'air_quality_Ozone', 'air_quality_Nitrogen_dioxide',
            'air_quality_Sulphur_dioxide', 'air_quality_PM2.5', 'air_quality_PM10',
            'dayofweek', 'hour', 'month', 'temp_lag_1', 'temp_lag_7', 'location_name_encoded', 
            'moonrise', 'moonset', 'sunrise', 'sunset', 'moon_phase_encoded', 'moon_illumination']
df = df[features].dropna()

# Normalize the data
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(df)

  df['moonrise'] = pd.to_datetime(df['moonrise'], errors='coerce').dt.hour * 60 + pd.to_datetime(df['moonrise'], errors='coerce').dt.minute
  df['moonrise'] = pd.to_datetime(df['moonrise'], errors='coerce').dt.hour * 60 + pd.to_datetime(df['moonrise'], errors='coerce').dt.minute
  df['moonset'] = pd.to_datetime(df['moonset'], errors='coerce').dt.hour * 60 + pd.to_datetime(df['moonset'], errors='coerce').dt.minute
  df['moonset'] = pd.to_datetime(df['moonset'], errors='coerce').dt.hour * 60 + pd.to_datetime(df['moonset'], errors='coerce').dt.minute
  df['sunrise'] = pd.to_datetime(df['sunrise'], errors='coerce').dt.hour * 60 + pd.to_datetime(df['sunrise'], errors='coerce').dt.minute
  df['sunrise'] = pd.to_datetime(df['sunrise'], errors='coerce').dt.hour * 60 + pd.to_datetime(df['sunrise'], errors='coerce').dt.minute
  df['sunset'] = pd.to_datetime(df['sunset'], errors='coerce').dt.hour * 60 + pd.to_datetime(df['sunset'], errors='coerce').dt.minute
  df['sunset'] = pd.to_datetime(df['su

In [63]:
### ----------------------------- ###
###        ARIMA MODEL            ###
### ----------------------------- ###
print("\nTraining ARIMA Model...")

# Train ARIMA on Temperature
arima_model = ARIMA(df['temperature_celsius'], order=(5,1,0))  # ARIMA(p,d,q)
arima_result = arima_model.fit()

# Forecast using ARIMA
arima_forecast = arima_result.forecast(steps=len(X_test))


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)



Training ARIMA Model...


  return get_prediction_index(
  return get_prediction_index(


In [64]:
### ----------------------------- ###
###        XGBOOST MODEL          ###
### ----------------------------- ###
print("\nTraining XGBoost Model...")

data =df.copy()

# Splitting data
X = data.drop(columns=['temperature_celsius'])  
y = data['temperature_celsius']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Train XGBoost Model
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100)
xgb_model.fit(X_train, y_train)


Training XGBoost Model...


In [65]:
### ----------------------------- ###
###    MODEL PERFORMANCE METRICS  ###
### ----------------------------- ###
print("\nModel Performance:")

def evaluate(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    accuracy = 100 - (mae / np.mean(y_true) * 100)
    print(f'MAE: {mae:.3f}, RMSE: {rmse:.3f}, Accuracy: {accuracy:.2f}%')

# Evaluate Each Model
xgb_predictions = xgb_model.predict(X_test)
evaluate(y_test, xgb_predictions, "XGBoost")

evaluate(y_test, arima_forecast, "ARIMA")


Model Performance:
MAE: 4.006, RMSE: 6.019, Accuracy: 78.09%
MAE: 9.016, RMSE: 12.057, Accuracy: 50.69%


In [66]:
# Prepare sequences for time series forecasting
def create_sequences(df, locations, seq_length):
    sequences, targets, locs = [], [], []
    for i in range(len(df) - seq_length):
        sequences.append(df[i:i+seq_length])
        targets.append(df[i + seq_length][0])  # Predicting temperature_celsius
        locs.append(locations.iloc[i + seq_length])  # Ensure correct indexing with iloc
    return np.array(sequences), np.array(targets), np.array(locs)

# Prepare the sequences
seq_length = 30
X, y, locs = create_sequences(data_scaled, df['location_name_encoded'], seq_length)

# Convert to PyTorch tensors
X_train, y_train, loc_train = torch.tensor(X[:-100], dtype=torch.float32), torch.tensor(y[:-100], dtype=torch.float32).unsqueeze(1), torch.tensor(locs[:-100], dtype=torch.long)
X_test, y_test, loc_test = torch.tensor(X[-100:], dtype=torch.float32), torch.tensor(y[-100:], dtype=torch.float32).unsqueeze(1), torch.tensor(locs[-100:], dtype=torch.long)

# Create PyTorch DataLoader
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y, locs):
        self.X = X
        self.y = y
        self.locs = locs
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], self.locs[idx]

train_loader = DataLoader(TimeSeriesDataset(X_train, y_train, loc_train), batch_size=16, shuffle=True)
test_loader = DataLoader(TimeSeriesDataset(X_test, y_test, loc_test), batch_size=16, shuffle=False)

In [None]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_locations, embed_dim):
        super(LSTMModel, self).__init__()
        self.embed = nn.Embedding(num_locations, embed_dim)
        self.lstm = nn.LSTM(input_size + embed_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x, locs):
        loc_embeds = self.embed(locs)
        x = torch.cat((x, loc_embeds.unsqueeze(1).repeat(1, x.size(1), 1)), dim=-1)
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])  # Get the last output from the LSTM
        return out


In [68]:
# Define Transformer Model with Location Embedding
class TransformerModel(nn.Module):
    def __init__(self, input_size, num_heads, num_layers, hidden_dim, num_locations, embed_dim):
        super(TransformerModel, self).__init__()
        embed_dim_adjusted = ((input_size + embed_dim) // num_heads) * num_heads  # Ensure divisibility
        self.embedding = nn.Embedding(num_locations, embed_dim)
        self.input_fc = nn.Linear(input_size + embed_dim, embed_dim_adjusted)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim_adjusted, nhead=num_heads, dim_feedforward=hidden_dim)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim_adjusted, 1)
    def forward(self, x, loc):
        loc_embed = self.embedding(loc).unsqueeze(1).expand(-1, x.shape[1], -1)
        x = torch.cat((x, loc_embed), dim=2)
        x = self.input_fc(x)
        x = self.transformer_encoder(x)
        return self.fc(x[:, -1, :])

In [69]:
# Calculate number of unique locations based on the encoded locations
num_locations = len(np.unique(df['location_name_encoded']))
# print(num_locations)

# Define embedding dimension and input size
embed_dim = 8  # Dimension of location embeddings
input_size = X.shape[2]  # Number of input features without embedding
num_heads = min(4, input_size + embed_dim) if (input_size + embed_dim) % 4 == 0 else 2

# Instantiate LSTM and Transformer models with location embeddings
lstm_model = LSTMModel(input_size, hidden_size=64, num_layers=2, num_locations=num_locations, embed_dim=embed_dim)
transformer_model = TransformerModel(input_size, num_heads, num_layers=2, hidden_dim=64, num_locations=num_locations, embed_dim=embed_dim)

# Modify training function to pass location data
def train_model(model, train_loader, patience=5, min_delta=0.0005):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0005)
    best_loss = float("inf")
    patience_counter = 0
    while patience_counter < patience:
        epoch_loss = 0
        for X_batch, y_batch, loc_batch in train_loader:
            # Ensure the location indices are within the valid range
            loc_batch = loc_batch.clamp(min=0, max=num_locations-1)
            # print(f"loc_batch min: {loc_batch.min()}, loc_batch max: {loc_batch.max()}")  # Check the range of loc_batch
            optimizer.zero_grad()
            output = model(X_batch, loc_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        epoch_loss /= len(train_loader)
        print(f'Loss: {epoch_loss}')
        if best_loss - epoch_loss > min_delta:
            best_loss = epoch_loss
            patience_counter = 0
        else:
            patience_counter += 1



In [70]:
print("Training LSTM Model...")
train_model(lstm_model, train_loader)

Training LSTM Model...
Loss: 0.00982834331258109
Loss: 0.003409601582515026
Loss: 0.0027192505361869125
Loss: 0.0023280387786547446
Loss: 0.002094842851467639
Loss: 0.0019359300785744776
Loss: 0.0018336463520437008
Loss: 0.0017420143725292996
Loss: 0.0016834848068099134
Loss: 0.0016339605020660216


In [71]:
print("Training Transformer Model...")
train_model(transformer_model, train_loader)

Training Transformer Model...
Loss: 0.01701719272890057
Loss: 0.007311205190783217
Loss: 0.004802127672553976
Loss: 0.003612351503643268
Loss: 0.0029920793305527552
Loss: 0.0026958439300238477
Loss: 0.0024952040745207997
Loss: 0.002347702252354929
Loss: 0.0022567836192945072
Loss: 0.0022100780995773254
Loss: 0.0021205046789705374
Loss: 0.0020745665907109705
Loss: 0.002044233042146208


In [72]:
# Modify evaluation function
def evaluate_model(model, X_test, y_test, loc_test):
    with torch.no_grad():
        loc_test = loc_test.clamp(min=0, max=num_locations-1)
        predictions = model(X_test, loc_test).numpy()
        y_true = y_test.numpy()
    predictions = scaler.inverse_transform(np.hstack([predictions, np.zeros((predictions.shape[0], input_size - 1))]))[:, 0]
    y_true = scaler.inverse_transform(np.hstack([y_true, np.zeros((y_true.shape[0], input_size - 1))]))[:, 0]
    mae = np.mean(np.abs(y_true - predictions))
    rmse = np.sqrt(np.mean((y_true - predictions) ** 2))
    accuracy = 100 - (mae / np.mean(y_true) * 100)
    print(f'MAE: {mae:.3f}, RMSE: {rmse:.3f}, Accuracy: {accuracy:.2f}%')


print("LSTM Model Evaluation:")
evaluate_model(lstm_model, X_test, y_test, loc_test)
print("Transformer Model Evaluation:")
evaluate_model(transformer_model, X_test, y_test, loc_test)

LSTM Model Evaluation:
MAE: 3.431, RMSE: 4.434, Accuracy: 85.10%
Transformer Model Evaluation:
MAE: 3.513, RMSE: 4.402, Accuracy: 84.74%
