# Ten-minute dataset: XGBoost
Importing libraries and setting the path to the data file

In [None]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from darts import TimeSeries
from darts.models import NLinearModel
from darts.utils.missing_values import extract_subseries
from darts.dataprocessing.transformers.scaler import Scaler
from darts.metrics.metrics import mae, mse, mape
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

import warnings
warnings.filterwarnings('ignore')

# Station lookup table, one row per site: (station number, ICAO-style code, display name)
_STATION_ROWS = (
    ('93439', 'NZWNA', 'Wellington Aero'),
    ('93110', 'NZAAA', 'Auckland Aero'),
    ('93831', 'NZQNA', 'Queenstown Aero'),
    ('93781', 'NZCHA', 'Christchurch Aero'),
    ('93245', 'NZAPA', 'Taupo Airport Aws'),
)

# Keyed by station number (as a string); each value carries the code and the name
stations = {
    station_no: {'icao': icao_code, 'name': station_name}
    for station_no, icao_code, station_name in _STATION_ROWS
}

# Container for per-station DataFrames (starts out empty)
station_dfs = []

# NOTE: despite the name, this is the full path of the CSV file, not a directory
base_dir = r"C:\Users\sidha\Desktop\Final Dissertatoin\10min\ten_min.csv"

## Reading the combined CSV file for all stations

In [None]:
# Load the combined ten-minute dataset into a single DataFrame.
# `base_dir` holds the full path of the CSV file; the name `file_path` that was
# used here before is never defined in this notebook and raised a NameError.
combined_df = pd.read_csv(base_dir)

## Setting covariates and target

In [None]:
# Move the current index into the columns, index the frame by `time`,
# then discard every row that contains at least one missing value
combined_df = combined_df.reset_index()
combined_df = combined_df.set_index('time')
combined_df = combined_df.dropna()

# Covariate column groups. The three "future" groups are kept apart because the
# two ECMWF groups share base names and are told apart by a suffix added below.
future_covariates_wrf = [
    'wspk10m',
    'uuu30m',
    'vvv10m',
    'lapprs900hPa',
    'uuu10m',
    'wspk115m',
    'vvv850hPa',
    'wspk850hPa',
    'mgust',
    'vvv30m',
    'swdown',
    'uuu500m',
]
future_covariates_ecmwf_ensemble = [
    'v100m',
    'tdwpt',
    'f10m',
    'g10m',
    'gust10_inst',
    'f100m',
    'tmax',
    'u100m',
    'u10m',
    'v10m',
]
future_covariates_ecmwf_single = [
    'u100m',
    'f100m',
    'rh',
    'ws850p',
    'g10m',
    'u10m',
    'u925p',
    'u850p',
    'f10m',
    'v925p',
]

# Calendar columns used as past covariates
past_covariates = [
    'Data_Time_Day__',
    'Data_Time_Hour_',
    'Data_Time_Month',
    'Data_Time_Year_',
]

# Column identifying the station
static_covariates = ['station_id']

# Tag the ECMWF column names: `_EE` for the ensemble group, `_ES` for the single group
future_covariates_ecmwf_ensemble = [f"{name}_EE" for name in future_covariates_ecmwf_ensemble]
future_covariates_ecmwf_single = [f"{name}_ES" for name in future_covariates_ecmwf_single]

# Full future-covariate list: WRF first, then ECMWF ensemble, then ECMWF single
future_covariates = [
    *future_covariates_wrf,
    *future_covariates_ecmwf_ensemble,
    *future_covariates_ecmwf_single,
]

# Column to forecast
target = 'WindSpd_10MnAvg'

## Creating a timeseries object grouped by station id.

In [None]:
# Turn the `time` index back into a regular column so it can be named as
# `time_col` in the call below
combined_df = combined_df.reset_index()

# Build one multivariate TimeSeries per `station_id` group. Each series holds
# the target plus all past and future covariate columns as components, and
# carries `Stn_Numeric_ID_` as a static covariate.
series_list = TimeSeries.from_group_dataframe(
    combined_df,
    time_col="time",
    group_cols="station_id",  # One series per station
    static_cols=["Stn_Numeric_ID_"],  # Extra per-station static covariate
    value_cols=[target] + past_covariates + future_covariates,  # Time-varying columns
    fill_missing_dates=True,  # Insert missing timestamps so every series has a regular grid
    # 10-minute frequency. "10min" is used instead of "10T" because pandas
    # deprecated the "T" alias; both denote the same offset.
    freq="10min",
)

In [None]:
from darts.dataprocessing.transformers.static_covariates_transformer import StaticCovariatesTransformer

# Fit the static-covariate transformer on all station series, then use the
# fitted transformer to produce the transformed copies
static_cov_transformer = StaticCovariatesTransformer()
static_cov_transformer.fit(series_list)
series_list_transformed = static_cov_transformer.transform(series_list)

### Splitting data into a 60/20/20 ratio (train/valid/test)

In [None]:
# Share of each series used for training and for validation;
# whatever remains after those two parts becomes the test set
train_ratio = 0.6
valid_ratio = 0.2

# station_id -> {'train': ..., 'valid': ..., 'test': ...}
stations_splits = {}
for ts in series_list_transformed:
    # The station identifier is read from the series' static covariates
    station_id = ts.static_covariates['station_id'].iloc[0]

    # Cut points of the chronological split, expressed in timesteps
    n_total = len(ts)
    n_train = int(train_ratio * n_total)
    n_valid = int(valid_ratio * n_total)
    valid_end = n_train + n_valid

    # Consecutive, non-overlapping slices in time order
    train_ts, valid_ts, test_ts = ts[:n_train], ts[n_train:valid_end], ts[valid_end:]

    stations_splits[station_id] = {'train': train_ts, 'valid': valid_ts, 'test': test_ts}

    # Sanity check 1: the three parts together cover the whole series
    total_length = sum(len(part) for part in (train_ts, valid_ts, test_ts))
    assert total_length == n_total, f"Length mismatch for station {station_id}"

    # Sanity check 2: no timestamp appears in more than one part
    assert set(train_ts.time_index).isdisjoint(valid_ts.time_index), f"Overlap between train and valid for station {station_id}"
    assert set(valid_ts.time_index).isdisjoint(test_ts.time_index), f"Overlap between valid and test for station {station_id}"
    assert set(train_ts.time_index).isdisjoint(test_ts.time_index), f"Overlap between train and test for station {station_id}"

    # Sanity check 3: slicing kept the station's static covariate on every part
    assert train_ts.static_covariates['station_id'].iloc[0] == station_id, f"Static covariate mismatch in train set for station {station_id}"
    assert valid_ts.static_covariates['station_id'].iloc[0] == station_id, f"Static covariate mismatch in valid set for station {station_id}"
    assert test_ts.static_covariates['station_id'].iloc[0] == station_id, f"Static covariate mismatch in test set for station {station_id}"

print("All checks passed successfully.")

In [None]:
# Draw the target component of the three splits on one wide figure.
# NOTE: train_ts / valid_ts / test_ts are the values left over from the last
# iteration of the split loop, so only that one station is shown.
plt.figure(figsize=(15,5))
for split_ts in (train_ts, valid_ts, test_ts):
    split_ts[target].plot()

In [None]:
# Extraction of long enough time series for Darts
def extract_complete_subseries(ts, target, past_covariates, future_covariates, min_timesteps):
    """
    Split `ts` into subseries and keep only those that are long enough.

    The series is partitioned with darts' `extract_subseries` (default
    settings). Every resulting subseries with at least `min_timesteps`
    timesteps is kept and sliced into its target, past-covariate and
    future-covariate components.

    Parameters
    ----------
    ts
        Multivariate series containing the target and all covariate components.
    target
        Name of the target component.
    past_covariates
        Names of the past-covariate components.
    future_covariates
        Names of the future-covariate components.
    min_timesteps
        Minimum number of timesteps a subseries needs in order to be kept.

    Returns
    -------
    tuple of three lists
        `(extracted_target, extracted_past_cov, extracted_future_cov)`. The
        lists are index-aligned: position i in each list comes from the same
        subseries. All three are empty when no subseries is long enough.
    """
    # Partition and filter once; the result is shared by all three component
    # lists instead of re-running `extract_subseries` for each of them
    long_enough = [
        subserie for subserie in extract_subseries(ts)
        if subserie.n_timesteps >= min_timesteps
    ]

    extracted_target = [subserie[target] for subserie in long_enough]
    extracted_past_cov = [subserie[past_covariates] for subserie in long_enough]
    extracted_future_cov = [subserie[future_covariates] for subserie in long_enough]

    return extracted_target, extracted_past_cov, extracted_future_cov


In [None]:
# Function to extract station-specific subsets
def extract_station_subsets(splits, target, past_covariates, future_covariates, min_timesteps):
    """
    Apply `extract_complete_subseries` to the three splits of one station.

    `splits` is indexed with the keys 'train', 'valid' and 'test'; the other
    arguments are forwarded unchanged for each split.

    Returns a flat tuple ordered as
    (train_target, train_past_cov, train_future_cov,
     valid_target, valid_past_cov, valid_future_cov,
     test_target, test_past_cov, test_future_cov).
    """
    collected = []
    # The processing order fixes the order of the returned tuple
    for split_name in ('train', 'valid', 'test'):
        collected.extend(
            extract_complete_subseries(
                splits[split_name], target, past_covariates, future_covariates, min_timesteps
            )
        )
    return tuple(collected)


## Loop to fit and backtest stations for different output chunk lengths

In [None]:
import pandas as pd
from darts.models import XGBModel, NLinearModel
from darts.metrics import mae, mse, r2_score
import numpy as np
import matplotlib.pyplot as plt
import time

# Number of past timesteps fed to the model
input_chunk_length = 512
# Forecast horizons to sweep. Currently only the 18-step horizon is evaluated
# (range(18, 19) yields just 18); widen the range to compare several horizons.
output_chunk_lengths = range(18, 19)

# One dict per (station, horizon, model type), turned into a DataFrame at the end
metrics_list = []

# Train and evaluate models for each output chunk length
for output_chunk_length in output_chunk_lengths:
    print(f"Training with input_chunk_length={input_chunk_length} and output_chunk_length={output_chunk_length}")

    # A subseries must hold at least one full input window plus one full output window
    min_timesteps = input_chunk_length + output_chunk_length

    for model_type in ["XGBoost"]:  # Iterate over model types
        print(f"Using model type: {model_type}")

        # Fitted models of the current configuration, keyed by station
        models = {}

        for station_id, splits in stations_splits.items():
            print(f"  Training for station: {station_id}")

            # Extract subsets specific to this station
            (
                train_target, train_past_cov, train_future_cov,
                valid_target, valid_past_cov, valid_future_cov,
                test_target, test_past_cov, test_future_cov
            ) = extract_station_subsets(
                splits, target, past_covariates, future_covariates,
                min_timesteps=min_timesteps
            )

            # Without at least one usable subseries per split the station can
            # be neither fitted nor evaluated; skip it instead of aborting the
            # whole sweep
            if not train_target or not valid_target or not test_target:
                print(f"    Skipping station {station_id}: no subseries with at least {min_timesteps} timesteps in every split")
                continue

            # Initialize the model based on type
            if model_type == "XGBoost":
                model = XGBModel(
                    lags=list(range(-input_chunk_length, 0)),  # Match input chunk
                    lags_past_covariates=list(range(-input_chunk_length, 0)),
                    lags_future_covariates=list(range(0, output_chunk_length)),  # Future covariates
                    output_chunk_length=output_chunk_length,
                    use_static_covariates=True,
                    random_state=42
                )
            elif model_type == "NLinear":
                model = NLinearModel(
                    input_chunk_length=input_chunk_length,
                    output_chunk_length=output_chunk_length,
                    random_state=42
                )
            else:
                # Fail loudly rather than reuse a model left over from a previous iteration
                raise ValueError(f"Unsupported model type: {model_type}")

            # Time the fit only, so that "Training Time (s)" excludes backtesting
            start_time = time.time()

            # Fit the model using station-specific data
            model.fit(
                series=train_target,
                past_covariates=train_past_cov,
                future_covariates=train_future_cov,
                val_series=valid_target,
                val_past_covariates=valid_past_cov,
                val_future_covariates=valid_future_cov
            )

            end_time = time.time()
            training_time = end_time - start_time

            models[station_id] = model

            # Evaluate on the test subseries without retraining
            metrics = model.backtest(
                series=test_target,
                past_covariates=test_past_cov,
                future_covariates=test_future_cov,
                forecast_horizon=output_chunk_length,
                metric=[mae, mse, r2_score],
                last_points_only=True,
                retrain=False
            )

            # Average each metric over the test subseries (axis 0)
            average_metrics = np.mean(metrics, axis=0)
            print(f"    Station {station_id} - Average Metrics (MAE, MSE, R2): {average_metrics}")
            print(f"    Training time for station {station_id}: {training_time:.2f} seconds")
            # Append metrics for this station, configuration, and model type to the list
            metrics_list.append({
                "Station ID": station_id,
                "Model Type": model_type,
                "Input Chunk Length": input_chunk_length,
                "Output Chunk Length": output_chunk_length,
                "MAE": average_metrics[0],
                "MSE": average_metrics[1],
                "R2 Score": average_metrics[2],
                "Training Time (s)": training_time
            })

# Convert metrics list to a DataFrame
metrics_df = pd.DataFrame(metrics_list)

# Save the DataFrame to a CSV file
metrics_df.to_csv("metrics_combined_xgboost_nlinear_10min.csv", index=False)

# Print the resulting DataFrame
print(metrics_df)


## Plot and save metrics to csv file

In [None]:
# Start from the metrics collected by the training loop (`metrics_df`), keeping
# only the XGBoost rows. `metrics_xgboost_df` was used here without ever being
# defined, which raised a NameError. `.copy()` keeps `metrics_df` untouched.
metrics_xgboost_df = metrics_df[metrics_df["Model Type"] == "XGBoost"].copy()

# Transformed station identifier -> original station number.
# NOTE(review): this table is hard-coded; confirm it matches the encoding
# produced by the static covariate transformer for the current data.
scaled_to_original_mapping = {
    0.000000: 93439,
    2.0: 93831,
    3.0: 93110,
    4.0: 93245,
    1.000000: 93781,
}

# Replace scaled Station IDs with original numbers in the DataFrame
metrics_xgboost_df["Station ID"] = metrics_xgboost_df["Station ID"].map(scaled_to_original_mapping)

# Drop rows with unmapped Station IDs (`map` leaves NaN for keys missing from the table)
metrics_xgboost_df = metrics_xgboost_df.dropna(subset=["Station ID"]).copy()

# Convert Station IDs to integers
metrics_xgboost_df["Station ID"] = metrics_xgboost_df["Station ID"].astype(int)

# Plot MAE vs. Output Chunk Length for each station
for station_id in metrics_xgboost_df["Station ID"].unique():
    station_data = metrics_xgboost_df[metrics_xgboost_df["Station ID"] == station_id]
    plt.plot(
        station_data["Output Chunk Length"],
        station_data["MAE"],
        marker="o",  # Keeps a station visible when it has a single horizon (one point, no line)
        label=f"Station {station_id}"
    )

# Customize the plot
plt.xlabel("Output Chunk Length")
plt.ylabel("MAE")
plt.title("MAE vs Output Chunk Length for All Stations")
plt.legend(title="Station")
plt.grid(True)
plt.show()

# Save the updated DataFrame to a CSV
metrics_xgboost_df.to_csv("metrics_xgboost_10_min.csv", index=False)