# ARIMA

In [6]:
# Candidate ARIMA (p, d, q) orders
param_list = [
    (1, 1, 0),
    (2, 1, 0),
    (1, 1, 1),
    (2, 1, 1),
    (3, 1, 2),
]

## dataset:electricity

In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings("ignore")

import time

# NOTE: no model is fitted before the data is loaded. Training and prediction
# are timed per series inside the evaluation loop below; fitting here would
# read train_series / seq_len / pred_len before this cell defines them and
# raise NameError on a fresh kernel (Restart & Run All).

# data processing: load the CSV, index by timestamp, coerce every column to
# numeric (unparseable entries become NaN), then fill gaps forward and, for
# leading gaps, backward
df = pd.read_csv('../DSS5104_TeamWork/data/electricity.csv')
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
df = df.apply(pd.to_numeric, errors='coerce')
# DataFrame.fillna(method=...) is deprecated; ffill()/bfill() are the equivalents
df = df.ffill().bfill()

# Z-score each column
# NOTE(review): the scaler is fitted on the full series, including the rows
# later used as the test horizon -- confirm this is intended.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

# parameter settings
param_list = [(1,1,0), (2,1,0), (1,1,1), (2,1,1), (3,1,2)]
seq_len = 96        # number of most recent training points each ARIMA is fitted on
pred_len = 96       # forecast horizon (steps)
train_ratio = 0.77  # fraction of each series that precedes the forecast origin

# model training and evaluation
# For each ARIMA order: fit one model per column on the last `seq_len` points
# of the training split, forecast `pred_len` steps, and score the forecast
# against the values that immediately follow the split.
summary_results = []

for param in param_list:
    mse_list, mae_list, rmse_list, mape_list, mspe_list = [], [], [], [], []
    train_times, predict_times = [], []
    skipped = []  # (column, exception) for series that could not be fitted/scored

    for col in df_scaled.columns:
        series = df_scaled[col].values
        train_size = int(len(series) * train_ratio)
        train_series = series[:train_size]
        test_series = series[train_size:train_size + pred_len]

        try:
            # perf_counter is a monotonic, high-resolution clock, which suits
            # the millisecond-scale durations measured here
            train_start = time.perf_counter()
            model = ARIMA(train_series[-seq_len:], order=param)
            model_fit = model.fit()
            train_time = time.perf_counter() - train_start

            predict_start = time.perf_counter()
            forecast = model_fit.forecast(steps=pred_len)
            predict_time = time.perf_counter() - predict_start

            mse = mean_squared_error(test_series, forecast)
            mae = mean_absolute_error(test_series, forecast)
            rmse = np.sqrt(mse)

            # Percentage errors divide by the true value. Z-scored targets can
            # be exactly 0, which would give inf/nan and poison the averages,
            # so those points are excluded from MAPE/MSPE.
            nonzero = test_series != 0
            rel_err = (test_series[nonzero] - np.asarray(forecast)[nonzero]) / test_series[nonzero]
            mape = np.mean(np.abs(rel_err)) * 100 if rel_err.size else np.nan
            mspe = np.mean(np.square(rel_err)) * 100 if rel_err.size else np.nan
        except Exception as exc:
            # Best effort: a series that fails is left out of the averages but
            # recorded. KeyboardInterrupt is deliberately not caught, so the
            # run can still be stopped.
            skipped.append((col, exc))
            continue

        mse_list.append(mse)
        mae_list.append(mae)
        rmse_list.append(rmse)
        mape_list.append(mape)
        mspe_list.append(mspe)
        train_times.append(train_time)
        predict_times.append(predict_time)

    if skipped:
        first_col, first_exc = skipped[0]
        print(f"WARNING ARIMA{param}: skipped {len(skipped)} series; "
              f"first failure in {first_col!r}: {first_exc!r}")

    metric_columns = {
        'MSE (mean)': mse_list,
        'MAE (mean)': mae_list,
        'RMSE (mean)': rmse_list,
        'MAPE (mean)': mape_list,
        'MSPE (mean)': mspe_list,
        'Train Time (mean s)': train_times,
        'Predict Time (mean s)': predict_times,
    }
    row = {'ARIMA (p,d,q)': param}
    for name, values in metric_columns.items():
        # NaN (rather than a warning from an empty mean) when nothing was scored
        row[name] = np.nanmean(values) if np.any(np.isfinite(values)) else np.nan
    summary_results.append(row)

# output summary
summary_df = pd.DataFrame(summary_results)
print("✅ARIMA output summary：")
print(summary_df)

✅ARIMA output summary：
  ARIMA (p,d,q)  MSE (mean)  MAE (mean)  RMSE (mean)  MAPE (mean)  \
0     (1, 1, 0)    1.520146    0.931935     1.122130   626.804221   
1     (2, 1, 0)    1.459520    0.912953     1.102187   556.008825   
2     (1, 1, 1)    1.478330    0.924679     1.108324   615.709813   
3     (2, 1, 1)    1.220565    0.823749     1.003301   402.740179   
4     (3, 1, 2)    0.896729    0.723728     0.890235   352.798529   

    MSPE (mean)  Train Time (mean s)  Predict Time (mean s)  
0  2.899220e+06             0.013027               0.002518  
1  1.746539e+06             0.022007               0.002270  
2  2.620643e+06             0.038710               0.002397  
3  4.969518e+05             0.081400               0.002560  
4  5.034175e+05             0.207830               0.002644  


## dataset:ILI

In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings("ignore")

import time

# NOTE: no model is fitted before the data is loaded. Training and prediction
# are timed per series inside the evaluation loop below; fitting here would
# read train_series / seq_len / pred_len before this cell defines them and
# raise NameError on a fresh kernel (Restart & Run All).

# data processing: load the CSV, index by timestamp, coerce every column to
# numeric (unparseable entries become NaN), then fill gaps forward and, for
# leading gaps, backward
df = pd.read_csv('../DSS5104_TeamWork/data/national_illness.csv')
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
df = df.apply(pd.to_numeric, errors='coerce')
# DataFrame.fillna(method=...) is deprecated; ffill()/bfill() are the equivalents
df = df.ffill().bfill()

# Z-score each column
# NOTE(review): the scaler is fitted on the full series, including the rows
# later used as the test horizon -- confirm this is intended.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

# parameter settings
param_list = [(1,1,0), (2,1,0), (1,1,1), (2,1,1), (3,1,2)]
seq_len = 60        # number of most recent training points each ARIMA is fitted on
pred_len = 24       # forecast horizon (steps)
train_ratio = 0.77  # fraction of each series that precedes the forecast origin

# model training and evaluation
# For each ARIMA order: fit one model per column on the last `seq_len` points
# of the training split, forecast `pred_len` steps, and score the forecast
# against the values that immediately follow the split.
summary_results = []

for param in param_list:
    mse_list, mae_list, rmse_list, mape_list, mspe_list = [], [], [], [], []
    train_times, predict_times = [], []
    skipped = []  # (column, exception) for series that could not be fitted/scored

    for col in df_scaled.columns:
        series = df_scaled[col].values
        train_size = int(len(series) * train_ratio)
        train_series = series[:train_size]
        test_series = series[train_size:train_size + pred_len]

        try:
            # perf_counter is a monotonic, high-resolution clock, which suits
            # the millisecond-scale durations measured here
            train_start = time.perf_counter()
            model = ARIMA(train_series[-seq_len:], order=param)
            model_fit = model.fit()
            train_time = time.perf_counter() - train_start

            predict_start = time.perf_counter()
            forecast = model_fit.forecast(steps=pred_len)
            predict_time = time.perf_counter() - predict_start

            mse = mean_squared_error(test_series, forecast)
            mae = mean_absolute_error(test_series, forecast)
            rmse = np.sqrt(mse)

            # Percentage errors divide by the true value. Z-scored targets can
            # be exactly 0, which would give inf/nan and poison the averages,
            # so those points are excluded from MAPE/MSPE.
            nonzero = test_series != 0
            rel_err = (test_series[nonzero] - np.asarray(forecast)[nonzero]) / test_series[nonzero]
            mape = np.mean(np.abs(rel_err)) * 100 if rel_err.size else np.nan
            mspe = np.mean(np.square(rel_err)) * 100 if rel_err.size else np.nan
        except Exception as exc:
            # Best effort: a series that fails is left out of the averages but
            # recorded. KeyboardInterrupt is deliberately not caught, so the
            # run can still be stopped.
            skipped.append((col, exc))
            continue

        mse_list.append(mse)
        mae_list.append(mae)
        rmse_list.append(rmse)
        mape_list.append(mape)
        mspe_list.append(mspe)
        train_times.append(train_time)
        predict_times.append(predict_time)

    if skipped:
        first_col, first_exc = skipped[0]
        print(f"WARNING ARIMA{param}: skipped {len(skipped)} series; "
              f"first failure in {first_col!r}: {first_exc!r}")

    metric_columns = {
        'MSE (mean)': mse_list,
        'MAE (mean)': mae_list,
        'RMSE (mean)': rmse_list,
        'MAPE (mean)': mape_list,
        'MSPE (mean)': mspe_list,
        'Train Time (mean s)': train_times,
        'Predict Time (mean s)': predict_times,
    }
    row = {'ARIMA (p,d,q)': param}
    for name, values in metric_columns.items():
        # NaN (rather than a warning from an empty mean) when nothing was scored
        row[name] = np.nanmean(values) if np.any(np.isfinite(values)) else np.nan
    summary_results.append(row)

# output summary
summary_df = pd.DataFrame(summary_results)
print("✅ARIMA output summary：")
print(summary_df)

✅ARIMA output summary：
  ARIMA (p,d,q)  MSE (mean)  MAE (mean)  RMSE (mean)  MAPE (mean)  \
0     (1, 1, 0)    0.639663    0.597068     0.735715   259.343676   
1     (2, 1, 0)    0.702437    0.621719     0.766125   284.788794   
2     (1, 1, 1)    0.778307    0.649057     0.800735   300.761157   
3     (2, 1, 1)    0.778146    0.647824     0.800441   299.863246   
4     (3, 1, 2)    0.708276    0.621266     0.765755   277.591877   

   MSPE (mean)  Train Time (mean s)  Predict Time (mean s)  
0  4635.269474             0.016109               0.000964  
1  5313.260529             0.028181               0.000949  
2  5723.907070             0.049288               0.001166  
3  5681.742732             0.070122               0.001523  
4  5147.999971             0.207768               0.001829  


## dataset:Traffic

In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings("ignore")

import time

# NOTE: no model is fitted before the data is loaded. Training and prediction
# are timed per series inside the evaluation loop below; fitting here would
# read train_series / seq_len / pred_len before this cell defines them and
# raise NameError on a fresh kernel (Restart & Run All).

# data processing: load the CSV, index by timestamp, coerce every column to
# numeric (unparseable entries become NaN), then fill gaps forward and, for
# leading gaps, backward
df = pd.read_csv('../DSS5104_TeamWork/data/traffic.csv')
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
df = df.apply(pd.to_numeric, errors='coerce')
# DataFrame.fillna(method=...) is deprecated; ffill()/bfill() are the equivalents
df = df.ffill().bfill()

# Z-score each column
# NOTE(review): the scaler is fitted on the full series, including the rows
# later used as the test horizon -- confirm this is intended.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

# parameter settings
param_list = [(1,1,0), (2,1,0), (1,1,1), (2,1,1), (3,1,2)]
seq_len = 60        # number of most recent training points each ARIMA is fitted on
pred_len = 24       # forecast horizon (steps)
train_ratio = 0.77  # fraction of each series that precedes the forecast origin

# model training and evaluation
# For each ARIMA order: fit one model per column on the last `seq_len` points
# of the training split, forecast `pred_len` steps, and score the forecast
# against the values that immediately follow the split.
summary_results = []

for param in param_list:
    mse_list, mae_list, rmse_list, mape_list, mspe_list = [], [], [], [], []
    train_times, predict_times = [], []
    skipped = []  # (column, exception) for series that could not be fitted/scored

    for col in df_scaled.columns:
        series = df_scaled[col].values
        train_size = int(len(series) * train_ratio)
        train_series = series[:train_size]
        test_series = series[train_size:train_size + pred_len]

        try:
            # perf_counter is a monotonic, high-resolution clock, which suits
            # the millisecond-scale durations measured here
            train_start = time.perf_counter()
            model = ARIMA(train_series[-seq_len:], order=param)
            model_fit = model.fit()
            train_time = time.perf_counter() - train_start

            predict_start = time.perf_counter()
            forecast = model_fit.forecast(steps=pred_len)
            predict_time = time.perf_counter() - predict_start

            mse = mean_squared_error(test_series, forecast)
            mae = mean_absolute_error(test_series, forecast)
            rmse = np.sqrt(mse)

            # Percentage errors divide by the true value. Z-scored targets can
            # be exactly 0, which would give inf/nan and poison the averages,
            # so those points are excluded from MAPE/MSPE.
            nonzero = test_series != 0
            rel_err = (test_series[nonzero] - np.asarray(forecast)[nonzero]) / test_series[nonzero]
            mape = np.mean(np.abs(rel_err)) * 100 if rel_err.size else np.nan
            mspe = np.mean(np.square(rel_err)) * 100 if rel_err.size else np.nan
        except Exception as exc:
            # Best effort: a series that fails is left out of the averages but
            # recorded. KeyboardInterrupt is deliberately not caught, so the
            # run can still be stopped.
            skipped.append((col, exc))
            continue

        mse_list.append(mse)
        mae_list.append(mae)
        rmse_list.append(rmse)
        mape_list.append(mape)
        mspe_list.append(mspe)
        train_times.append(train_time)
        predict_times.append(predict_time)

    if skipped:
        first_col, first_exc = skipped[0]
        print(f"WARNING ARIMA{param}: skipped {len(skipped)} series; "
              f"first failure in {first_col!r}: {first_exc!r}")

    metric_columns = {
        'MSE (mean)': mse_list,
        'MAE (mean)': mae_list,
        'RMSE (mean)': rmse_list,
        'MAPE (mean)': mape_list,
        'MSPE (mean)': mspe_list,
        'Train Time (mean s)': train_times,
        'Predict Time (mean s)': predict_times,
    }
    row = {'ARIMA (p,d,q)': param}
    for name, values in metric_columns.items():
        # NaN (rather than a warning from an empty mean) when nothing was scored
        row[name] = np.nanmean(values) if np.any(np.isfinite(values)) else np.nan
    summary_results.append(row)

# output summary
summary_df = pd.DataFrame(summary_results)
print("✅ARIMA output summary：")
print(summary_df)

✅ARIMA output summary：
  ARIMA (p,d,q)  MSE (mean)  MAE (mean)  RMSE (mean)  MAPE (mean)  \
0     (1, 1, 0)    0.574231    0.546732     0.676584   326.036543   
1     (2, 1, 0)    0.578212    0.549931     0.675058   319.399620   
2     (1, 1, 1)    0.568561    0.552527     0.676693   316.486161   
3     (2, 1, 1)    0.598102    0.572389     0.698529   338.077015   
4     (3, 1, 2)    0.610030    0.573860     0.705521   346.655723   

     MSPE (mean)  Train Time (mean s)  Predict Time (mean s)  
0  253343.387227             0.012464               0.001319  
1  305162.964326             0.022583               0.001275  
2  218506.911170             0.036864               0.001256  
3  255117.590053             0.065671               0.001263  
4  215842.050457             0.156434               0.001313  


## dataset:Weather

In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings("ignore")

import time

# NOTE: no model is fitted before the data is loaded. Training and prediction
# are timed per series inside the evaluation loop below; fitting here would
# read train_series / seq_len / pred_len before this cell defines them and
# raise NameError on a fresh kernel (Restart & Run All).

# data processing: load the CSV, index by timestamp, coerce every column to
# numeric (unparseable entries become NaN), then fill gaps forward and, for
# leading gaps, backward
df = pd.read_csv('../DSS5104_TeamWork/data/weather.csv')
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
df = df.apply(pd.to_numeric, errors='coerce')
# DataFrame.fillna(method=...) is deprecated; ffill()/bfill() are the equivalents
df = df.ffill().bfill()

# Z-score each column
# NOTE(review): the scaler is fitted on the full series, including the rows
# later used as the test horizon -- confirm this is intended.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

# parameter settings
param_list = [(1,1,0), (2,1,0), (1,1,1), (2,1,1), (3,1,2)]
seq_len = 60        # number of most recent training points each ARIMA is fitted on
pred_len = 24       # forecast horizon (steps)
train_ratio = 0.77  # fraction of each series that precedes the forecast origin

# model training and evaluation
# For each ARIMA order: fit one model per column on the last `seq_len` points
# of the training split, forecast `pred_len` steps, and score the forecast
# against the values that immediately follow the split.
summary_results = []

for param in param_list:
    mse_list, mae_list, rmse_list, mape_list, mspe_list = [], [], [], [], []
    train_times, predict_times = [], []
    skipped = []  # (column, exception) for series that could not be fitted/scored

    for col in df_scaled.columns:
        series = df_scaled[col].values
        train_size = int(len(series) * train_ratio)
        train_series = series[:train_size]
        test_series = series[train_size:train_size + pred_len]

        try:
            # perf_counter is a monotonic, high-resolution clock, which suits
            # the millisecond-scale durations measured here
            train_start = time.perf_counter()
            model = ARIMA(train_series[-seq_len:], order=param)
            model_fit = model.fit()
            train_time = time.perf_counter() - train_start

            predict_start = time.perf_counter()
            forecast = model_fit.forecast(steps=pred_len)
            predict_time = time.perf_counter() - predict_start

            mse = mean_squared_error(test_series, forecast)
            mae = mean_absolute_error(test_series, forecast)
            rmse = np.sqrt(mse)

            # Percentage errors divide by the true value. Z-scored targets can
            # be exactly 0, which would give inf/nan and poison the averages,
            # so those points are excluded from MAPE/MSPE.
            nonzero = test_series != 0
            rel_err = (test_series[nonzero] - np.asarray(forecast)[nonzero]) / test_series[nonzero]
            mape = np.mean(np.abs(rel_err)) * 100 if rel_err.size else np.nan
            mspe = np.mean(np.square(rel_err)) * 100 if rel_err.size else np.nan
        except Exception as exc:
            # Best effort: a series that fails is left out of the averages but
            # recorded. KeyboardInterrupt is deliberately not caught, so the
            # run can still be stopped.
            skipped.append((col, exc))
            continue

        mse_list.append(mse)
        mae_list.append(mae)
        rmse_list.append(rmse)
        mape_list.append(mape)
        mspe_list.append(mspe)
        train_times.append(train_time)
        predict_times.append(predict_time)

    if skipped:
        first_col, first_exc = skipped[0]
        print(f"WARNING ARIMA{param}: skipped {len(skipped)} series; "
              f"first failure in {first_col!r}: {first_exc!r}")

    metric_columns = {
        'MSE (mean)': mse_list,
        'MAE (mean)': mae_list,
        'RMSE (mean)': rmse_list,
        'MAPE (mean)': mape_list,
        'MSPE (mean)': mspe_list,
        'Train Time (mean s)': train_times,
        'Predict Time (mean s)': predict_times,
    }
    row = {'ARIMA (p,d,q)': param}
    for name, values in metric_columns.items():
        # NaN (rather than a warning from an empty mean) when nothing was scored
        row[name] = np.nanmean(values) if np.any(np.isfinite(values)) else np.nan
    summary_results.append(row)

# output summary
summary_df = pd.DataFrame(summary_results)
print("✅ARIMA output summary：")
print(summary_df)

✅ARIMA output summary：
  ARIMA (p,d,q)  MSE (mean)  MAE (mean)  RMSE (mean)  MAPE (mean)  \
0     (1, 1, 0)    0.034974    0.060279     0.069774    12.919488   
1     (2, 1, 0)    0.032207    0.056784     0.066093    12.129932   
2     (1, 1, 1)    0.030943    0.051624     0.061287    11.308488   
3     (2, 1, 1)    0.031284    0.052588     0.062153    11.251691   
4     (3, 1, 2)    0.026288    0.054022     0.063327    11.852916   

   MSPE (mean)  Train Time (mean s)  Predict Time (mean s)  
0     5.407654             0.012470               0.001313  
1     5.151483             0.030963               0.001056  
2     7.520504             0.063833               0.000849  
3     4.318399             0.081198               0.001041  
4     4.696286             0.150500               0.000861  


## dataset:Exchange

In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings("ignore")

import time

# NOTE: no model is fitted before the data is loaded. Training and prediction
# are timed per series inside the evaluation loop below; fitting here would
# read train_series / seq_len / pred_len before this cell defines them and
# raise NameError on a fresh kernel (Restart & Run All).

# data processing: load the CSV, index by timestamp, coerce every column to
# numeric (unparseable entries become NaN), then fill gaps forward and, for
# leading gaps, backward
df = pd.read_csv('../DSS5104_TeamWork/data/exchange_rate.csv')
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
df = df.apply(pd.to_numeric, errors='coerce')
# DataFrame.fillna(method=...) is deprecated; ffill()/bfill() are the equivalents
df = df.ffill().bfill()

# Z-score each column
# NOTE(review): the scaler is fitted on the full series, including the rows
# later used as the test horizon -- confirm this is intended.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

# parameter settings
param_list = [(1,1,0), (2,1,0), (1,1,1), (2,1,1), (3,1,2)]
seq_len = 60        # number of most recent training points each ARIMA is fitted on
pred_len = 24       # forecast horizon (steps)
train_ratio = 0.77  # fraction of each series that precedes the forecast origin

# model training and evaluation
# For each ARIMA order: fit one model per column on the last `seq_len` points
# of the training split, forecast `pred_len` steps, and score the forecast
# against the values that immediately follow the split.
summary_results = []

for param in param_list:
    mse_list, mae_list, rmse_list, mape_list, mspe_list = [], [], [], [], []
    train_times, predict_times = [], []
    skipped = []  # (column, exception) for series that could not be fitted/scored

    for col in df_scaled.columns:
        series = df_scaled[col].values
        train_size = int(len(series) * train_ratio)
        train_series = series[:train_size]
        test_series = series[train_size:train_size + pred_len]

        try:
            # perf_counter is a monotonic, high-resolution clock, which suits
            # the millisecond-scale durations measured here
            train_start = time.perf_counter()
            model = ARIMA(train_series[-seq_len:], order=param)
            model_fit = model.fit()
            train_time = time.perf_counter() - train_start

            predict_start = time.perf_counter()
            forecast = model_fit.forecast(steps=pred_len)
            predict_time = time.perf_counter() - predict_start

            mse = mean_squared_error(test_series, forecast)
            mae = mean_absolute_error(test_series, forecast)
            rmse = np.sqrt(mse)

            # Percentage errors divide by the true value. Z-scored targets can
            # be exactly 0, which would give inf/nan and poison the averages,
            # so those points are excluded from MAPE/MSPE.
            nonzero = test_series != 0
            rel_err = (test_series[nonzero] - np.asarray(forecast)[nonzero]) / test_series[nonzero]
            mape = np.mean(np.abs(rel_err)) * 100 if rel_err.size else np.nan
            mspe = np.mean(np.square(rel_err)) * 100 if rel_err.size else np.nan
        except Exception as exc:
            # Best effort: a series that fails is left out of the averages but
            # recorded. KeyboardInterrupt is deliberately not caught, so the
            # run can still be stopped.
            skipped.append((col, exc))
            continue

        mse_list.append(mse)
        mae_list.append(mae)
        rmse_list.append(rmse)
        mape_list.append(mape)
        mspe_list.append(mspe)
        train_times.append(train_time)
        predict_times.append(predict_time)

    if skipped:
        first_col, first_exc = skipped[0]
        print(f"WARNING ARIMA{param}: skipped {len(skipped)} series; "
              f"first failure in {first_col!r}: {first_exc!r}")

    metric_columns = {
        'MSE (mean)': mse_list,
        'MAE (mean)': mae_list,
        'RMSE (mean)': rmse_list,
        'MAPE (mean)': mape_list,
        'MSPE (mean)': mspe_list,
        'Train Time (mean s)': train_times,
        'Predict Time (mean s)': predict_times,
    }
    row = {'ARIMA (p,d,q)': param}
    for name, values in metric_columns.items():
        # NaN (rather than a warning from an empty mean) when nothing was scored
        row[name] = np.nanmean(values) if np.any(np.isfinite(values)) else np.nan
    summary_results.append(row)

# output summary
summary_df = pd.DataFrame(summary_results)
print("✅ARIMA output summary：")
print(summary_df)

✅ARIMA output summary：
  ARIMA (p,d,q)  MSE (mean)  MAE (mean)  RMSE (mean)  MAPE (mean)  \
0     (1, 1, 0)    0.042029    0.166289     0.184061    16.358474   
1     (2, 1, 0)    0.041135    0.164464     0.182103    15.919832   
2     (1, 1, 1)    0.041965    0.166754     0.184517    15.920315   
3     (2, 1, 1)    0.040957    0.164124     0.181749    15.863293   
4     (3, 1, 2)    0.042987    0.168868     0.186271    16.122773   

   MSPE (mean)  Train Time (mean s)  Predict Time (mean s)  
0     7.551899             0.016375               0.002191  
1     6.922385             0.032552               0.000910  
2     6.709979             0.053335               0.001464  
3     6.852107             0.067804               0.000714  
4     6.812041             0.189027               0.000973  


# GradientBoosting

## dataset:electricity

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import time

# 1. Load and clean the data: index by timestamp, coerce every column to
# numeric (unparseable entries become NaN), then fill gaps forward and, for
# leading gaps, backward
df = pd.read_csv('../DSS5104_TeamWork/data/electricity.csv')
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
df = df.apply(pd.to_numeric, errors='coerce')
# DataFrame.fillna(method=...) is deprecated; ffill()/bfill() are the equivalents
df = df.ffill().bfill()

# 2. Normalize with Z-score
# NOTE(review): the scaler is fitted on the full series, including the rows
# that end up in the test windows -- confirm this is intended.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

# 3. Parameters
seq_len = 336       # length of each input window
pred_len = 96       # length of each target window (only its first step is fitted/scored below)
train_ratio = 0.77  # fraction of the windows used for training
metrics = []        # one dict of scores and timings per variable

# Start total timer
total_start = time.time()
checkpoints = []    # per-variable wall time, used for the running averages

# 4. Loop over each variable: build sliding-window samples, fit a gradient
# boosting regressor on the chronologically first windows, and score its
# one-step-ahead predictions on the remaining windows
for i, col in enumerate(df_scaled.columns):
    print(f"[{i+1}/{len(df_scaled.columns)}] Running: {col}")
    series = df_scaled[col].values

    # Create input/output samples: sample j pairs the seq_len values starting
    # at j with the pred_len values that follow them. Built with a vectorised
    # sliding window instead of a Python loop; the sample count is unchanged.
    n_samples = len(series) - seq_len - pred_len
    if n_samples <= 0:
        print(f"❌ Skipped {col} due to error: series shorter than seq_len + pred_len")
        continue
    X = np.array(np.lib.stride_tricks.sliding_window_view(series, seq_len)[:n_samples])
    y = np.array(np.lib.stride_tricks.sliding_window_view(series[seq_len:], pred_len)[:n_samples])

    # Split into train/test sets (chronological, no shuffling)
    train_size = int(len(X) * train_ratio)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    try:
        # perf_counter is a monotonic, high-resolution clock for the short
        # per-variable durations; the total timer keeps using time.time()
        start_time = time.perf_counter()

        # Train the model (1-step ahead): only the first step of each target
        # window is used. random_state pins the estimator's internal
        # randomness so reruns give the same scores.
        start_train = time.perf_counter()
        model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
        model.fit(X_train, y_train[:, 0])
        train_time = time.perf_counter() - start_train

        # Predict
        start_pred = time.perf_counter()
        y_pred = model.predict(X_test)
        predict_time = time.perf_counter() - start_pred

        y_true = y_test[:, 0]

        # Evaluation metrics
        mse = mean_squared_error(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        # Percentage errors divide by the true value. Z-scored targets can be
        # exactly 0, which would give inf/nan, so those points are excluded.
        nonzero = y_true != 0
        rel_err = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
        mape = np.mean(np.abs(rel_err)) * 100 if rel_err.size else np.nan
        mspe = np.mean(np.square(rel_err)) * 100 if rel_err.size else np.nan

        metrics.append({
            'Variable': col,
            'MSE': mse,
            'MAE': mae,
            'RMSE': rmse,
            'MAPE (%)': mape,
            'MSPE (%)': mspe,
            'Train Time (s)': train_time,
            'Predict Time (s)': predict_time
        })

        elapsed = time.perf_counter() - start_time
        checkpoints.append(elapsed)

        # Print per-variable time
        print(f"✅ Done: {col} in {elapsed:.2f} sec")

        # Checkpoint every 5 variables
        if (i + 1) % 5 == 0:
            print(f"🕒 Avg time per variable (last 5): {np.mean(checkpoints[-5:]):.2f} sec")

    except Exception as e:
        print(f"❌ Skipped {col} due to error: {e}")
        continue

# End total timer
total_elapsed = time.time() - total_start

# 5. Show results (the column means skip NaN entries)
result_df = pd.DataFrame(metrics)
print("\n✅ Average performance across all variables:")
print(result_df.mean(numeric_only=True))

print(f"\n🕒 Total runtime: {total_elapsed:.2f} seconds")

[1/321] Running: 0
✅ Done: 0 in 77.25 sec
[2/321] Running: 1
✅ Done: 1 in 94.34 sec
[3/321] Running: 2
✅ Done: 2 in 44.55 sec
[4/321] Running: 3
✅ Done: 3 in 125.35 sec
[5/321] Running: 4
✅ Done: 4 in 116.04 sec
🕒 Avg time per variable (last 5): 91.50 sec
[6/321] Running: 5
✅ Done: 5 in 126.72 sec
[7/321] Running: 6
✅ Done: 6 in 73.28 sec
[8/321] Running: 7
✅ Done: 7 in 137.76 sec
[9/321] Running: 8
✅ Done: 8 in 120.08 sec
[10/321] Running: 9
✅ Done: 9 in 123.59 sec
🕒 Avg time per variable (last 5): 116.29 sec
[11/321] Running: 10
✅ Done: 10 in 107.47 sec
[12/321] Running: 11
✅ Done: 11 in 124.29 sec
[13/321] Running: 12
✅ Done: 12 in 112.05 sec
[14/321] Running: 13
✅ Done: 13 in 105.26 sec
[15/321] Running: 14
✅ Done: 14 in 109.00 sec
🕒 Avg time per variable (last 5): 111.61 sec
[16/321] Running: 15
✅ Done: 15 in 148.95 sec
[17/321] Running: 16
✅ Done: 16 in 74.99 sec
[18/321] Running: 17
✅ Done: 17 in 106.74 sec
[19/321] Running: 18
✅ Done: 18 in 137.30 sec
[20/321] Running: 19
✅ Don

KeyboardInterrupt: 

## dataset:ILI

In [27]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import time

# 1. Load and clean the data: index by timestamp, coerce every column to
# numeric (unparseable entries become NaN), then fill gaps forward and, for
# leading gaps, backward
df = pd.read_csv('../DSS5104_TeamWork/data/national_illness.csv')
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
df = df.apply(pd.to_numeric, errors='coerce')
# DataFrame.fillna(method=...) is deprecated; ffill()/bfill() are the equivalents
df = df.ffill().bfill()

# 2. Normalize with Z-score
# NOTE(review): the scaler is fitted on the full series, including the rows
# that end up in the test windows -- confirm this is intended.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

# 3. Parameters
seq_len = 60        # length of each input window
pred_len = 60       # length of each target window (only its first step is fitted/scored below)
train_ratio = 0.77  # fraction of the windows used for training
metrics = []        # one dict of scores and timings per variable

# Start total timer
total_start = time.time()
checkpoints = []    # per-variable wall time, used for the running averages

# 4. Loop over each variable: build sliding-window samples, fit a gradient
# boosting regressor on the chronologically first windows, and score its
# one-step-ahead predictions on the remaining windows
for i, col in enumerate(df_scaled.columns):
    print(f"[{i+1}/{len(df_scaled.columns)}] Running: {col}")
    series = df_scaled[col].values

    # Create input/output samples: sample j pairs the seq_len values starting
    # at j with the pred_len values that follow them. Built with a vectorised
    # sliding window instead of a Python loop; the sample count is unchanged.
    n_samples = len(series) - seq_len - pred_len
    if n_samples <= 0:
        print(f"❌ Skipped {col} due to error: series shorter than seq_len + pred_len")
        continue
    X = np.array(np.lib.stride_tricks.sliding_window_view(series, seq_len)[:n_samples])
    y = np.array(np.lib.stride_tricks.sliding_window_view(series[seq_len:], pred_len)[:n_samples])

    # Split into train/test sets (chronological, no shuffling)
    train_size = int(len(X) * train_ratio)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    try:
        # perf_counter is a monotonic, high-resolution clock for the short
        # per-variable durations; the total timer keeps using time.time()
        start_time = time.perf_counter()

        # Train the model (1-step ahead): only the first step of each target
        # window is used. random_state pins the estimator's internal
        # randomness so reruns give the same scores.
        start_train = time.perf_counter()
        model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
        model.fit(X_train, y_train[:, 0])
        train_time = time.perf_counter() - start_train

        # Predict
        start_pred = time.perf_counter()
        y_pred = model.predict(X_test)
        predict_time = time.perf_counter() - start_pred

        y_true = y_test[:, 0]

        # Evaluation metrics
        mse = mean_squared_error(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        # Percentage errors divide by the true value. Z-scored targets can be
        # exactly 0, which would give inf/nan, so those points are excluded.
        nonzero = y_true != 0
        rel_err = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
        mape = np.mean(np.abs(rel_err)) * 100 if rel_err.size else np.nan
        mspe = np.mean(np.square(rel_err)) * 100 if rel_err.size else np.nan

        metrics.append({
            'Variable': col,
            'MSE': mse,
            'MAE': mae,
            'RMSE': rmse,
            'MAPE (%)': mape,
            'MSPE (%)': mspe,
            'Train Time (s)': train_time,
            'Predict Time (s)': predict_time
        })

        elapsed = time.perf_counter() - start_time
        checkpoints.append(elapsed)

        # Print per-variable time
        print(f"✅ Done: {col} in {elapsed:.2f} sec")

        # Checkpoint every 5 variables
        if (i + 1) % 5 == 0:
            print(f"🕒 Avg time per variable (last 5): {np.mean(checkpoints[-5:]):.2f} sec")

    except Exception as e:
        print(f"❌ Skipped {col} due to error: {e}")
        continue

# End total timer
total_elapsed = time.time() - total_start

# 5. Show results (the column means skip NaN entries)
result_df = pd.DataFrame(metrics)
print("\n✅ Average performance across all variables:")
print(result_df.mean(numeric_only=True))

print(f"\n🕒 Total runtime: {total_elapsed:.2f} seconds")


[1/7] Running: % WEIGHTED ILI
✅ Done: % WEIGHTED ILI in 1.59 sec
[2/7] Running: %UNWEIGHTED ILI
✅ Done: %UNWEIGHTED ILI in 1.54 sec
[3/7] Running: AGE 0-4
✅ Done: AGE 0-4 in 1.49 sec
[4/7] Running: AGE 5-24
✅ Done: AGE 5-24 in 1.65 sec
[5/7] Running: ILITOTAL
✅ Done: ILITOTAL in 1.57 sec
🕒 Avg time per variable (last 5): 1.57 sec
[6/7] Running: NUM. OF PROVIDERS
✅ Done: NUM. OF PROVIDERS in 1.37 sec
[7/7] Running: OT
✅ Done: OT in 1.44 sec

✅ Average performance across all variables:
MSE                   0.593648
MAE                   0.385216
RMSE                  0.732177
MAPE (%)             55.317933
MSPE (%)            943.581557
Train Time (s)        1.521770
Predict Time (s)      0.001505
dtype: float64

🕒 Total runtime: 10.68 seconds


## dataset:Traffic

In [23]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import time

# 1. Load and clean the data: index by timestamp, coerce every column to
# numeric (unparseable entries become NaN), then fill gaps forward and, for
# leading gaps, backward
df = pd.read_csv('../DSS5104_TeamWork/data/traffic.csv')
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
df = df.apply(pd.to_numeric, errors='coerce')
# DataFrame.fillna(method=...) is deprecated; ffill()/bfill() are the equivalents
df = df.ffill().bfill()

# 2. Normalize with Z-score
# NOTE(review): the scaler is fitted on the full series, including the rows
# that end up in the test windows -- confirm this is intended.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

# 3. Parameters
seq_len = 96        # length of each input window
pred_len = 96       # length of each target window (only its first step is fitted/scored below)
train_ratio = 0.77  # fraction of the windows used for training
metrics = []        # one dict of scores and timings per variable

# Start total timer
total_start = time.time()
checkpoints = []    # per-variable wall time, used for the running averages

# 4. Loop over each variable: build sliding-window samples, fit a gradient
# boosting regressor on the chronologically first windows, and score its
# one-step-ahead predictions on the remaining windows
for i, col in enumerate(df_scaled.columns):
    print(f"[{i+1}/{len(df_scaled.columns)}] Running: {col}")
    series = df_scaled[col].values

    # Create input/output samples: sample j pairs the seq_len values starting
    # at j with the pred_len values that follow them. Built with a vectorised
    # sliding window instead of a Python loop; the sample count is unchanged.
    n_samples = len(series) - seq_len - pred_len
    if n_samples <= 0:
        print(f"❌ Skipped {col} due to error: series shorter than seq_len + pred_len")
        continue
    X = np.array(np.lib.stride_tricks.sliding_window_view(series, seq_len)[:n_samples])
    y = np.array(np.lib.stride_tricks.sliding_window_view(series[seq_len:], pred_len)[:n_samples])

    # Split into train/test sets (chronological, no shuffling)
    train_size = int(len(X) * train_ratio)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    try:
        # perf_counter is a monotonic, high-resolution clock for the short
        # per-variable durations; the total timer keeps using time.time()
        start_time = time.perf_counter()

        # Train the model (1-step ahead): only the first step of each target
        # window is used. random_state pins the estimator's internal
        # randomness so reruns give the same scores.
        start_train = time.perf_counter()
        model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
        model.fit(X_train, y_train[:, 0])
        train_time = time.perf_counter() - start_train

        # Predict
        start_pred = time.perf_counter()
        y_pred = model.predict(X_test)
        predict_time = time.perf_counter() - start_pred

        y_true = y_test[:, 0]

        # Evaluation metrics
        mse = mean_squared_error(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        # Percentage errors divide by the true value. Z-scored targets can be
        # exactly 0, which would give inf/nan, so those points are excluded.
        nonzero = y_true != 0
        rel_err = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
        mape = np.mean(np.abs(rel_err)) * 100 if rel_err.size else np.nan
        mspe = np.mean(np.square(rel_err)) * 100 if rel_err.size else np.nan

        metrics.append({
            'Variable': col,
            'MSE': mse,
            'MAE': mae,
            'RMSE': rmse,
            'MAPE (%)': mape,
            'MSPE (%)': mspe,
            'Train Time (s)': train_time,
            'Predict Time (s)': predict_time
        })

        elapsed = time.perf_counter() - start_time
        checkpoints.append(elapsed)

        # Print per-variable time
        print(f"✅ Done: {col} in {elapsed:.2f} sec")

        # Checkpoint every 5 variables
        if (i + 1) % 5 == 0:
            print(f"🕒 Avg time per variable (last 5): {np.mean(checkpoints[-5:]):.2f} sec")

    except Exception as e:
        print(f"❌ Skipped {col} due to error: {e}")
        continue

# End total timer
total_elapsed = time.time() - total_start

# 5. Show results (the column means skip NaN entries)
result_df = pd.DataFrame(metrics)
print("\n✅ Average performance across all variables:")
print(result_df.mean(numeric_only=True))

print(f"\n🕒 Total runtime: {total_elapsed:.2f} seconds")


[1/862] Running: 0
✅ Done: 0 in 31.41 sec
[2/862] Running: 1


KeyboardInterrupt: 

## dataset:Weather

In [34]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import time

# GradientBoostingRegressor baseline on the weather dataset.
# Every column is forecast independently: a model learns to map the previous
# `seq_len` z-scored values to the single next value (1-step ahead), and error
# metrics plus train/predict timings are collected per column.

# 1. Load and clean the data
df = pd.read_csv('../DSS5104_TeamWork/data/weather.csv')
df['date'] = pd.to_datetime(df['date'])
# Non-numeric cells are coerced to NaN, then gaps are filled forward and any
# leading NaNs are filled backward. `.ffill()`/`.bfill()` replace the deprecated
# `fillna(method=...)` spelling and give the same result.
df = (
    df.set_index('date')
      .apply(pd.to_numeric, errors='coerce')
      .ffill()
      .bfill()
)

# 2. Normalize with Z-score
# NOTE(review): the scaler is fit on the whole frame, so statistics of the test
# period also shape the scaling - confirm this is intended.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

# 3. Parameters
seq_len = 336       # number of past values fed to the model
pred_len = 96       # only limits how many windows are built; the model predicts 1 step
train_ratio = 0.77  # leading share of the windows used for training
metrics = []

# Start total timer
total_start = time.time()
checkpoints = []

# 4. Loop over each variable
for i, col in enumerate(df_scaled.columns):
    print(f"[{i+1}/{len(df_scaled.columns)}] Running: {col}")
    series = df_scaled[col].values

    # Same number of samples as range(len(series) - seq_len - pred_len) would yield.
    n_samples = len(series) - seq_len - pred_len
    if n_samples <= 0:
        # Report the short series explicitly instead of relying on an indexing error later.
        print(f"❌ Skipped {col} due to error: series too short for seq_len={seq_len}, pred_len={pred_len}")
        continue

    # Create input/output samples as zero-copy sliding windows:
    # X[j] = series[j : j+seq_len], y[j] = series[j+seq_len : j+seq_len+pred_len]
    X = np.lib.stride_tricks.sliding_window_view(series, seq_len)[:n_samples]
    y = np.lib.stride_tricks.sliding_window_view(series[seq_len:], pred_len)[:n_samples]

    # Split into train/test sets (chronological, no shuffling)
    train_size = int(len(X) * train_ratio)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    try:
        start_time = time.time()

        # Train the model (1-step ahead): only the first value of each target window is used
        start_train = time.time()
        # Fixed seed so repeated runs produce the same trees and metrics.
        model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
        model.fit(X_train, y_train[:, 0])
        train_time = time.time() - start_train

        # Predict
        start_pred = time.time()
        y_pred = model.predict(X_test)
        predict_time = time.time() - start_pred

        y_true = y_test[:, 0]

        # Evaluation metrics
        mse = mean_squared_error(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        # Percentage errors divide by the true value; exact zeros are excluded so a
        # single zero cannot turn the average into inf/NaN. Results are unchanged
        # whenever y_true contains no zeros.
        nonzero = y_true != 0
        pct_err = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
        mape = np.mean(np.abs(pct_err)) * 100 if nonzero.any() else np.nan
        mspe = np.mean(np.square(pct_err)) * 100 if nonzero.any() else np.nan

        metrics.append({
            'Variable': col,
            'MSE': mse,
            'MAE': mae,
            'RMSE': rmse,
            'MAPE (%)': mape,
            'MSPE (%)': mspe,
            'Train Time (s)': train_time,
            'Predict Time (s)': predict_time
        })

        elapsed = time.time() - start_time
        checkpoints.append(elapsed)

        # Print per-variable time
        print(f"✅ Done: {col} in {elapsed:.2f} sec")

        # Checkpoint every 5 variables
        if (i + 1) % 5 == 0:
            print(f"🕒 Avg time per variable (last 5): {np.mean(checkpoints[-5:]):.2f} sec")

    except Exception as e:
        # Best effort: report the failing column and move on to the next one.
        print(f"❌ Skipped {col} due to error: {e}")

# End total timer
total_elapsed = time.time() - total_start

# 5. Show results
result_df = pd.DataFrame(metrics)
print("\n✅ Average performance across all variables:")
print(result_df.mean(numeric_only=True))

print(f"\n🕒 Total runtime: {total_elapsed:.2f} seconds")


[1/21] Running: p (mbar)
✅ Done: p (mbar) in 352.15 sec
[2/21] Running: T (degC)
✅ Done: T (degC) in 349.48 sec
[3/21] Running: Tpot (K)
✅ Done: Tpot (K) in 520.73 sec
[4/21] Running: Tdew (degC)
✅ Done: Tdew (degC) in 469.11 sec
[5/21] Running: rh (%)
✅ Done: rh (%) in 573.36 sec
🕒 Avg time per variable (last 5): 452.96 sec
[6/21] Running: VPmax (mbar)
✅ Done: VPmax (mbar) in 524.77 sec
[7/21] Running: VPact (mbar)
✅ Done: VPact (mbar) in 462.84 sec
[8/21] Running: VPdef (mbar)
✅ Done: VPdef (mbar) in 492.70 sec
[9/21] Running: sh (g/kg)
✅ Done: sh (g/kg) in 410.50 sec
[10/21] Running: H2OC (mmol/mol)
✅ Done: H2OC (mmol/mol) in 448.48 sec
🕒 Avg time per variable (last 5): 467.86 sec
[11/21] Running: rho (g/m**3)
✅ Done: rho (g/m**3) in 756.14 sec
[12/21] Running: wv (m/s)
✅ Done: wv (m/s) in 329.11 sec
[13/21] Running: max. wv (m/s)
✅ Done: max. wv (m/s) in 279.26 sec
[14/21] Running: wd (deg)
✅ Done: wd (deg) in 395.38 sec
[15/21] Running: rain (mm)
✅ Done: rain (mm) in 40.39 sec
🕒 A

## dataset:Exchange

In [30]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import time

# GradientBoostingRegressor baseline on the exchange-rate dataset.
# Every column is forecast independently: a model learns to map the previous
# `seq_len` z-scored values to the single next value (1-step ahead), and error
# metrics plus train/predict timings are collected per column.

# 1. Load and clean the data
df = pd.read_csv('../DSS5104_TeamWork/data/exchange_rate.csv')
df['date'] = pd.to_datetime(df['date'])
# Non-numeric cells are coerced to NaN, then gaps are filled forward and any
# leading NaNs are filled backward. `.ffill()`/`.bfill()` replace the deprecated
# `fillna(method=...)` spelling and give the same result.
df = (
    df.set_index('date')
      .apply(pd.to_numeric, errors='coerce')
      .ffill()
      .bfill()
)

# 2. Normalize with Z-score
# NOTE(review): the scaler is fit on the whole frame, so statistics of the test
# period also shape the scaling - confirm this is intended.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

# 3. Parameters
seq_len = 96        # number of past values fed to the model
pred_len = 720      # only limits how many windows are built; the model predicts 1 step
train_ratio = 0.77  # leading share of the windows used for training
metrics = []

# Start total timer
total_start = time.time()
checkpoints = []

# 4. Loop over each variable
for i, col in enumerate(df_scaled.columns):
    print(f"[{i+1}/{len(df_scaled.columns)}] Running: {col}")
    series = df_scaled[col].values

    # Same number of samples as range(len(series) - seq_len - pred_len) would yield.
    n_samples = len(series) - seq_len - pred_len
    if n_samples <= 0:
        # Report the short series explicitly instead of relying on an indexing error later.
        print(f"❌ Skipped {col} due to error: series too short for seq_len={seq_len}, pred_len={pred_len}")
        continue

    # Create input/output samples as zero-copy sliding windows:
    # X[j] = series[j : j+seq_len], y[j] = series[j+seq_len : j+seq_len+pred_len]
    X = np.lib.stride_tricks.sliding_window_view(series, seq_len)[:n_samples]
    y = np.lib.stride_tricks.sliding_window_view(series[seq_len:], pred_len)[:n_samples]

    # Split into train/test sets (chronological, no shuffling)
    train_size = int(len(X) * train_ratio)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    try:
        start_time = time.time()

        # Train the model (1-step ahead): only the first value of each target window is used
        start_train = time.time()
        # Fixed seed so repeated runs produce the same trees and metrics.
        model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
        model.fit(X_train, y_train[:, 0])
        train_time = time.time() - start_train

        # Predict
        start_pred = time.time()
        y_pred = model.predict(X_test)
        predict_time = time.time() - start_pred

        y_true = y_test[:, 0]

        # Evaluation metrics
        mse = mean_squared_error(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        # Percentage errors divide by the true value; exact zeros are excluded so a
        # single zero cannot turn the average into inf/NaN. Results are unchanged
        # whenever y_true contains no zeros.
        nonzero = y_true != 0
        pct_err = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
        mape = np.mean(np.abs(pct_err)) * 100 if nonzero.any() else np.nan
        mspe = np.mean(np.square(pct_err)) * 100 if nonzero.any() else np.nan

        metrics.append({
            'Variable': col,
            'MSE': mse,
            'MAE': mae,
            'RMSE': rmse,
            'MAPE (%)': mape,
            'MSPE (%)': mspe,
            'Train Time (s)': train_time,
            'Predict Time (s)': predict_time
        })

        elapsed = time.time() - start_time
        checkpoints.append(elapsed)

        # Print per-variable time
        print(f"✅ Done: {col} in {elapsed:.2f} sec")

        # Checkpoint every 5 variables
        if (i + 1) % 5 == 0:
            print(f"🕒 Avg time per variable (last 5): {np.mean(checkpoints[-5:]):.2f} sec")

    except Exception as e:
        # Best effort: report the failing column and move on to the next one.
        print(f"❌ Skipped {col} due to error: {e}")

# End total timer
total_elapsed = time.time() - total_start

# 5. Show results
result_df = pd.DataFrame(metrics)
print("\n✅ Average performance across all variables:")
print(result_df.mean(numeric_only=True))

print(f"\n🕒 Total runtime: {total_elapsed:.2f} seconds")


[1/8] Running: 0
✅ Done: 0 in 18.20 sec
[2/8] Running: 1
✅ Done: 1 in 18.43 sec
[3/8] Running: 2
✅ Done: 2 in 17.95 sec
[4/8] Running: 3
✅ Done: 3 in 18.09 sec
[5/8] Running: 4
✅ Done: 4 in 10.55 sec
🕒 Avg time per variable (last 5): 16.64 sec
[6/8] Running: 5
✅ Done: 5 in 15.72 sec
[7/8] Running: 6
✅ Done: 6 in 16.51 sec
[8/8] Running: OT
✅ Done: OT in 17.38 sec

✅ Average performance across all variables:
MSE                  0.152648
MAE                  0.253750
RMSE                 0.319821
MAPE (%)            22.788771
MSPE (%)            78.128174
Train Time (s)      16.599721
Predict Time (s)     0.001863
dtype: float64

🕒 Total runtime: 132.99 seconds
