In [10]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
from autosklearn.regression import AutoSklearnRegressor
import warnings
warnings.filterwarnings("ignore")
from autosklearn.metrics import mean_squared_error


In [None]:

script_dir = os.getcwd()  

data_folder = os.path.join(script_dir, "..", "data")       # ../data
results_folder = os.path.join(script_dir, "..", "results") # ../results

os.makedirs(results_folder, exist_ok=True)

# check data folder exists
if not os.path.exists(data_folder):
    raise FileNotFoundError(f"Data folder not found: {data_folder}")


In [None]:
# find 12 csv files
csv_files = [f for f in os.listdir(data_folder) if f.startswith("PRSA_Data_") and f.endswith(".csv")]
print(f"Found {len(csv_files)} stations\n")

Found 12 stations



In [5]:
def create_supervised(df, seq_len=7, horizon=1):
    """
    Converts a series into supervised learning X/y pairs.
    """
    X, y = [], []
    for i in range(len(df) - seq_len - horizon + 1):
        X.append(df.iloc[i:i+seq_len].values.flatten())
        y.append(df.iloc[i+seq_len:i+seq_len+horizon].values.flatten())
    return pd.DataFrame(X), pd.DataFrame(y)

In [13]:
def train_rso(X_train, y_train, time_budget=180, n_jobs=2, seed=42):
    """
    Train an Auto-Sklearn regression model.
    """
    model = AutoSklearnRegressor(
        time_left_for_this_task=time_budget,
        per_run_time_limit=30,
        n_jobs=n_jobs,
        seed=seed,
        metric=mean_squared_error
    )
    model.fit(X_train, y_train.values.ravel())  # flatten y
    return model

In [None]:
# pick one file of 12
file = csv_files[0]  # changable index
station = file.split("_")[2].split("_2013")[0]
print(f"Processing station: {station}")


Processing station: Aotizhongxin


In [18]:
df = pd.read_csv(os.path.join(data_folder, file))
df['date'] = pd.to_datetime(df[['year','month','day','hour']])
df = df.set_index('date')
series = df['PM2.5'].resample('D').mean().ffill().bfill()

In [None]:
# train/test split
train_series = series[:-365]
test_series  = series[-365:]

# supervised sequences
seq_len = 3  # smaller for memory
X_train, y_train = create_supervised(train_series.to_frame('PM2.5'), seq_len=seq_len)
X_test, y_test   = create_supervised(pd.concat([train_series[-seq_len:], test_series]).to_frame('PM2.5'), seq_len=seq_len)

In [None]:
# train
model = train_rso(X_train, y_train)  

[ERROR] [2025-11-26 21:09:25,837:Client-AutoML(42):cd3f8f85-cb03-11f0-8556-4cd71793f92b] (" Dummy prediction failed with run state StatusType.MEMOUT and additional output: {'error': 'Memout (used more than 3072 MB).', 'configuration_origin': 'DUMMY'}.",)
[ERROR] [2025-11-26 21:09:25,837:Client-AutoML(42):cd3f8f85-cb03-11f0-8556-4cd71793f92b] (" Dummy prediction failed with run state StatusType.MEMOUT and additional output: {'error': 'Memout (used more than 3072 MB).', 'configuration_origin': 'DUMMY'}.",)
Traceback (most recent call last):
  File "/vol/home/s3617238/miniconda3/envs/autosklearn_env/lib/python3.9/site-packages/autosklearn/automl.py", line 765, in fit
    self._do_dummy_prediction()
  File "/vol/home/s3617238/miniconda3/envs/autosklearn_env/lib/python3.9/site-packages/autosklearn/automl.py", line 489, in _do_dummy_prediction
    raise ValueError(msg)
ValueError: (" Dummy prediction failed with run state StatusType.MEMOUT and additional output: {'error': 'Memout (used more 

ValueError: (" Dummy prediction failed with run state StatusType.MEMOUT and additional output: {'error': 'Memout (used more than 3072 MB).', 'configuration_origin': 'DUMMY'}.",)

In [None]:
forecast = model.predict(X_test)
forecast = pd.Series(forecast[-len(test_series):], index=test_series.index)

In [None]:
# metrics
mse = mean_squared_error(test_series, forecast)
rmse = np.sqrt(mse)
mae = mean_absolute_error(test_series, forecast)
print(f"{station} -> RMSE: {rmse:.1f}, MAE: {mae:.1f}")

In [None]:
# plot
import matplotlib.pyplot as plt
plt.figure(figsize=(12,5))
plt.plot(train_series[-730:], label="Train (last 2 yrs)", color="gray")
plt.plot(test_series, label="Actual", color="blue")
plt.plot(forecast, label="RSO Forecast", color="red")
plt.legend()
plt.show()