In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import joblib
from sklearn.model_selection import TimeSeriesSplit
import optuna 
from sklearn.model_selection import cross_val_score


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
!pip install tabpfn-client





[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
def clean_column_names(df):
    """Return a copy of ``df`` whose column names are compatible with XGBoost.

    Square brackets and parentheses are removed, spaces become underscores
    and the lowercase German umlauts ö/ä/ü are transliterated to oe/ae/ue.
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the cleaned column labels.
    """
    # (old, new) pairs, applied in this order.
    replacements = (
        ('[', ''),
        (']', ''),
        (' ', '_'),
        ('(', ''),
        (')', ''),
        ('ö', 'oe'),  # Handle German special characters
        ('ä', 'ae'),
        ('ü', 'ue'),
    )

    df = df.copy()
    cleaned = df.columns
    for old, new in replacements:
        # regex=False is passed explicitly: '[', '(' and ')' are regex
        # metacharacters and the default of `regex` changed in pandas 2.0,
        # so relying on the default makes the result version-dependent.
        cleaned = cleaned.str.replace(old, new, regex=False)
    df.columns = cleaned
    return df

def create_lagged_features(df, target_col, lag_hours=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 24, 48, 72, 168, 336, 672, 1008, 2016, 8760]):
    """Return a copy of ``df`` extended with shifted versions of ``target_col``.

    For every entry ``lag`` in ``lag_hours`` a column named
    ``'<target_col>_lag_<lag>h'`` is added, holding ``target_col`` shifted
    down by ``lag`` rows. The shift is row-based, so a lag only corresponds
    to hours if the rows are hourly and already sorted by time.

    Afterwards every row that contains a NaN in *any* column is dropped.
    This removes the leading rows that have no lagged value, and also any
    row with a missing value elsewhere in the frame.
    """
    target = df[target_col]

    # One shifted series per requested lag, keyed by its final column name.
    lagged = {
        f'{target_col}_lag_{lag}h': target.shift(lag)
        for lag in lag_hours
    }

    # assign() works on a copy, so the caller's frame is left untouched.
    return df.assign(**lagged).dropna()


def preprocess_load_data(data, split_date='2023-09-30', max_train_rows=5_000):
    """Build scaled train/test feature frames for the load forecast.

    Steps, in order: parse and sort by ``Date``; add lagged copies of the
    load column; clean the column names; rename the load column to
    ``load``; split chronologically at ``split_date``; standardise the
    features with a ``StandardScaler`` fitted on the training rows only.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table. Must contain ``Date``, the column
        ``'Gesamt (Netzlast) [MWh] Berechnete Auflösungen'`` and every
        column listed in ``base_features`` below (after name cleaning).
    split_date : str or datetime-like, default '2023-09-30'
        Rows with ``Date < split_date`` go to training, the rest to test.
    max_train_rows : int or None, default 5_000
        Keep only the most recent ``max_train_rows`` training rows.
        ``None`` keeps the full training history.

    Returns
    -------
    X_train_scaled, X_test_scaled : pandas.DataFrame
        Standardised features. They carry the same index as ``y_train`` /
        ``y_test`` so that features and targets stay label-aligned.
    y_train, y_test : pandas.Series
        The ``load`` target for the two periods.
    feature_columns : list of str
        Names of the feature columns, in column order.
    """
    df = data.copy()

    # Convert Date column to datetime and sort
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values('Date')

    # Create lagged features before cleaning column names
    # (the helper is called with the original, uncleaned target name).
    df = create_lagged_features(df, target_col='Gesamt (Netzlast) [MWh] Berechnete Auflösungen')

    # Clean column names
    df = clean_column_names(df)

    # Rename target column
    df = df.rename(columns={'Gesamt_Netzlast_MWh_Berechnete_Aufloesungen': 'load'})

    # Split data based on date
    train_data = df[df['Date'] < split_date]
    if max_train_rows is not None:
        # Keep only the most recent rows of the training period.
        train_data = train_data.tail(max_train_rows)
    test_data = df[df['Date'] >= split_date]

    # Define feature columns
    base_features = [
        'hour', 'dayofyear_cos', 'dayofweek', 'dayofweek_sin',
        'is_workday', 'hour_cos', 'date_offset', 'dayofyear',
        'Kernenergie_MWh_Berechnete_Aufloesungen',
        'Steinkohle_MWh_Berechnete_Aufloesungen',
        'Holiday_Not_a_Holiday', 'hour_sin',
        'Wind_Onshore_MWh_Berechnete_Aufloesungen'
    ]
    # Any column whose name contains 'lag' is treated as a lag feature.
    lag_features = [col for col in df.columns if 'lag' in col]
    feature_columns = base_features + lag_features

    # Extract features and target
    X_train = train_data[feature_columns]
    y_train = train_data['load']
    X_test = test_data[feature_columns]
    y_test = test_data['load']

    # Scale features; the scaler sees training rows only, so no test-period
    # statistics leak into the transformation.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert back to DataFrame, preserving the original row labels so the
    # scaled features stay aligned with y_train / y_test.
    X_train_scaled = pd.DataFrame(X_train_scaled, columns=feature_columns, index=X_train.index)
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=feature_columns, index=X_test.index)

    print(f"\nTraining data shape: {X_train_scaled.shape}")
    print(f"Testing data shape: {X_test_scaled.shape}")
    print(f"Training period: {train_data['Date'].min()} to {train_data['Date'].max()}")
    print(f"Testing period: {test_data['Date'].min()} to {test_data['Date'].max()}")

    return X_train_scaled, X_test_scaled, y_train, y_test, feature_columns


In [14]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the feature table; the path is relative to the notebook's working directory.
data = pd.read_csv('../Data/selected_features.csv')

# Build the scaled train/test features, the targets and the feature-name list
# used by the cells below (the function's default split date applies).
X_train_scaled, X_test_scaled, y_train, y_test, feature_columns = preprocess_load_data(data)



Training data shape: (5000, 34)
Testing data shape: (9541, 34)
Training period: 2023-03-05 15:00:00 to 2023-09-29 23:00:00
Testing period: 2023-09-30 00:00:00 to 2024-10-30 23:00:00


In [15]:
from tabpfn_client import init, TabPFNClassifier, TabPFNRegressor

# Initialize the TabPFN Regressor with its default settings
# NOTE(review): tabpfn_client looks like a client for a hosted service —
# confirm whether the data passed to fit/predict leaves this machine.
model = TabPFNRegressor()

# Fit the model on the training data
model.fit(X_train_scaled, y_train)

# Make predictions on the test data
# (one call over the whole test period; the model is fitted once above and
# is not refitted as the test period advances)
y_pred = model.predict(X_test_scaled)


Processing: 100%|██████████| [00:35<00:00]


In [18]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# Calculate MAE and MAPE on the test period
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MAPE: {mape}")  # printed as returned by sklearn: a fraction, NOT multiplied by 100

MAE: 498.971672829761
MAPE: 0.009259450696184304
