In [12]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error

# Showing values upto 2 decimal places
pd.set_option('display.float_format', '{:.2f}'.format)

path = '/Users/umeshnagar/Downloads/Assessment 2/'
# Load datasets
train_df = pd.read_csv(path + "train.csv", parse_dates=["timestamp"])
test_df = pd.read_csv(path + "test.csv", parse_dates=["timestamp"])


In [13]:
train_df.head()

Unnamed: 0,timestamp,energy_output,irradiance_global_index,irradiance_global_reference,irradiance_horizontal,module_temperature_1,module_temperature_2,module_temperature_3,wind_direction,relative_humidity,...,incident_radiation_1,incident_radiation_2,incident_radiation_4,incident_radiation_3,reflected_radiation_1,reflected_radiation_2,reflected_radiation_4,reflected_radiation_3,ambient_temperature,wind_speed
0,2024-10-01 00:00:00,0.0,0.0,0.0,0.0,22.14,21.77,21.89,166.49,100.0,...,0.0,,0.0,,0.0,,0.0,,23.45,0.0
1,2024-10-01 01:00:00,0.0,0.0,0.0,0.0,21.9,21.49,21.65,257.27,100.0,...,0.0,,0.0,,0.0,,0.0,,23.22,0.0
2,2024-10-01 02:00:00,0.0,0.0,0.0,0.0,22.54,22.29,22.32,212.57,100.0,...,0.0,,0.0,,0.0,,0.0,,23.3,0.0
3,2024-10-01 03:00:00,0.0,0.0,0.0,0.0,22.69,22.51,22.48,157.93,100.0,...,0.0,,0.0,,0.0,,0.0,,23.35,1.83
4,2024-10-01 04:00:00,0.0,0.0,0.0,0.0,23.16,22.86,22.86,142.27,100.0,...,0.0,,0.0,,0.0,,0.0,,23.51,0.0


In [14]:
def add_cyclic_feature(df):
    df['hour'] = df['timestamp'].dt.hour  

    # Apply sine and cosine transformations
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    
    return df

train_df = add_cyclic_feature(train_df)
test_df = add_cyclic_feature(test_df)

In [15]:
train_df.head()

Unnamed: 0,timestamp,energy_output,irradiance_global_index,irradiance_global_reference,irradiance_horizontal,module_temperature_1,module_temperature_2,module_temperature_3,wind_direction,relative_humidity,...,incident_radiation_3,reflected_radiation_1,reflected_radiation_2,reflected_radiation_4,reflected_radiation_3,ambient_temperature,wind_speed,hour,hour_sin,hour_cos
0,2024-10-01 00:00:00,0.0,0.0,0.0,0.0,22.14,21.77,21.89,166.49,100.0,...,,0.0,,0.0,,23.45,0.0,0,0.0,1.0
1,2024-10-01 01:00:00,0.0,0.0,0.0,0.0,21.9,21.49,21.65,257.27,100.0,...,,0.0,,0.0,,23.22,0.0,1,0.26,0.97
2,2024-10-01 02:00:00,0.0,0.0,0.0,0.0,22.54,22.29,22.32,212.57,100.0,...,,0.0,,0.0,,23.3,0.0,2,0.5,0.87
3,2024-10-01 03:00:00,0.0,0.0,0.0,0.0,22.69,22.51,22.48,157.93,100.0,...,,0.0,,0.0,,23.35,1.83,3,0.71,0.71
4,2024-10-01 04:00:00,0.0,0.0,0.0,0.0,23.16,22.86,22.86,142.27,100.0,...,,0.0,,0.0,,23.51,0.0,4,0.87,0.5


In [16]:
def add_lag_features(df):
    # Creating Target Variables (1 to 6-hour ahead)
    for h in range(1, 7):  
        df[f'target_{h}'] = df["energy_output"].shift(h)

    # Drop NaN values caused by shifting
    df.dropna(inplace=True)
    df.drop('hour',axis = 1,inplace=True)
    return df

train_df = add_lag_features(train_df)

In [17]:
train_df.head()

Unnamed: 0,timestamp,energy_output,irradiance_global_index,irradiance_global_reference,irradiance_horizontal,module_temperature_1,module_temperature_2,module_temperature_3,wind_direction,relative_humidity,...,ambient_temperature,wind_speed,hour_sin,hour_cos,target_1,target_2,target_3,target_4,target_5,target_6
830,2024-11-04 16:00:00,11244.0,162.46,20.71,134.27,34.58,34.46,34.1,132.23,82.83,...,28.89,2.5,-0.87,-0.5,51366.0,62059.0,54025.0,47719.0,35348.0,20918.0
870,2024-11-06 08:00:00,48033.0,677.28,43.6,503.01,48.11,43.22,41.25,181.39,95.22,...,27.65,3.57,0.87,-0.5,37260.0,6825.0,51.0,0.0,0.0,0.0
877,2024-11-06 15:00:00,6675.0,100.83,11.78,82.6,29.84,29.79,29.76,162.54,86.73,...,27.92,3.25,-0.71,-0.71,53954.0,38172.0,54366.0,58593.0,55180.0,54721.0
878,2024-11-06 16:00:00,12553.0,171.66,12.24,106.15,31.69,30.48,30.95,221.32,97.76,...,27.44,2.18,-0.87,-0.5,6675.0,53954.0,38172.0,54366.0,58593.0,55180.0
1015,2024-11-12 09:00:00,38760.0,452.8,35.92,399.04,39.66,38.28,38.25,120.79,91.37,...,26.99,3.2,0.71,-0.71,60171.0,18852.0,5193.0,61.0,0.0,0.0


In [18]:
def prepare_input(train_df, test_df):
    train_df = train_df.fillna(train_df.mean())
    test_df = test_df.fillna(test_df.mean())

    
    # Select Features (Exclude target variables)
    features = [col for col in train_df.columns if col not in ["timestamp", "energy_output"] + [f'target_{h}' for h in range(1, 7)]]

    # Prepare input variables
    X_train = train_df[features]
    X_test_eval = test_df[features]  # Use test set only for evaluation
    
     # Standardize features (not mandatory for xgboost)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test_eval = scaler.transform(X_test_eval)
    
    return X_train, X_test_eval

X_train,  X_test_eval = prepare_input(train_df, test_df)
    
    

In [20]:
# Dictionary to store models & predictions
def prepar_output(df):
    models = {}
    predictions = pd.DataFrame({"timestamp": df["timestamp"], "month": df["timestamp"].dt.strftime("%Y-%m")})
    return predictions, models

predictions, models = prepar_output(test_df)

In [21]:
predictions.head()

Unnamed: 0,timestamp,month
0,2025-01-01 00:00:00,2025-01
1,2025-01-01 01:00:00,2025-01
2,2025-01-01 02:00:00,2025-01
3,2025-01-01 03:00:00,2025-01
4,2025-01-01 04:00:00,2025-01


In [22]:
def training_model(train_df, test_df):
    # Train separate models for each hour ahead
    for h in range(1, 7):
        y_train = train_df[f'target_{h}']

        # Train XGBoost model for h-hour ahead forecast
        model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
        model.fit(X_train, y_train)

        # Store model
        models[h] = model

        # Predict for test set
        predictions[f"{h}_hour_ahead"] = model.predict(X_test_eval)

#     predictions['target'] = np.where(test_df['energy_output'] == 0, 1, test_df['energy_output'])

    test_df["energy_output"] = np.where(test_df['energy_output'] == 0, 1, test_df['energy_output'])
    return predictions, test_df

predictions, test_df = training_model(train_df, test_df)

In [23]:
predictions.head()

Unnamed: 0,timestamp,month,1_hour_ahead,2_hour_ahead,3_hour_ahead,4_hour_ahead,5_hour_ahead,6_hour_ahead
0,2025-01-01 00:00:00,2025-01,13031.87,29131.57,43039.71,43026.2,5632.76,8742.29
1,2025-01-01 01:00:00,2025-01,13031.87,29131.57,43035.98,39517.9,5632.76,8742.29
2,2025-01-01 02:00:00,2025-01,12311.6,29977.55,42804.07,39517.9,15556.57,10523.07
3,2025-01-01 03:00:00,2025-01,12311.6,29977.55,42804.07,39517.9,15556.57,10523.07
4,2025-01-01 04:00:00,2025-01,12311.6,29977.55,42804.07,39517.9,15556.57,10523.07


In [24]:
def get_hourly_MAPE(test_df, predictions):
    # Compute MAPE for each forecast horizon
    for h in range(1, 7):
        predictions[f"MAPE_{h}_hour_ahead"] = mean_absolute_percentage_error(test_df["energy_output"].fillna(1), predictions[f"{h}_hour_ahead"])
    return predictions
predictions = get_hourly_MAPE(test_df, predictions)


In [25]:
def get_monthly_mape(predictions):
    # Calculate Monthly Average MAPE (expanded format)
    monthly_mape = predictions.groupby("month")[[f"MAPE_{h}_hour_ahead" for h in range(1, 7)]].mean()
    return monthly_mape

monthly_mape = get_monthly_mape(predictions)


In [26]:
monthly_mape.head()

Unnamed: 0_level_0,MAPE_1_hour_ahead,MAPE_2_hour_ahead,MAPE_3_hour_ahead,MAPE_4_hour_ahead,MAPE_5_hour_ahead,MAPE_6_hour_ahead
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-01,17304.76,15057.56,21797.27,20763.83,5128.02,4741.3
2025-02,17304.76,15057.56,21797.27,20763.83,5128.02,4741.3


In [27]:
# Save results
predictions.to_csv("multi_step_forecast.csv", index=False)
monthly_mape.to_csv("monthly_mape_results.csv", index=True)


In [28]:
monthly_mape.head()

Unnamed: 0_level_0,MAPE_1_hour_ahead,MAPE_2_hour_ahead,MAPE_3_hour_ahead,MAPE_4_hour_ahead,MAPE_5_hour_ahead,MAPE_6_hour_ahead
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-01,17304.76,15057.56,21797.27,20763.83,5128.02,4741.3
2025-02,17304.76,15057.56,21797.27,20763.83,5128.02,4741.3


In [29]:
predictions.head()

Unnamed: 0,timestamp,month,1_hour_ahead,2_hour_ahead,3_hour_ahead,4_hour_ahead,5_hour_ahead,6_hour_ahead,MAPE_1_hour_ahead,MAPE_2_hour_ahead,MAPE_3_hour_ahead,MAPE_4_hour_ahead,MAPE_5_hour_ahead,MAPE_6_hour_ahead
0,2025-01-01 00:00:00,2025-01,13031.87,29131.57,43039.71,43026.2,5632.76,8742.29,17304.76,15057.56,21797.27,20763.83,5128.02,4741.3
1,2025-01-01 01:00:00,2025-01,13031.87,29131.57,43035.98,39517.9,5632.76,8742.29,17304.76,15057.56,21797.27,20763.83,5128.02,4741.3
2,2025-01-01 02:00:00,2025-01,12311.6,29977.55,42804.07,39517.9,15556.57,10523.07,17304.76,15057.56,21797.27,20763.83,5128.02,4741.3
3,2025-01-01 03:00:00,2025-01,12311.6,29977.55,42804.07,39517.9,15556.57,10523.07,17304.76,15057.56,21797.27,20763.83,5128.02,4741.3
4,2025-01-01 04:00:00,2025-01,12311.6,29977.55,42804.07,39517.9,15556.57,10523.07,17304.76,15057.56,21797.27,20763.83,5128.02,4741.3
