In [1]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from datetime import datetime, timedelta
import xgboost as xgb
import plotly.express as px
from pathlib import Path
import pandas as pd
import numpy as np
import holidays
import warnings

warnings.filterwarnings('ignore')

In [2]:
def upload_data():
    data_path = str(Path.cwd().parent) + "\\Data\\EPC\\Power Consumption Data.csv"
    
    df = pd.read_csv(data_path)
    
    df = df[df["real_consumption"] > 0]
    df = df[df['real_consumption'] <= df['real_consumption'].mean() + 4 * df['real_consumption'].std()]
    
    df['time'] = pd.to_datetime(df['time'])
    df = df.sort_values(by='time',ascending=True)
    
    return df

def data_metrics(data, real, predicted):

    y_true = data[real]
    y_pred = data[predicted]

    # Calculate metrics
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)


    # MAE (Mean Absolute Error):
    # Lower values are better; good MAE depends on the scale of 'real_consumption'.
    # As a rule of thumb, MAE should be significantly smaller than the mean of the target variable.
    # Lower is better. Ideally, MAE should be much less than the average value of y_true.
    print(f"MAE: {mae:.4f}")

    # MSE (Mean Squared Error):
    # Similar to MAE but penalizes large errors more heavily. A smaller MSE is better.
    # Compare MSE to the variance of 'real_consumption' for context.
    # Lower is better. MSE should ideally be close to zero relative to the variance of y_true.
    print(f"MSE: {mse:.4f}")



    # RMSE (Root Mean Squared Error):
    # RMSE is the square root of MSE and is in the same units as 'real_consumption'.
    # A good RMSE is often close to the standard deviation of 'real_consumption'.
    # Lower is better. RMSE should be comparable to or less than the standard deviation of y_true."
    print(f"RMSE: {rmse:.4f}")



    # R² (Coefficient of Determination):
    # R² measures how well the predictions explain the variability of the data.
    # Values close to 1.0 are excellent, indicating the model explains most of the variance.
    # Negative values indicate poor fit.
    # Closer to 1.0 is better. Values > 0.7 are generally good; < 0.5 indicates underfitting.
    print(f"R²: {r2:.4f}")

def feature_engineering(data):
    """
    Function to generate time-based, statistical, and external features
    for power consumption forecasting with 3-minute intervals.
    """

    # Ensure 'time' is in datetime format
    data['time'] = pd.to_datetime(data['time'])

    # Basic time-based features
    data['hour'] = data['time'].dt.hour
    data['minute'] = data['time'].dt.minute
    data['day_of_week'] = data['time'].dt.dayofweek
    data['is_weekend'] = (data['day_of_week'] >= 5).astype(int)
    data['day_of_month'] = data['time'].dt.day
    data['week_of_year'] = data['time'].dt.isocalendar().week.astype(int)
    data['month'] = data['time'].dt.month
    data['quarter'] = data['time'].dt.quarter
    data['year'] = data['time'].dt.year

    # Business-related time features
    data['is_business_hour'] = ((data['hour'] >= 9) & (data['hour'] <= 18)).astype(int)
    data['is_workday'] = ((data['is_weekend'] == 0) & (data['year'] >= 2020)).astype(int)

    # Cyclical time features
    data['hour_sin'] = np.sin(2 * np.pi * data['hour'] / 24)
    data['hour_cos'] = np.cos(2 * np.pi * data['hour'] / 24)
    data['day_of_week_sin'] = np.sin(2 * np.pi * data['day_of_week'] / 7)
    data['day_of_week_cos'] = np.cos(2 * np.pi * data['day_of_week'] / 7)
    data['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)
    data['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)
    data['week_of_year_sin'] = np.sin(2 * np.pi * data['week_of_year'] / 52)
    data['week_of_year_cos'] = np.cos(2 * np.pi * data['week_of_year'] / 52)

    # Time intervals (3-minute intervals per hour = 20, per day = 480)
    steps_per_hour = 20
    steps_per_day = 24 * steps_per_hour

    # Lag features (for past 4 time steps at 3-minute intervals)
    for lag in [1, 2, 3, 4, 6, 12, steps_per_hour, steps_per_hour * 2]:  # 3 min, 9 min, 18 min, 36 min, 1 hour, 2 hours
        data[f'lag_{lag}'] = data['real_consumption'].shift(lag)

    # Exponential moving averages
    for span in [3,5,steps_per_hour, steps_per_hour * 3]:  # 1-hour and 3-hour EMA
        data[f'ema_{span}'] = data['real_consumption'].ewm(span=span, adjust=False).mean()


    # Rolling statistical features
    data['rolling_std_1h'] = data['real_consumption'].rolling(window=steps_per_hour, min_periods=1).std()
    data['rolling_mean_3h'] = data['real_consumption'].rolling(window=steps_per_hour * 3, min_periods=1).mean()
    data['rolling_mean_6h'] = data['real_consumption'].rolling(window=steps_per_hour * 6, min_periods=1).mean()
    data['rolling_std_6h'] = data['real_consumption'].rolling(window=steps_per_hour * 6, min_periods=1).std()

    # Weekly & Monthly moving averages
    data['weekly_avg'] = data['real_consumption'].rolling(window=steps_per_day * 7, min_periods=1).mean()
    data['monthly_avg'] = data['real_consumption'].rolling(window=steps_per_day * 30, min_periods=1).mean()

    # Percentage change in real consumption
    data['pct_change'] = data['real_consumption'].pct_change()

    # Demand Features
    data['abs_diff'] = data['real_consumption'].diff().abs()
    data['consumption_trend'] = np.sign(data['real_consumption'].diff())
    data['diff_3min'] = data['real_consumption'].diff(1)
    data['diff_1h'] = data['real_consumption'].diff(steps_per_hour)
    data['diff_24h'] = data['real_consumption'].diff(steps_per_day)

    # Previous day's total consumption (shifted by 1 day)
    data['prev_day_consumption'] = data['real_consumption'].shift(steps_per_day)

    # Weekend vs Weekday Consumption Averages
    data['weekend_vs_weekday_consumption'] = data.groupby(['is_weekend'])['real_consumption'].transform(lambda x: x.mean())

    # Holiday Features (for Georgia)
    georgia_holidays = holidays.Georgia(years=range(data["year"].min(), data["year"].max() + 1))
    data["date"] = data["time"].dt.date
    data['is_holiday'] = data["date"].map(lambda x: 1 if x in georgia_holidays else 0)
    data['is_day_before_holiday'] = data["date"].map(lambda x: 1 if (x - pd.Timedelta(days=1)) in georgia_holidays else 0)
    data['is_day_after_holiday'] = data["date"].map(lambda x: 1 if (x + pd.Timedelta(days=1)) in georgia_holidays else 0)

    # Calendar-based Features
    data['is_end_of_month'] = data['day_of_month'].apply(lambda x: 1 if x >= 28 else 0)
    data['is_payday'] = data['day_of_month'].apply(lambda x: 1 if x in [1, 15] else 0)

    return data



In [3]:
df = upload_data()

data_metrics(data=df, real="real_consumption", predicted="predicted_consumption")

df = feature_engineering(df).dropna()


MAE: 56.7256
MSE: 5824.3342
RMSE: 76.3173
R²: 0.9176


In [None]:
X = df.drop(columns=[ 'time', 'real_consumption', 'predicted_consumption','date',"ema_3","ema_20","ema_60",
                      'ema_5','lag_1','lag_2','lag_3','lag_4','lag_6','lag_12','lag_20','year'])  # Features
y = df['real_consumption']  # Target variable


# Train an XGBoost model (assuming you have train_X and train_y)
model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, random_state=42)
model.fit(X, y)

# Extract feature importance as a dictionary
importance_dict = model.get_booster().get_score(importance_type="gain")

# Convert to DataFrame
feature_importance_df = pd.DataFrame(importance_dict.items(), columns=["Feature", "Importance"])
feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False).reset_index(drop=True)

# Get feature importance
xgb.plot_importance(model, importance_type="gain", max_num_features=15)

feature_importance_df

In [4]:
last_5_months_start = df['time'].max() - pd.DateOffset(months=5)
test_mask = df['time'] >= last_5_months_start

last_5_months_df = df[test_mask]
last_5_months_df = last_5_months_df[1:]
df = df[~test_mask]



In [5]:
df = df[["time","real_consumption","predicted_consumption","lag_1","ema_3","prev_day_consumption",
         "rolling_mean_3h","weekly_avg","month_sin","monthly_avg","hour","rolling_std_1h",
         'month',"pct_change"
         ]]

# Step 2: Separate features (X) and target variable (y)
X = df.drop(columns=[ 'time', 'real_consumption', 'predicted_consumption'])  # Features
y = df['real_consumption']  # Target variable


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75, random_state=42,shuffle=True)

In [6]:
# Initialize the model
model = xgb.XGBRegressor(n_jobs=-1, random_state=42, n_estimators=100, max_depth=6, learning_rate=0.1,subsample=0.8,colsample_bytree=1.0,min_child_weight=5,gamma=0,reg_alpha=0.1,reg_lambda=10)
# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluate the model
train_rmse_xgb = mean_squared_error(y_train, y_pred_train)
test_rmse_xgb = mean_squared_error(y_test, y_pred_test)
train_r2_xgb = r2_score(y_train, y_pred_train)
test_r2_xgb = r2_score(y_test, y_pred_test)

print(f"XGBoost - Train RMSE: {train_rmse_xgb:.2f}, Test RMSE: {test_rmse_xgb:.2f}")
print(f"XGBoost - Train R²: {train_r2_xgb:.2f}, Test R²: {test_r2_xgb:.2f}")


XGBoost - Train RMSE: 19.67, Test RMSE: 22.30
XGBoost - Train R²: 1.00, Test R²: 1.00


In [13]:
result = []
for k in range(0,25):
    hour = 2
    temp_df = last_5_months_df[(k*20*hour):((k+1)*20*hour)+460].head(480)[["time","real_consumption"]]

    for i in range(1,20*hour):
        
        extra_rows = []
    
        start_time = temp_df["time"].iloc[-1]
    
        extra_rows.append((start_time + timedelta(minutes=3), None))
    
        # Create a DataFrame for the extra rows
        extra_df = pd.DataFrame(extra_rows, columns=["time", "real_consumption"])

        # Combine the original and extra DataFrames
        temp_df = pd.concat([temp_df, extra_df], ignore_index=True)
        
        
        
        
        
        
        to_predict = feature_engineering(temp_df.copy())[["time","real_consumption","lag_1","ema_3",
                                                          "prev_day_consumption","rolling_mean_3h","weekly_avg",
                                                          "month_sin","monthly_avg","hour","rolling_std_1h",
                                                          'month',"pct_change"
                                                          ]].tail(1)
        to_predict_1 = to_predict.drop(columns=['real_consumption','time'])  # Features
        prediction = model.predict(to_predict_1)
        
        results = pd.DataFrame({
            'time' : to_predict["time"],
            'predicted_consumption': prediction
        })
        
        
        temp_df.loc[temp_df['time'] == results["time"].tail(1).values[0], ['real_consumption']] = [results["predicted_consumption"].tail(1).values[0]]

    result.append(temp_df.tail(20))
    

In [14]:
reuslts_df = pd.concat(result)
reuslts_df = reuslts_df.rename(columns={"real_consumption":"predicted_consumption_new"}).merge(last_5_months_df[["time","real_consumption","predicted_consumption"]], on="time", how="left").dropna()

In [15]:
# Create an interactive plot using Plotly
fig = px.line(reuslts_df, x='time', y=['real_consumption', 'predicted_consumption','predicted_consumption_new'], title="Interactive Time Series Plot")

# Customize the plot
fig.update_layout(
    xaxis_title="Timestamp",
    yaxis_title="Values",
    legend_title="Legend",
    xaxis_rangeslider_visible=True  # Enables the date slicer
)

# Show the plot
fig.show()

In [16]:
data_metrics(data=reuslts_df, real="real_consumption", predicted="predicted_consumption_new")

data_metrics(data=reuslts_df, real="real_consumption", predicted="predicted_consumption")

MAE: 97.1154
MSE: 17117.6153
RMSE: 130.8343
R²: 0.7437
MAE: 62.0446
MSE: 5202.0460
RMSE: 72.1252
R²: 0.9221


In [None]:
0.9768