In [1]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from datetime import datetime, timedelta
from xgboost import XGBRegressor
import plotly.express as px
from pathlib import Path
import pandas as pd
import numpy as np
import holidays
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the
# libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
def upload_data(data_path=None):
    """Load the power-consumption CSV and return the cleaned rows in time order.

    Parameters
    ----------
    data_path : str or path-like, optional
        CSV file to read. When omitted, ``Data/EPC/Power Consumption Data.csv``
        under the parent of the current working directory is used.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``real_consumption`` is strictly positive and no larger than
        mean + 4 * std (both computed after the positivity filter), with ``time``
        parsed to datetime and sorted ascending. The original row index is kept.
    """
    if data_path is None:
        # Join path components with Path so the location resolves on any OS,
        # not only where "\\" is the separator.
        data_path = Path.cwd().parent / "Data" / "EPC" / "Power Consumption Data.csv"

    df = pd.read_csv(data_path)

    # Drop non-positive readings first so they do not distort the outlier bound
    df = df[df["real_consumption"] > 0]
    upper_bound = df["real_consumption"].mean() + 4 * df["real_consumption"].std()
    # .copy() gives an independent frame, so the column assignment below does not
    # write into a slice of the frame returned by read_csv
    df = df[df["real_consumption"] <= upper_bound].copy()

    df["time"] = pd.to_datetime(df["time"])
    return df.sort_values(by="time", ascending=True)

def data_metrics(data, real, predicted):
    """Print MAE, MSE, RMSE and R² for one prediction column against the actuals.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    real : str
        Name of the column with the observed values.
    predicted : str
        Name of the column with the predicted values.

    Returns
    -------
    None
        The four metrics are printed, each with four decimals.
    """
    actual, forecast = data[real], data[predicted]

    # All four scores are computed before anything is printed, so a failure in
    # any of them produces no partial output.
    mae = mean_absolute_error(actual, forecast)
    mse = mean_squared_error(actual, forecast)
    rmse = np.sqrt(mse)
    r2 = r2_score(actual, forecast)

    # MAE: average absolute error, in the units of the target. Lower is better;
    # judge it relative to the typical magnitude of the target.
    print(f"MAE: {mae:.4f}")

    # MSE: average squared error, so large misses are penalised more heavily.
    # Lower is better; the variance of the target is the natural yardstick.
    print(f"MSE: {mse:.4f}")

    # RMSE: square root of MSE, back in the units of the target. Lower is better;
    # an RMSE close to the target's standard deviation means the predictions are
    # barely better than always predicting the mean.
    print(f"RMSE: {rmse:.4f}")

    # R²: share of the target's variance explained by the predictions. 1.0 is a
    # perfect fit, 0 matches predicting the mean, negative values are worse than that.
    print(f"R²: {r2:.4f}")

def feature_engineering(data, samples_per_hour=20, holiday_calendar=None):
    """Add calendar, lag, smoothing, cyclic and holiday features to ``data``.

    The frame is modified in place (columns are added) and is also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a datetime ``time`` column and a ``real_consumption`` column.
        Rows are assumed to be in chronological order, because lags and rolling
        windows are positional.
    samples_per_hour : int, optional
        Number of rows per hour, used to size the weekly / monthly rolling
        windows. The default of 20 reproduces the original fixed windows
        (7*24*20 and 30*24*20 rows).
    holiday_calendar : container of datetime.date, optional
        Anything supporting ``date in holiday_calendar``. Defaults to
        ``holidays.Georgia`` for the years present in ``data``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the feature columns added.

    Notes
    -----
    NOTE(review): ``ema_*``, ``weekly_avg``, ``monthly_avg`` and ``pct_change``
    are all computed from windows that include the current row's
    ``real_consumption``. If that column is also the prediction target, these
    features carry information about the target itself.
    """
    when = data['time'].dt

    # Basic calendar features
    data['hour'] = when.hour
    data['minute'] = when.minute
    data['day_of_week'] = when.dayofweek  # 0=Monday, 6=Sunday
    data['is_weekend'] = (data['day_of_week'] >= 5).astype('int64')
    data['day_of_month'] = when.day
    data['week_of_year'] = when.isocalendar().week
    data['month'] = when.month
    data['quarter'] = when.quarter
    data['year'] = when.year

    # Lag features: the previous 1..4 rows of the target
    for lag in range(1, 5):
        data[f'lag_{lag}'] = data['real_consumption'].shift(lag)

    # Exponential moving averages with spans 3 and 5
    for span in [3, 5]:
        data[f'ema_{span}'] = data['real_consumption'].ewm(span=span, adjust=False).mean()

    # Long rolling means; min_periods=1 yields a value even before a full window exists
    rows_per_day = int(24 * samples_per_hour)
    data['weekly_avg'] = data['real_consumption'].rolling(window=7 * rows_per_day, min_periods=1).mean()
    data['monthly_avg'] = data['real_consumption'].rolling(window=30 * rows_per_day, min_periods=1).mean()

    # Row-over-row percentage change of the target
    data['pct_change'] = data['real_consumption'].pct_change()

    # Cyclic (sine / cosine) encodings so the end of a period sits next to its start.
    # NOTE(review): ISO week numbers can reach 53 while the period used here is 52.
    for column, period in (('hour', 24), ('day_of_week', 7), ('month', 12), ('week_of_year', 52)):
        angle = 2 * np.pi * data[column] / period
        data[f'{column}_sin'] = np.sin(angle)
        data[f'{column}_cos'] = np.cos(angle)

    if holiday_calendar is None:
        # Georgia holidays for every year present in the data
        holiday_calendar = holidays.Georgia(years=range(data["year"].min(), data["year"].max() + 1))

    dates = data["time"].dt.date
    data["date"] = dates
    unique_dates = dates.unique()
    one_day = timedelta(days=1)

    def _holiday_flag(offset):
        # Resolve each distinct date once, then broadcast to the rows
        lookup = {day: int((day + offset) in holiday_calendar) for day in unique_dates}
        return dates.map(lookup.get)

    data['is_holiday'] = _holiday_flag(timedelta(0))
    # A day is "before a holiday" when TOMORROW is a holiday, and "after a holiday"
    # when YESTERDAY was one (the two offsets were previously swapped).
    data['is_day_before_holiday'] = _holiday_flag(one_day)
    data['is_day_after_holiday'] = _holiday_flag(-one_day)

    return data



In [3]:
# Load the cleaned, time-sorted consumption data
df = upload_data()

# Score the forecast that already ships with the data (the `predicted_consumption`
# column) so later models have a baseline to compare against
data_metrics(data=df, real="real_consumption", predicted="predicted_consumption")

# Add the engineered feature columns used by the models below
df = feature_engineering(df)


MAE: 56.7256
MSE: 5824.3342
RMSE: 76.3173
R²: 0.9176


In [4]:
# Hold out the final five months (relative to the latest timestamp) for the
# forecasting experiments; everything earlier stays in `df` for model fitting.
# NOTE: `df` is overwritten here, so running this cell twice trims a further
# five months — re-run the loading cell first.
last_5_months_start = df['time'].max() - pd.DateOffset(months=5)
test_mask = df['time'].ge(last_5_months_start)

last_5_months_df = df.loc[test_mask]
df = df.loc[~test_mask]



In [5]:
# Show which columns are available after feature engineering
df.columns

Index(['time', 'real_consumption', 'predicted_consumption', 'hour', 'minute',
       'day_of_week', 'is_weekend', 'day_of_month', 'week_of_year', 'month',
       'quarter', 'year', 'lag_1', 'lag_2', 'lag_3', 'lag_4', 'ema_3', 'ema_5',
       'weekly_avg', 'monthly_avg', 'pct_change', 'hour_sin', 'hour_cos',
       'day_of_week_sin', 'day_of_week_cos', 'month_sin', 'month_cos',
       'week_of_year_sin', 'week_of_year_cos', 'date', 'is_holiday',
       'is_day_before_holiday', 'is_day_after_holiday'],
      dtype='object')

In [6]:
# Narrow the frame to the identifier / target columns plus the candidate features.
# `ema_3` is carried along here but excluded again when the feature matrices are built.
selected_columns = [
    "time", "real_consumption", "predicted_consumption",
    "hour", "weekly_avg", "monthly_avg",
    "hour_sin", "hour_cos",
    "lag_1", "ema_3",
    "month_sin", "month_cos",
    "week_of_year_sin", "week_of_year_cos",
]
df = df.loc[:, selected_columns]



In [7]:
# Model 1: XGBoost on all selected features, including lag_1.
# Separate features (X) and target variable (y)
X = df.drop(columns=['time', 'real_consumption', 'predicted_consumption', "ema_3"])  # Features
y = df['real_consumption']  # Target variable

# NOTE(review): test_size=0.75 fits on only 25% of the rows, and shuffle=True mixes
# past and future rows of a time series whose features include lag_1 — confirm both
# are intended, since they make the test score optimistic for forecasting.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75, random_state=42, shuffle=True)

# Initialize the model
model = XGBRegressor(
    n_jobs=-1,
    random_state=42,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=1.0,
    min_child_weight=5,
    gamma=0,
    reg_alpha=0.1,
    reg_lambda=10,
)
# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluate the model.
# RMSE is taken as sqrt(MSE), as in data_metrics: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse_xgb = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_test))
train_r2_xgb = r2_score(y_train, y_pred_train)
test_r2_xgb = r2_score(y_test, y_pred_test)

print(f"XGBoost - Train RMSE: {train_rmse_xgb:.2f}, Test RMSE: {test_rmse_xgb:.2f}")
print(f"XGBoost - Train R²: {train_r2_xgb:.2f}, Test R²: {test_r2_xgb:.2f}")


XGBoost - Train RMSE: 16.37, Test RMSE: 16.74
XGBoost - Train R²: 1.00, Test R²: 1.00


In [8]:
# Model 2: same XGBoost configuration, but without lag_1 among the features.
# Separate features (X) and target variable (y)
X = df.drop(columns=['time', 'real_consumption', 'predicted_consumption', "ema_3", "lag_1"])  # Features
y = df['real_consumption']  # Target variable

# NOTE(review): test_size=0.75 fits on only 25% of the rows, and shuffle=True mixes
# past and future rows of a time series — confirm both are intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75, random_state=42, shuffle=True)

# Initialize the model
model2 = XGBRegressor(
    n_jobs=-1,
    random_state=42,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=1.0,
    min_child_weight=5,
    gamma=0,
    reg_alpha=0.1,
    reg_lambda=10,
)
# Train the model
model2.fit(X_train, y_train)

# Make predictions
y_pred_train = model2.predict(X_train)
y_pred_test = model2.predict(X_test)

# Evaluate the model.
# RMSE is taken as sqrt(MSE), as in data_metrics: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse_xgb = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_test))
train_r2_xgb = r2_score(y_train, y_pred_train)
test_r2_xgb = r2_score(y_test, y_pred_test)

print(f"XGBoost - Train RMSE: {train_rmse_xgb:.2f}, Test RMSE: {test_rmse_xgb:.2f}")
print(f"XGBoost - Train R²: {train_r2_xgb:.2f}, Test R²: {test_r2_xgb:.2f}")


XGBoost - Train RMSE: 61.30, Test RMSE: 62.39
XGBoost - Train R²: 0.95, Test R²: 0.94


In [15]:
# Recursive multi-step forecast with `model` (the variant that uses lag_1).
# For each of 50 consecutive 20-row windows of the hold-out data, the first 9
# observed rows seed the history. 19 further steps are then predicted one at a
# time; every prediction is written back as `real_consumption` so that the next
# step's lag / rolling features are computed from it.
result = []
for k in range(0, 50):
    hour = 1
    temp_df = last_5_months_df[(int(k*20*hour)):int((k+1)*20*hour)].head(9)[["time","real_consumption"]]
    if len(temp_df) < 9:
        # The hold-out data can no longer supply a full 9-row seed
        break

    for step in range(1, int(20*hour)):
        # Append ONE placeholder row, 3 minutes after the latest timestamp.
        # np.nan keeps the placeholder column float; a None placeholder yields an
        # object column, whose effect on the concatenated dtype depends on the
        # pandas version.
        start_time = temp_df["time"].iloc[-1] + timedelta(minutes=3)
        extra_df = pd.DataFrame({"time": [start_time], "real_consumption": [np.nan]})
        temp_df = pd.concat([temp_df, extra_df], ignore_index=True)

        # Rebuild the features on a copy and keep only the row to be predicted
        to_predict = feature_engineering(temp_df.copy())[["time","real_consumption","hour","weekly_avg","monthly_avg","hour_sin","hour_cos","lag_1","ema_3","month_sin","month_cos","week_of_year_sin","week_of_year_cos"]].tail(1)
        to_predict_1 = to_predict.drop(columns=['real_consumption','time','ema_3'])  # Features
        prediction = model.predict(to_predict_1)

        # Feed the prediction back in as the value of the row just appended
        temp_df.loc[temp_df.index[-1], "real_consumption"] = prediction[0]

    # Keep only the forecasted rows (everything after the 9 seed rows)
    result.append(temp_df[9:])


In [16]:
# Recursive multi-step forecast with `model2` (the variant without lag_1).
# For each of 50 consecutive 20-row windows of the hold-out data, the first 9
# observed rows seed the history. 19 further steps are then predicted one at a
# time; every prediction is written back as `real_consumption` so that the next
# step's rolling features are computed from it.
result2 = []
for k in range(0, 50):
    hour = 1
    temp_df = last_5_months_df[(int(k*20*hour)):int((k+1)*20*hour)].head(9)[["time","real_consumption"]]
    if len(temp_df) < 9:
        # The hold-out data can no longer supply a full 9-row seed
        break

    for step in range(1, int(20*hour)):
        # Append ONE placeholder row, 3 minutes after the latest timestamp.
        # np.nan keeps the placeholder column float; a None placeholder yields an
        # object column, whose effect on the concatenated dtype depends on the
        # pandas version.
        start_time = temp_df["time"].iloc[-1] + timedelta(minutes=3)
        extra_df = pd.DataFrame({"time": [start_time], "real_consumption": [np.nan]})
        temp_df = pd.concat([temp_df, extra_df], ignore_index=True)

        # Rebuild the features on a copy and keep only the row to be predicted
        to_predict = feature_engineering(temp_df.copy())[["time","real_consumption","hour","weekly_avg","monthly_avg","hour_sin","hour_cos","lag_1","ema_3","month_sin","month_cos","week_of_year_sin","week_of_year_cos"]].tail(1)
        to_predict_1 = to_predict.drop(columns=['real_consumption','time','ema_3','lag_1'])  # Features
        prediction = model2.predict(to_predict_1)

        # Feed the prediction back in as the value of the row just appended
        temp_df.loc[temp_df.index[-1], "real_consumption"] = prediction[0]

    # Keep only the forecasted rows (everything after the 9 seed rows)
    result2.append(temp_df[9:])


In [17]:
# Stack the per-window forecasts from `model`; the fed-back `real_consumption`
# column holds the predictions, so it is renamed accordingly.
reuslts_df = pd.concat(result).rename(columns={"real_consumption":"predicted_consumption_new"})

# Attach the observed values and the baseline forecast by timestamp, and keep
# only the rows where every column is present.
actuals_and_baseline = last_5_months_df[["time","real_consumption","predicted_consumption"]]
reuslts_df = reuslts_df.merge(actuals_and_baseline, on="time", how="left").dropna()

In [18]:
# Stack the per-window forecasts from `model2` and rename the fed-back column
reuslts_df2 = pd.concat(result2).rename(columns={"real_consumption":"predicted_consumption_new_12"})

# Add them to the comparison table by timestamp; rows missing any value are dropped
model2_forecasts = reuslts_df2[["time","predicted_consumption_new_12"]]
reuslts_df = reuslts_df.merge(model2_forecasts, on="time", how="left").dropna()


In [19]:
# Overlay the observed series, the baseline forecast and both model forecasts
# on one interactive Plotly line chart.
series_to_plot = [
    'real_consumption',
    'predicted_consumption',
    'predicted_consumption_new',
    'predicted_consumption_new_12',
]
fig = px.line(reuslts_df, x='time', y=series_to_plot, title="Interactive Time Series Plot")

# Axis / legend labels, plus a range slider for zooming along the time axis
layout_options = dict(
    xaxis_title="Timestamp",
    yaxis_title="Values",
    legend_title="Legend",
    xaxis_rangeslider_visible=True,
)
fig.update_layout(**layout_options)

# Render the figure
fig.show()

In [20]:
# Print the metric block for each forecast in turn, on the same rows:
# model (with lag_1), model2 (without lag_1), then the baseline forecast.
for forecast_column in (
    "predicted_consumption_new",
    "predicted_consumption_new_12",
    "predicted_consumption",
):
    data_metrics(data=reuslts_df, real="real_consumption", predicted=forecast_column)

MAE: 30.5480
MSE: 1536.5134
RMSE: 39.1984
R²: 0.9755
MAE: 163.6089
MSE: 32840.6780
RMSE: 181.2200
R²: 0.4753
MAE: 83.3045
MSE: 8800.2009
RMSE: 93.8094
R²: 0.8594
