In [7]:
#Importing libraries
import numpy as np
import pandas as pd
import os
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_curve, auc, precision_recall_fscore_support
from sklearn.metrics import precision_score, recall_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

In [None]:
#Defining the base directory
notebook_dir = os.getcwd()
base_dir = os.path.join(notebook_dir, 'Hourly Energy Consumption')

#List of file names
file_names = [
    "AEP_hourly.csv", "COMED_hourly.csv", "DAYTON_hourly.csv", 
    "DEOK_hourly.csv", "DOM_hourly.csv", "DUK_hourly.csv", 
    "EKPC_hourly.csv", "FE_hourly.csv", "NI_hourly.csv", 
    "PJM_Load_hourly.csv", "PJME_hourly.csv", "PJMW_hourly.csv"
]

#Loading the datasets into a dictionary
dataframes = {}
for file_name in file_names:
    file_path = os.path.join(base_dir, file_name)
    if os.path.exists(file_path):
        dataframes[file_name] = pd.read_csv(file_path)
        print(f"Loaded {file_name}")
    else:
        print(f"File not found: {file_name}")

#Combining all datasets into one
combined_df = pd.concat(dataframes.values(), ignore_index=True)

#Preprocessing
if 'Datetime' in combined_df.columns:
    combined_df['Datetime'] = pd.to_datetime(combined_df['Datetime'])
else:
    print("No 'Datetime' column found, please check the dataset structure.")

#Fill in any missing values
combined_df = combined_df.fillna(method='ffill')

#Sort by datetime
combined_df = combined_df.sort_values(by='Datetime')

#Display the first few rows of the preprocessed dataset
combined_df.head()



Loaded AEP_hourly.csv
Loaded COMED_hourly.csv
Loaded DAYTON_hourly.csv
Loaded DEOK_hourly.csv
Loaded DOM_hourly.csv
File not found: DUK_hourly.csv
Loaded EKPC_hourly.csv
Loaded FE_hourly.csv
Loaded NI_hourly.csv
Loaded PJM_Load_hourly.csv
Loaded PJME_hourly.csv
Loaded PJMW_hourly.csv


Unnamed: 0,Datetime,AEP_MW,COMED_MW,DAYTON_MW,DEOK_MW,DOM_MW,EKPC_MW,FE_MW,NI_MW,PJM_Load_MW,PJME_MW,PJMW_MW
656205,1998-04-01 01:00:00,19993.0,12816.0,2552.0,4100.0,17428.0,2846.0,8393.0,12223.0,22259.0,,
656206,1998-04-01 02:00:00,19993.0,12816.0,2552.0,4100.0,17428.0,2846.0,8393.0,12223.0,21244.0,,
656207,1998-04-01 03:00:00,19993.0,12816.0,2552.0,4100.0,17428.0,2846.0,8393.0,12223.0,20651.0,,
656208,1998-04-01 04:00:00,19993.0,12816.0,2552.0,4100.0,17428.0,2846.0,8393.0,12223.0,20421.0,,
656209,1998-04-01 05:00:00,19993.0,12816.0,2552.0,4100.0,17428.0,2846.0,8393.0,12223.0,20713.0,,


In [9]:
#Display basic information about the dataset
print(combined_df.info())
print(combined_df.isnull().sum())
print(combined_df.describe())

#Filter Data
numerical_cols = combined_df.select_dtypes(include=[np.number]).columns
Q1 = combined_df[numerical_cols].quantile(0.25)
Q3 = combined_df[numerical_cols].quantile(0.75)
IQR = Q3 - Q1
combined_df = combined_df[~((combined_df[numerical_cols] < (Q1 - 1.5 * IQR)) | (combined_df[numerical_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]



<class 'pandas.core.frame.DataFrame'>
Index: 971099 entries, 656205 to 116161
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Datetime     971099 non-null  datetime64[ns]
 1   AEP_MW       971099 non-null  float64       
 2   COMED_MW     849826 non-null  float64       
 3   DAYTON_MW    783329 non-null  float64       
 4   DEOK_MW      662054 non-null  float64       
 5   DOM_MW       604315 non-null  float64       
 6   EKPC_MW      488126 non-null  float64       
 7   FE_MW        442792 non-null  float64       
 8   NI_MW        379918 non-null  float64       
 9   PJM_Load_MW  321468 non-null  float64       
 10  PJME_MW      288572 non-null  float64       
 11  PJMW_MW      143206 non-null  float64       
dtypes: datetime64[ns](1), float64(11)
memory usage: 96.3 MB
None
Datetime            0
AEP_MW              0
COMED_MW       121273
DAYTON_MW      187770
DEOK_MW        309045
DOM_MW       

In [10]:
#Feature Selection 1 - Correlation Method
#Compute the correlation matrix andchoose top features
correlation_matrix = combined_df.corr()
if 'Consumption' in correlation_matrix.columns:
    correlated_features = correlation_matrix['Consumption'].sort_values(ascending=False).head(5).index.tolist()
    print("Top Correlated Features:", correlated_features)
else:
    print("Column 'Consumption' not found in the DataFrame.")


Column 'Consumption' not found in the DataFrame.


In [12]:
#Define the target variable
combined_df['Consumption'] = combined_df[
    ['AEP_MW', 'COMED_MW', 'DAYTON_MW', 'DEOK_MW', 'DOM_MW', 
     'EKPC_MW', 'FE_MW', 'NI_MW', 'PJM_Load_MW']
].sum(axis=1)

#Drop 'Datetime' and target from features
X = combined_df.drop(columns=['Datetime', 'Consumption'])
y = combined_df['Consumption']

#Fill missing values
X = X.fillna(0)

#Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Train Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

#Evaluate the model
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

#Feature importance
feature_importances = model.feature_importances_
for feature, importance in zip(X.columns, feature_importances):
    print(f"{feature}: {importance}")

Mean Squared Error: 21.00540718458843
AEP_MW: 0.0
COMED_MW: 0.002360057077408043
DAYTON_MW: 0.00012471945525626695
DEOK_MW: 0.0004491077113055066
DOM_MW: 0.04810574647129831
EKPC_MW: 0.006023127527342967
FE_MW: 0.0012483064338623332
NI_MW: 0.839770152283662
PJM_Load_MW: 0.1014000254164632
PJME_MW: 0.0005187576234013169
PJMW_MW: 0.0
