In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

In [49]:
# Load the preprocessed dataset written by an earlier pipeline step.
# Pickle files can execute arbitrary code when loaded, so only read artifacts
# that this project produced itself.
processed_data_path = '../artifacts/processed_data.pkl'
df = pd.read_pickle(processed_data_path)

In [50]:
# Replace every missing entry with 0 and rebind `df` to the filled copy.
# NOTE(review): the same fill value is applied to every column regardless of
# dtype (including non-numeric ones such as 'City' or 'Date') — confirm that a
# blanket zero fill is what is wanted before relying on downstream results.
df = df.fillna(value=0)

In [51]:
# Parse the timestamp column once, then derive calendar features from it.
df['Date'] = pd.to_datetime(df['Date'])

# New column name -> name of the `.dt` accessor attribute that supplies it.
# NOTE(review): the previewed rows show date-only values, so 'hour' may be a
# constant 0 — confirm whether 'Date' carries a time-of-day component.
calendar_parts = {
    'hour': 'hour',
    'day': 'day',
    'month': 'month',
    'year': 'year',
    'day_of_week': 'dayofweek',  # pandas convention: Monday=0 ... Sunday=6
}
for column_name, dt_attribute in calendar_parts.items():
    df[column_name] = getattr(df['Date'].dt, dt_attribute)

In [52]:
# Summarise the frame: index range, column names, non-null counts and dtypes.
print("Dataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None, so
# it must not be wrapped in print() (that would emit a stray "None" line).
df.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4392 entries, 0 to 4391
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   Date                          4392 non-null   datetime64[ns]
 1   City                          4392 non-null   object        
 2   Temperature (°C)              4392 non-null   float64       
 3   Humidity (%)                  4392 non-null   float64       
 4   Wind Speed (km/h)             4392 non-null   float64       
 5   Rainfall (mm)                 4392 non-null   float64       
 6   Solar Radiation (W/m²)        4392 non-null   float64       
 7   Electricity Demand (MW)       4392 non-null   float64       
 8   Sector                        4392 non-null   object        
 9   Energy Used (MWh)             4392 non-null   float64       
 10  Type of Energy Source         4392 non-null   object        
 11  Predicted Energy

In [53]:
# Preview the first rows as plain text, preceded by a blank line and a heading.
print("\nFirst few rows of the dataset:", df.head(), sep="\n")


First few rows of the dataset:
        Date         City  Temperature (°C)  Humidity (%)  Wind Speed (km/h)  \
0 2024-01-01     Kakinada         29.869022     83.275001          15.979909   
1 2024-01-01     Kakinada         25.755087     80.316165          14.016725   
2 2024-01-01     Kakinada         35.821754     57.431869           7.727375   
3 2024-01-01  Samarlakota         30.615285     60.193020          14.177793   
4 2024-01-01  Samarlakota         30.928910     77.481159           7.995107   

   Rainfall (mm)  Solar Radiation (W/m²)  Electricity Demand (MW)  \
0      11.973170              362.407456                88.998630   
1      14.161452              308.233798               292.477463   
2       3.668090              421.696897               181.189108   
3       2.789877              416.857859               141.590461   
4      10.284689              536.965828                61.612603   

        Sector  Energy Used (MWh) Type of Energy Source  \
0  Residentia

In [54]:
# Report the number of null entries per column.
# This runs after the earlier `fillna(0)` cell, so every count is expected to
# be zero here; it confirms the fill rather than describing the raw data.
print("\nMissing values in the dataset:")
null_counts = df.isnull().sum()
print(null_counts)


Missing values in the dataset:
Date                            0
City                            0
Temperature (°C)                0
Humidity (%)                    0
Wind Speed (km/h)               0
Rainfall (mm)                   0
Solar Radiation (W/m²)          0
Electricity Demand (MW)         0
Sector                          0
Energy Used (MWh)               0
Type of Energy Source           0
Predicted Energy Demand (MW)    0
Predicted Energy Used (MWh)     0
Demand Fluctuation (%)          0
hour                            0
day                             0
month                           0
year                            0
day_of_week                     0
dtype: int64


In [55]:
# Numeric model inputs, grouped by where they come from.
# Weather measurements read directly from the dataset.
weather_features = [
    'Temperature (°C)',
    'Humidity (%)',
    'Wind Speed (km/h)',
    'Rainfall (mm)',
    'Solar Radiation (W/m²)',
]
# Calendar fields derived from the 'Date' column in an earlier cell.
calendar_features = ['hour', 'day', 'month', 'year', 'day_of_week']

features = weather_features + calendar_features

In [56]:
# One-hot encode the categorical columns: each listed column is replaced by
# indicator columns named '<column>_<category>'.
# Note: this cell is not re-runnable on its own output, because the source
# columns no longer exist in `df` once it has been executed.
categorical_columns = ['City', 'Sector', 'Type of Energy Source']
df = pd.get_dummies(df, columns=categorical_columns)

In [57]:
# Assemble the design matrix (numeric features + one-hot indicators) and target.
#
# pd.get_dummies names indicator columns '<original column>_<category>', so the
# energy-source indicators start with 'Type of Energy Source_'. Matching on the
# shorter 'Type_' prefix selects nothing and silently drops those columns.
dummy_prefixes = ('City_', 'Sector_', 'Type of Energy Source_')
dummy_columns = [col for col in df.columns if col.startswith(dummy_prefixes)]

X = df[features + dummy_columns]
y = df['Electricity Demand (MW)']

In [58]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the shuffle
# (and therefore the split) reproducible across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


In [59]:
# Standardise the features. The scaler's statistics are learned from the
# training rows only and then applied unchanged to both partitions, so nothing
# about the test rows influences the fit.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [60]:
# Persist the fitted scaler and the train/test arrays for later notebooks.
joblib.dump(scaler, '../artifacts/scaler.joblib')

# (destination path, object) pairs written with np.save, in this order.
array_artifacts = [
    ('../artifacts/X_train_scaled.npy', X_train_scaled),
    ('../artifacts/X_test_scaled.npy', X_test_scaled),
    ('../artifacts/y_train.npy', y_train),
    ('../artifacts/y_test.npy', y_test),
]
for artifact_path, artifact_values in array_artifacts:
    np.save(artifact_path, artifact_values)

# NOTE(review): despite the file name, this pickles the entire feature frame
# `X` (all rows), not just its column names — confirm what consumers expect.
X.to_pickle('../artifacts/feature_columns.pkl')