In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# -------------------------------
# 1. Load Dataset
# -------------------------------
# The path is relative to the working directory the notebook runs from.
# NOTE(review): the degree sign in the temperature column header appears
# mis-decoded when the frame is printed -- confirm the CSV's encoding before
# referring to that column by name.
df = pd.read_csv("../Dataset/Raw_data.csv")

# -------------------------------
# 2. Convert Timestamp & Extract Features
# -------------------------------
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# The lag and rolling features below are positional (previous *row*, not
# previous *hour*), so the rows must be in chronological order first.
# A stable sort keeps the original relative order of rows that share a
# timestamp and leaves an already-sorted file untouched.
df = df.sort_values('Timestamp', kind='mergesort')

timestamp_parts = df['Timestamp'].dt
df['Hour'] = timestamp_parts.hour
df['DayOfWeek'] = timestamp_parts.dayofweek  # Monday=0 ... Sunday=6
df['Month'] = timestamp_parts.month

# -------------------------------
# 3. Label Encoding
# -------------------------------
# One encoder per column, kept in its own variable so the fitted mapping
# (encoder.classes_) stays available for inverse_transform later on.
# LabelEncoder numbers classes in sorted label order, so the integer codes
# carry no ordinal meaning of their own.
le_id = LabelEncoder()
le_type = LabelEncoder()
le_occ = LabelEncoder()

df['Building_ID'] = le_id.fit_transform(df['Building_ID'])
df['Building_Type'] = le_type.fit_transform(df['Building_Type'])
df['Occupancy_Level'] = le_occ.fit_transform(df['Occupancy_Level'])

# -------------------------------
# 4. Add Lag Features (Improves Model)
# -------------------------------
# NOTE(review): the shifts run over the whole frame, not per Building_ID, so
# a row's lag value may belong to a different building -- confirm that this
# is intended.
energy_usage = df['Energy_Usage (kWh)']
df['lag_1'] = energy_usage.shift(1)
df['lag_2'] = energy_usage.shift(2)

# Rolling average (smoothing) over the current row and the two before it.
# NOTE(review): the window includes the current row's own value; if
# 'Energy_Usage (kWh)' is the prediction target this feature leaks it --
# consider energy_usage.shift(1).rolling(window=3).mean() instead.
df['rolling_mean_3'] = energy_usage.rolling(window=3).mean()

# -------------------------------
# 5. Remove rows with NA
# -------------------------------
# Drops every row that has a NaN in *any* column. The first two rows are
# always removed because lag_2 and rolling_mean_3 are undefined there.
df = df.dropna()

# -------------------------------
# 6. Final Preprocessed Dataset
# -------------------------------
print(df.head())
print("\nFinal Shape:", df.shape)

# Optional: save processed dataset (index is not written to the file)
df.to_csv("../Dataset/preprocessed_energy_data.csv", index=False)


            Timestamp  Building_ID  Energy_Usage (kWh)  Temperature (Â°C)  \
2 2025-01-01 02:00:00           28              355.40             25.19   
3 2025-01-01 03:00:00           14               98.29             19.73   
4 2025-01-01 04:00:00           10              459.18             12.85   
5 2025-01-01 05:00:00            7              671.47             21.49   
6 2025-01-01 06:00:00           28              207.56             24.36   

   Humidity (%)  Building_Type  Occupancy_Level  Hour  DayOfWeek  Month  \
2         79.77              2                0     2          2      1   
3         78.52              2                1     3          2      1   
4         42.63              1                0     4          2      1   
5         77.89              1                0     5          2      1   
6         38.41              2                2     6          2      1   

    lag_1   lag_2  rolling_mean_3  
2  604.49  214.79      391.560000  
3  355.40  604.49  