Importing required libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

Loading the dataset

In [3]:
# Location of the raw CSV, relative to the notebook's working directory.
file_path = "data/power_consumption.csv"  # Update with your actual file path

# Read the whole file into memory as the working DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

Converting the date column to datetime format and setting it as the index

In [4]:
# Parse the 'date' column into timestamps and promote it to the index
# in a single chained expression (the 'date' column is dropped from the
# regular columns once it becomes the index).
df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')

Handling missing values


In [5]:
# Propagate the last valid observation forward into every gap, column by column.
df = df.ffill()

Outlier Detection & Removal (using the IQR (Interquartile Range) method)

In [6]:
def remove_outliers(df, column, multiplier=1.5):
    """Replace IQR outliers in ``df[column]`` with neighbouring valid values.

    A value is treated as an outlier when it lies below
    ``Q1 - multiplier * IQR`` or above ``Q3 + multiplier * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of the column. Outliers are
    blanked to NaN and then filled from the previous valid observation;
    outliers at the very start of the column, which have no previous value,
    are filled from the next valid observation instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    column : str
        Name of the numeric column to clean.
    multiplier : float, default 1.5
        Width of the acceptance band, in units of the IQR.

    Returns
    -------
    None
        The result is written back into ``df[column]``.
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR

    is_outlier = (df[column] < lower_bound) | (df[column] > upper_bound)
    df[column] = np.where(is_outlier, np.nan, df[column])
    # Forward fill repairs interior gaps; the trailing backward fill covers
    # outliers in the first row(s), which forward fill alone leaves as NaN.
    df[column] = df[column].ffill().bfill()

# Clean the target column in place; the helper returns nothing.
remove_outliers(df=df, column='Appliances')

Extracting the time based features

In [7]:
# Map each new column name to the DatetimeIndex attribute that supplies it.
calendar_attrs = {
    'hour': 'hour',
    'day': 'day',
    'month': 'month',
    'year': 'year',
    'day_of_week': 'dayofweek',
}
for feature_name, index_attr in calendar_attrs.items():
    df[feature_name] = getattr(df.index, index_attr)

Encoding Cyclical Features
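Cyclical encoding maps each periodic feature (hour of day, day of week, month) onto a circle using its sine and cosine, so that values adjacent across the wrap-around point (e.g. hour 23 and hour 0) stay close together, which helps time series models.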

In [8]:
# (source column, divisor used to scale the value to a fraction of a full turn)
cyclical_specs = (('hour', 24), ('day_of_week', 7), ('month', 12))

for source_col, period in cyclical_specs:
    # Angle in radians for this feature.
    angle = 2 * np.pi * df[source_col] / period
    df[f'{source_col}_sin'] = np.sin(angle)
    df[f'{source_col}_cos'] = np.cos(angle)

Data Smoothing (Rolling Mean) or Moving average

In [9]:
# 5-observation trailing window; min_periods=1 lets the first rows average
# over however many observations are available instead of yielding NaN.
appliances_window = df['Appliances'].rolling(window=5, min_periods=1)
df['Appliances_smoothed'] = appliances_window.mean()

Feature Transformation (Log transformation for skewed features)

In [10]:
# Columns that receive the log(1 + x) transform.
skewed_features = ['Appliances', 'lights', 'rv1', 'rv2']  # Identify skewed features

# np.log1p is applied element-wise to the selected sub-frame.
df[skewed_features] = np.log1p(df[skewed_features])

 Creating Additional Lag Features
 Lag features are previous values of the time series; they give the model recent history to forecast from.

In [11]:
# Lagged copies of 'Appliances': the value observed 3 and 4 rows earlier.
df['lag_3'] = df['Appliances'].shift(3)
df['lag_4'] = df['Appliances'].shift(4)
# Trailing means over the last 3 and 5 rows.
df['rolling_mean_3'] = df['Appliances'].rolling(window=3).mean()
df['rolling_mean_5'] = df['Appliances'].rolling(window=5).mean()

# shift() and rolling() leave NaN in the first rows of each new column.
# Forward fill cannot repair those rows because no earlier value exists,
# so it is followed by a backward fill restricted to the new columns.
df.ffill(inplace=True)
new_feature_cols = ['lag_3', 'lag_4', 'rolling_mean_3', 'rolling_mean_5']
df[new_feature_cols] = df[new_feature_cols].bfill()


Normalize Numerical Features

In [12]:
# Columns to rescale; the order here is the feature order the scaler stores.
# NOTE(review): 'lag_1' and 'lag_2' are not created anywhere in the visible
# notebook - presumably they already exist in the input CSV; confirm.
num_cols = [
    'Appliances', 'lights',
    'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3',
    'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6',
    'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9',
    'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint',
    'rv1', 'rv2',
    'lag_1', 'lag_2', 'lag_3', 'lag_4',
    'rolling_mean_3', 'rolling_mean_5', 'Appliances_smoothed',
]

# Learn per-column min/max first, then apply the rescaling as a second step.
scaler = MinMaxScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

Saving the preprocessed dataset

In [13]:
# Destination for the cleaned, feature-enriched dataset.
preprocessed_file = "data/preprocessed_power_consumption.csv"

# to_csv writes the index by default, so the datetime index is kept
# as the first column of the output file.
df.to_csv(preprocessed_file)

print("Advanced Preprocessing completed. Data saved as preprocessed_power_consumption.csv")


Advanced Preprocessing completed. Data saved as preprocessed_power_consumption.csv
