In [1]:
import pandas as pd
import numpy as np

# Load the electricity-demand CSV into `df`, the working frame used by the
# cells below. The path is relative to the notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer="../../Dataset1/Cleaned/electricity_demand_2001_2025_mean.csv"
)

In [2]:
# Names of every numeric-dtype column in `df`; the outlier scan iterates over
# this Index.
numeric_cols = df.select_dtypes(include="number").columns
print("Numeric columns:", list(numeric_cols))

Numeric columns: ['settlement_period', 'nd', 'tsd', 'england_wales_demand', 'embedded_wind_generation', 'embedded_wind_capacity', 'embedded_solar_generation', 'embedded_solar_capacity', 'non_bm_stor', 'pump_storage_pumping', 'scottish_transfer', 'ifa_flow', 'ifa2_flow', 'britned_flow', 'moyle_flow', 'east_west_flow', 'nemo_flow', 'nsl_flow', 'eleclink_flow', 'viking_flow', 'greenlink_flow', 'year']


In [3]:
# Flag outliers in every numeric column with the 1.5 * IQR rule: a value is an
# outlier when it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR. For each
# affected column a short sample is printed, and all flagged rows are gathered
# in `outliers`.
outlier_frames = []

for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # NaN compares False on both sides, so missing values are never flagged.
    mask = (df[col] < lower_bound) | (df[col] > upper_bound)
    outliers_in_col = df[mask]

    if not outliers_in_col.empty:
        print(f"\nOutliers detected in column: {col}")
        print(outliers_in_col[[col]].head())
        outlier_frames.append(outliers_in_col)

# Concatenate once instead of growing a frame inside the loop (which re-copies
# every accumulated row on each iteration). A row that is an outlier in several
# columns appears once per column, as before. When nothing was flagged, fall
# back to an empty frame that still carries df's columns.
outliers = pd.concat(outlier_frames) if outlier_frames else df.iloc[0:0]


Outliers detected in column: nd
          nd
34017  59770
34018  60333
34019  59544
35362  59892
35410  59899

Outliers detected in column: tsd
       tsd
70128  0.0
70129  0.0
70130  0.0
70131  0.0
70132  0.0

Outliers detected in column: england_wales_demand
        england_wales_demand
34018                  54430
35410                  54015
122001                 54000
122002                 54221
180929                     0

Outliers detected in column: embedded_wind_generation
        embedded_wind_generation
241891                    2534.0
241892                    2534.0
241893                    2677.0
241894                    2677.0
241895                    2718.0

Outliers detected in column: embedded_solar_generation
        embedded_solar_generation
216168                     2460.0
216169                     2540.0
216171                     2470.0
216217                     2420.0
216218                     2470.0

Outliers detected in column: embedded_solar_capaci

In [11]:
# Cap extreme values in selected flow columns at the 1.5 * IQR fences
# (Q1 - 1.5*IQR, Q3 + 1.5*IQR), report min/max before and after, then save the
# result to CSV.
#
# NOTE: the clipped values are written back into `df` in place. Re-running this
# cell without reloading `df` therefore reports already-clipped values in the
# "Before clipping" lines and clips 0 values; the per-column count printed
# below makes that visible.
cols_to_clip = ['ifa_flow', 'ifa2_flow', 'britned_flow', 'east_west_flow']

# Resolve once which requested columns exist, and say so when one is missing
# rather than skipping it silently.
present_cols = [col for col in cols_to_clip if col in df.columns]
missing_cols = [col for col in cols_to_clip if col not in df.columns]
if missing_cols:
    print(f"Skipping columns not found in df: {missing_cols}")

# Check before clipping
for col in present_cols:
    print(f"Before clipping {col}: min={df[col].min()}, max={df[col].max()}")
print()


for col in present_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    # With IQR == 0 both fences equal Q1, so clipping would overwrite the whole
    # column with a single constant. Leave such a column untouched.
    if IQR == 0:
        print(f"Skipping {col}: IQR is 0, clipping would flatten the column")
        continue

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    n_clipped = int(((df[col] < lower_bound) | (df[col] > upper_bound)).sum())
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)
    print(f"Clipped {n_clipped} values in {col} to [{lower_bound}, {upper_bound}]")
print()


# Check after clipping
for col in present_cols:
    print(f"After clipping {col}: min={df[col].min()}, max={df[col].max()}")


df.to_csv("../../Dataset1/Cleaned/electricity_demand_2001_2025_cleaned_outliers.csv", index=False)

Before clipping ifa_flow: min=-2070, max=2194
Before clipping ifa2_flow: min=-125.87457533071439, max=209.790958884524
Before clipping britned_flow: min=-667.5, max=1143.0
Before clipping east_west_flow: min=-143.31941598186262, max=85.99164958911757

After clipping ifa_flow: min=-2070, max=2194
After clipping ifa2_flow: min=-125.87457533071439, max=209.790958884524
After clipping britned_flow: min=-667.5, max=1143.0
After clipping east_west_flow: min=-143.31941598186262, max=85.99164958911757
