# Exploratory Data Analysis

## 1. Wrangling the Data

In [1]:
# Import all the necessary libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
# Load the raw CSV into a pandas DataFrame
raw_csv_path = "/home/williams/Projects/VET_DL/Data/raw/synthetic_ieee14_data.csv"
df = pd.read_csv(raw_csv_path)
# Preview the first five rows as plain text
print(df.head())

   time_step                                       bus_voltages  \
0          0  [1.06, 1.0450000000000004, 1.0100000000000002,...   
1          1  [1.06, 1.045, 1.01, 1.0105663409331898, 1.0127...   
2          2  [1.06, 1.0449999999999997, 1.01, 0.99948161644...   
3          3  [1.06, 1.0449999999999995, 1.0100000000000005,...   
4          4  [1.06, 1.045, 1.0100000000000007, 0.9849726893...   

                                          bus_angles  \
0  [0.0, -5.689938615261936, -14.321740579290003,...   
1  [0.0, -6.57357321457588, -16.310933714152196, ...   
2  [0.0, -8.836314978861031, -21.378367630264485,...   
3  [0.0, -9.881780213970647, -23.70727880495562, ...   
4  [0.0, -11.503930918943194, -27.305755445456825...   

                                              load_p  \
0  [23.96211142327013, 104.01985696184546, 52.782...   
1  [26.74193049294084, 116.0870899739644, 58.9061...   
2  [33.62082282377392, 145.9484566820048, 74.0587...   
3  [36.6795829828559, 159.2265768195

In [3]:
import ast  # Abstract Syntax Tree for safe evaluation


def _parse_stringified_list(value):
    """Parse a stringified Python literal; return already-parsed values unchanged.

    Only ``str`` inputs are handed to ``ast.literal_eval``. This keeps the cell
    idempotent: re-running it after the columns already hold lists would
    otherwise raise ``ValueError`` inside ``literal_eval``.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# Convert stringified lists to actual lists
columns_to_fix = ['bus_voltages', 'bus_angles', 'load_p', 'load_q', 'gen_p', 'gen_q']

for col in columns_to_fix:
    # literal_eval only evaluates Python literals, so it is a safe way to parse
    # the stringified lists read from the CSV
    df[col] = df[col].apply(_parse_stringified_list)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   time_step     1000 non-null   int64 
 1   bus_voltages  1000 non-null   object
 2   bus_angles    1000 non-null   object
 3   load_p        1000 non-null   object
 4   load_q        1000 non-null   object
 5   gen_p         1000 non-null   object
 6   gen_q         1000 non-null   object
dtypes: int64(1), object(6)
memory usage: 54.8+ KB


In [4]:
# Confirm what Python type the first row holds in each list-valued column
columns_to_check = [
    'bus_voltages',
    'bus_angles',
    'load_p',
    'load_q',
    'gen_p',
    'gen_q',
]

for col in columns_to_check:
    first_entry = df[col].iloc[0]
    print(f"{col}: type of first entry = {type(first_entry)}")


bus_voltages: type of first entry = <class 'list'>
bus_angles: type of first entry = <class 'list'>
load_p: type of first entry = <class 'list'>
load_q: type of first entry = <class 'list'>
gen_p: type of first entry = <class 'list'>
gen_q: type of first entry = <class 'list'>


In [5]:
# Look inside the first bus_voltages entry: its leading values and element type
first_bus_voltages = df['bus_voltages'].iloc[0]
print(f"First 3 elements in 'bus_voltages': {first_bus_voltages[:3]}")
print(f"Type of first element: {type(first_bus_voltages[0])}")


First 3 elements in 'bus_voltages': [1.06, 1.0450000000000004, 1.0100000000000002]
Type of first element: <class 'float'>


In [6]:
# Re-display the frame summary: the list-valued columns still report dtype
# `object`, because pandas stores Python lists as generic objects
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   time_step     1000 non-null   int64 
 1   bus_voltages  1000 non-null   object
 2   bus_angles    1000 non-null   object
 3   load_p        1000 non-null   object
 4   load_q        1000 non-null   object
 5   gen_p         1000 non-null   object
 6   gen_q         1000 non-null   object
dtypes: int64(1), object(6)
memory usage: 54.8+ KB


In [7]:
# Define a function for EDA

def eda_power_data(df):
    """
    Build, print, and return a basic EDA summary of a DataFrame.

    Numeric columns are summarised with ``describe()``. Object columns whose
    first non-null entry is a ``list`` are summarised by the ``describe()`` of
    their per-row list lengths (null rows are ignored). Other columns get no
    descriptive statistics.

    Parameters:
    - df (pd.DataFrame): Input DataFrame. May be empty.

    Returns:
    - dict with keys 'shape', 'dtypes', 'missing_values', 'head' (first 3 rows)
      and 'descriptive_stats' (dict mapping a column name, or
      '<column>_length_stats' for list columns, to a describe() Series).
    """
    summary = {
        'shape': df.shape,                     # Dataset dimensions
        'dtypes': df.dtypes,                   # Data types
        'missing_values': df.isnull().sum(),   # Missing values per column
        'head': df.head(3),                    # Sample data
    }

    # Descriptive statistics (only for numeric columns and array-lengths)
    descriptive_stats = {}
    for col in df.columns:
        series = df[col]
        if pd.api.types.is_numeric_dtype(series):
            descriptive_stats[col] = series.describe()
            continue
        if not pd.api.types.is_object_dtype(series):
            continue
        # Inspect the first non-null entry instead of row 0, so an empty frame
        # or a leading null neither raises nor hides a list column
        non_null = series.dropna()
        if non_null.empty or not isinstance(non_null.iloc[0], list):
            continue
        lengths = non_null.apply(len)
        descriptive_stats[f'{col}_length_stats'] = lengths.describe()
    summary['descriptive_stats'] = descriptive_stats

    # Print basic overview
    print("========== EDA Summary ==========")
    print("Shape:", summary['shape'])
    print("\nData Types:\n", summary['dtypes'])
    print("\nMissing Values:\n", summary['missing_values'])
    print("\nSample Rows:\n", summary['head'])
    print("\n--- Descriptive Statistics ---")
    for key, stats in descriptive_stats.items():
        print(f"\n{key}:\n", stats)

    return summary



In [8]:
# Print EDA summary
eda_summary = eda_power_data(df)
# Save the EDA summary to a text file
eda_summary_path = "/home/williams/Projects/VET_DL/Data/processed/eda_summary.txt"
# Create the output directory first so open() does not fail when it is missing
os.makedirs(os.path.dirname(eda_summary_path), exist_ok=True)
# Explicit encoding keeps the report identical regardless of the platform default
with open(eda_summary_path, "w", encoding="utf-8") as f:
    f.write("========== EDA Summary ==========\n")
    f.write(f"Shape: {eda_summary['shape']}\n")
    f.write("\nData Types:\n")
    f.write(str(eda_summary['dtypes']))
    f.write("\n\nMissing Values:\n")
    f.write(str(eda_summary['missing_values']))
    f.write("\n\nSample Rows:\n")
    f.write(str(eda_summary['head']))
    f.write("\n\n--- Descriptive Statistics ---\n")
    for key, stats in eda_summary['descriptive_stats'].items():
        f.write(f"\n{key}:\n{stats}\n")

Shape: (1000, 7)

Data Types:
 time_step        int64
bus_voltages    object
bus_angles      object
load_p          object
load_q          object
gen_p           object
gen_q           object
dtype: object

Missing Values:
 time_step       0
bus_voltages    0
bus_angles      0
load_p          0
load_q          0
gen_p           0
gen_q           0
dtype: int64

Sample Rows:
    time_step                                       bus_voltages  \
0          0  [1.06, 1.0450000000000004, 1.0100000000000002,...   
1          1  [1.06, 1.045, 1.01, 1.0105663409331898, 1.0127...   
2          2  [1.06, 1.0449999999999997, 1.01, 0.99948161644...   

                                          bus_angles  \
0  [0.0, -5.689938615261936, -14.321740579290003,...   
1  [0.0, -6.57357321457588, -16.310933714152196, ...   
2  [0.0, -8.836314978861031, -21.378367630264485,...   

                                              load_p  \
0  [23.96211142327013, 104.01985696184546, 52.782...   
1  [26.741930492

## 2. Visual EDA

In [28]:
# Import the necessary libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def _save_time_series_plot(df, series, title, ylabel, figsize, out_path):
    """Plot one or more columns of ``df`` against 'time_step' and save the figure.

    ``series`` is a sequence of ``(column, label)`` pairs. A ``label`` of None
    draws the line without a legend entry; a legend is added only when at
    least one label is given. The figure is closed after saving.
    """
    plt.figure(figsize=figsize)
    for column, label in series:
        # Only forward `label` when one was supplied
        line_kwargs = {} if label is None else {'label': label}
        sns.lineplot(x='time_step', y=column, data=df, **line_kwargs)
    plt.title(title)
    plt.xlabel('Time Step')
    plt.ylabel(ylabel)
    if any(label is not None for _, label in series):
        plt.legend()
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()


# Define a function to perform visual EDA for time series data and save plots
# to disk
def perform_visual_eda(df, return_df=False, save_dir='/home/williams/Projects/VET_DL/Results/plots/eda_results'):
    """
    Performs visual EDA on power grid time-series data and saves plots to disk.

    The input DataFrame is not modified: the summary columns are added to a
    copy, so calling this function repeatedly (or re-running the cell) leaves
    the caller's frame unchanged.

    Parameters:
    - df (pd.DataFrame): Input DataFrame with a 'time_step' column and the
      list-valued columns 'bus_voltages', 'bus_angles', 'load_p', 'load_q',
      'gen_p' and 'gen_q'.
    - return_df (bool): Return a copy of the DataFrame with summary columns.
    - save_dir (str): Directory to save the plots (created if missing).

    Returns:
    - Copy of the DataFrame with the summary columns if return_df=True,
      otherwise None.
    """

    # Ensure the save directory exists
    os.makedirs(save_dir, exist_ok=True)

    # Create summary columns on a copy (assign returns a new DataFrame)
    summary_df = df.assign(
        mean_bus_voltage=df['bus_voltages'].apply(np.mean),
        mean_bus_angle=df['bus_angles'].apply(np.mean),
        sum_load_p=df['load_p'].apply(np.sum),
        sum_load_q=df['load_q'].apply(np.sum),
        sum_gen_p=df['gen_p'].apply(np.sum),
        sum_gen_q=df['gen_q'].apply(np.sum),
    )

    # Set plot style
    sns.set(style="whitegrid")

    # Plot 1: Mean Bus Voltage over Time
    _save_time_series_plot(
        summary_df, [('mean_bus_voltage', None)],
        title='Mean Bus Voltage Over Time', ylabel='Mean Voltage',
        figsize=(12, 4), out_path=os.path.join(save_dir, 'mean_bus_voltage.png'),
    )

    # Plot 2: Mean Bus Angle over Time
    _save_time_series_plot(
        summary_df, [('mean_bus_angle', None)],
        title='Mean Bus Angle Over Time', ylabel='Mean Angle',
        figsize=(12, 4), out_path=os.path.join(save_dir, 'mean_bus_angle.png'),
    )

    # Plot 3: Real Power (P) Over Time
    _save_time_series_plot(
        summary_df, [('sum_load_p', 'Total Load P'), ('sum_gen_p', 'Total Gen P')],
        title='Real Power (P) Over Time', ylabel='Power (MW)',
        figsize=(14, 5), out_path=os.path.join(save_dir, 'real_power_P.png'),
    )

    # Plot 4: Reactive Power (Q) Over Time
    _save_time_series_plot(
        summary_df, [('sum_load_q', 'Total Load Q'), ('sum_gen_q', 'Total Gen Q')],
        title='Reactive Power (Q) Over Time', ylabel='Reactive Power (MVar)',
        figsize=(14, 5), out_path=os.path.join(save_dir, 'reactive_power_Q.png'),
    )

    print(f"✅ Visual EDA complete. Plots saved to: {save_dir}")

    if return_df:
        return summary_df


In [29]:
# Run the visual EDA for its side effect only: the four plots are written to
# save_dir and each figure is closed after saving, so nothing renders inline
perform_visual_eda(df)

✅ Visual EDA complete. Plots saved to: /home/williams/Projects/VET_DL/Results/plots/eda_results


In [15]:
# Save and get the modified DataFrame with summary columns
df_summary = perform_visual_eda(df, return_df=True)
summary_csv_path = "/home/williams/Projects/VET_DL/Data/processed/synthetic_ieee14_data_summary.csv"
# Create the output directory first so to_csv() does not fail when it is missing
os.makedirs(os.path.dirname(summary_csv_path), exist_ok=True)
df_summary.to_csv(summary_csv_path, index=False)
df_summary.head()

✅ Visual EDA complete. Plots saved to: /home/williams/Projects/VET_DL/Results/plots


Unnamed: 0,time_step,bus_voltages,bus_angles,load_p,load_q,gen_p,gen_q,mean_bus_voltage,mean_bus_angle,sum_load_p,sum_load_q,sum_gen_p,sum_gen_q
0,0,"[1.06, 1.0450000000000004, 1.0100000000000002,...","[0.0, -5.689938615261936, -14.321740579290003,...","[23.96211142327013, 104.01985696184546, 52.782...","[14.023908528826299, 20.980650554936982, -4.30...","[40.0, 0.0, 0.0, 0.0]","[54.06772755189817, 32.90656438591154, 17.9511...",1.046199,-13.508905,285.999394,81.16199,40.0,124.486857
1,1,"[1.06, 1.045, 1.01, 1.0105663409331898, 1.0127...","[0.0, -6.57357321457588, -16.310933714152196, ...","[26.74193049294084, 116.0870899739644, 58.9061...","[15.650807247020676, 23.414593519164793, -4.80...","[40.0, 0.0, 0.0, 0.0]","[68.03302118822202, 42.879219944267376, 24.657...",1.043304,-15.316967,319.17788,90.577507,40.0,157.617798
2,2,"[1.06, 1.0449999999999997, 1.01, 0.99948161644...","[0.0, -8.836314978861031, -21.378367630264485,...","[33.62082282377392, 145.9484566820048, 74.0587...","[19.676702758614233, 29.43758680422602, -6.042...","[40.0, 0.0, 0.0, 0.0]","[107.94684349953482, 69.3236913189462, 42.7456...",1.035627,-19.933862,401.280789,113.876981,40.0,248.755189
3,3,"[1.06, 1.0449999999999995, 1.0100000000000005,...","[0.0, -9.881780213970647, -23.70727880495562, ...","[36.6795829828559, 159.22657681958648, 80.7965...","[21.46685271346866, 32.11576390203973, -6.5921...","[40.0, 0.0, 0.0, 0.0]","[128.36612847578445, 81.95465043659755, 51.534...",1.031956,-22.062143,437.788571,124.237297,40.0,293.839514
4,4,"[1.06, 1.045, 1.0100000000000007, 0.9849726893...","[0.0, -11.503930918943194, -27.305755445456825...","[41.27147309793498, 179.1600352914966, 90.9113...","[24.15427227390665, 36.13631285072648, -7.4174...","[40.0, 0.0, 0.0, 0.0]","[162.46665040665448, 102.02841649983705, 65.69...",1.026111,-25.360369,492.595001,139.790473,40.0,367.395144


## Plot Individual Bus Voltage Trends Over Time

Since `bus_voltages` holds one list per row, we first stack the rows into a 2D NumPy array of shape (n_time_steps, n_buses). Then we can plot each bus’s voltage trend over time.

In [35]:
def plot_individual_bus_voltages(df, save_dir=('/home/williams/Projects/VET_DL/Results/plots/individual_bus_voltages')):
    """
    Plot every bus's voltage over time on one figure and save it to disk.

    Parameters:
    - df (pd.DataFrame): Must contain 'time_step' and a list-valued
      'bus_voltages' column; all lists must have the same length so they can
      be stacked into a 2D array.
    - save_dir (str): Directory for 'individual_bus_voltages.png'
      (created if missing).
    """
    # Ensure the save directory exists so savefig does not fail when this
    # function is called on its own
    os.makedirs(save_dir, exist_ok=True)

    voltage_matrix = np.vstack(df['bus_voltages'].values)  # shape: (timesteps, buses)
    time = df['time_step'].values

    plt.figure(figsize=(14, 6))
    for i in range(voltage_matrix.shape[1]):
        # Column i of the matrix is labelled 'Bus i+1' (1-based)
        plt.plot(time, voltage_matrix[:, i], label=f'Bus {i+1}')
    plt.title('Individual Bus Voltage Trends Over Time')
    plt.xlabel('Time Step')
    plt.ylabel('Voltage (p.u.)')
    plt.legend(loc='best', ncol=2)
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, 'individual_bus_voltages.png'))
    plt.close()
    print(f"✅ Individual bus voltages plot saved to: {save_dir}")

## PCA & t-SNE for Dimensionality Reduction Visualization
We’ll apply PCA and t-SNE on the bus_voltages matrix and plot the 2D projections.

In [36]:
# Import the necessary libraries for visualization
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def plot_dimensionality_reduction(df, save_dir=("/home/williams/Projects/VET_DL/Results/plots/dimensionality_reduction"),
                                  perplexity=30, random_state=42):
    """
    Project the bus-voltage matrix to 2D with PCA and t-SNE and save both plots.

    Points are coloured by 'time_step'. Two files are written to save_dir:
    'pca_projection.png' and 'tsne_projection.png'.

    Parameters:
    - df (pd.DataFrame): Must contain 'time_step' and a list-valued
      'bus_voltages' column with equal-length lists.
    - save_dir (str): Output directory (created if missing).
    - perplexity (float): Requested t-SNE perplexity. It is capped at
      n_samples - 1, because t-SNE requires perplexity < n_samples.
    - random_state (int): Seed passed to t-SNE for reproducible embeddings.
    """
    # Ensure the save directory exists so savefig does not fail when this
    # function is called on its own
    os.makedirs(save_dir, exist_ok=True)

    voltage_matrix = np.vstack(df['bus_voltages'].values)  # shape: (timesteps, buses)
    n_samples = voltage_matrix.shape[0]

    # PCA
    pca = PCA(n_components=2)
    voltage_pca = pca.fit_transform(voltage_matrix)
    plt.figure(figsize=(6, 5))
    plt.scatter(voltage_pca[:, 0], voltage_pca[:, 1], c=df['time_step'], cmap='viridis', s=20)
    plt.title('PCA Projection of Bus Voltages')
    plt.colorbar(label='Time Step')
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, 'pca_projection.png'))
    plt.close()

    # t-SNE
    # Cap the perplexity so small frames (<= 30 rows with the default) still work
    effective_perplexity = min(perplexity, n_samples - 1)
    tsne = TSNE(n_components=2, perplexity=effective_perplexity, random_state=random_state)
    voltage_tsne = tsne.fit_transform(voltage_matrix)
    plt.figure(figsize=(6, 5))
    plt.scatter(voltage_tsne[:, 0], voltage_tsne[:, 1], c=df['time_step'], cmap='plasma', s=20)
    plt.title('t-SNE Projection of Bus Voltages')
    plt.colorbar(label='Time Step')
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, 'tsne_projection.png'))
    plt.close()

## Correlation Heatmap from Flattened Arrays
We stack the per-row `bus_voltages` lists into a 2D array (time steps × buses) and compute the pairwise correlation between bus voltages across time.

In [37]:
# Define function to plot correlation heatmap
def plot_voltage_correlation_heatmap(df, save_dir=('/home/williams/Projects/VET_DL/Results/plots/correlation_heatmap')):
    """
    Plot the bus-to-bus correlation matrix of voltages as a heatmap and save it.

    Parameters:
    - df (pd.DataFrame): Must contain a list-valued 'bus_voltages' column with
      equal-length lists.
    - save_dir (str): Directory for 'bus_voltage_correlation_heatmap.png'
      (created if missing).
    """
    # Ensure the save directory exists so savefig does not fail when this
    # function is called on its own
    os.makedirs(save_dir, exist_ok=True)

    voltage_matrix = np.vstack(df['bus_voltages'].values)  # shape: (timesteps, buses)
    # corrcoef treats rows as variables, hence the transpose.
    # NOTE(review): a bus whose voltage never changes has zero variance, so its
    # row/column would be NaN in `corr` — confirm how that should be displayed.
    corr = np.corrcoef(voltage_matrix.T)  # shape: (buses, buses)

    bus_labels = [f'Bus {i+1}' for i in range(corr.shape[0])]

    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, annot=False, cmap='coolwarm', square=True,
                xticklabels=bus_labels,
                yticklabels=bus_labels)
    plt.title('Correlation Heatmap of Bus Voltages')
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, 'bus_voltage_correlation_heatmap.png'))
    plt.close()

In [39]:
def extended_visual_eda(df, save_dir=('/home/williams/Projects/VET_DL/Results/plots/extended_eda')):
    """
    Run the extended visual EDA plots, writing all of them into one directory.

    Creates save_dir if needed, then runs, in order, the individual bus-voltage
    plot, the PCA/t-SNE projections, and the voltage correlation heatmap,
    passing the same df and save_dir to each.
    """
    os.makedirs(save_dir, exist_ok=True)
    print("📊 Running extended visual EDA...")

    # Plotting steps, executed in this order
    plotters = (
        plot_individual_bus_voltages,
        plot_dimensionality_reduction,
        plot_voltage_correlation_heatmap,
    )
    for plotter in plotters:
        plotter(df, save_dir)

    print(f"✅ Extended EDA plots saved to: {save_dir}")

In [40]:
# Run the extended visual EDA (individual bus voltages, PCA/t-SNE projections,
# and the bus-voltage correlation heatmap) using the default output directory
extended_visual_eda(df)

📊 Running extended visual EDA...
✅ Individual bus voltages plot saved to: /home/williams/Projects/VET_DL/Results/plots/extended_eda
✅ Extended EDA plots saved to: /home/williams/Projects/VET_DL/Results/plots/extended_eda
