In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller

def load_data(file_path):
    """Read the CSV at ``file_path`` into a DataFrame.

    The ``datetime`` column is handed to pandas' date parser while reading.
    """
    return pd.read_csv(file_path, parse_dates=['datetime'])

def compute_statistics(df):
    """Compute key statistical metrics for numerical features.

    Missing values are dropped once per column so that every metric is
    computed on the same set of observations. (Previously the median was
    taken with ``np.median`` on the raw column and became NaN as soon as a
    single value was missing, while the other metrics ignored NaN.)

    Args:
        df: DataFrame whose numeric columns are summarised; non-numeric
            columns are skipped.

    Returns:
        DataFrame with one column per numeric input column and the rows
        ``Mean``, ``Median``, ``Standard Deviation`` (sample, ``ddof=1``),
        ``Skewness`` and ``Kurtosis`` (scipy defaults: biased estimators,
        Fisher/excess kurtosis).
    """
    stats_summary = {}

    for column in df.select_dtypes(include=[np.number]).columns:
        observed = df[column].dropna()
        # scipy works on a plain float array; NaN is already removed, so no
        # nan_policy is needed.
        values = observed.to_numpy(dtype=float)
        stats_summary[column] = {
            'Mean': observed.mean(),
            'Median': observed.median(),
            'Standard Deviation': observed.std(ddof=1),
            'Skewness': float(stats.skew(values)),
            'Kurtosis': float(stats.kurtosis(values)),
        }

    return pd.DataFrame(stats_summary)

def plot_time_series(df):
    """Draw the ``value`` column against ``datetime`` as a single line chart."""
    _, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(
        data=df,
        x='datetime',
        y='value',
        label="Electricity Demand",
        color="blue",
        ax=ax,
    )
    ax.set_xlabel("Time")
    ax.set_ylabel("Electricity Demand (MWh)")
    ax.set_title("Electricity Demand Over Time")
    ax.legend()
    ax.grid(True)
    plt.show()

def univariate_analysis(df, column):
    """Show distribution plots and summary numbers for one column.

    Prints a notice and returns without plotting when ``column`` is not in
    ``df``. Otherwise draws a histogram (with KDE overlay), a boxplot and a
    density plot side by side, then prints ``describe()`` output followed by
    the column's skewness and kurtosis.
    """
    if column not in df.columns:
        print(f"Column '{column}' not found in dataset!")
        return

    series = df[column]

    sns.set_style("whitegrid")
    _, (hist_ax, box_ax, kde_ax) = plt.subplots(1, 3, figsize=(18, 5))

    # Left panel: binned counts with a smoothed density overlay.
    sns.histplot(series, bins=30, kde=True, ax=hist_ax, color='blue')
    hist_ax.set_title(f'Histogram of {column}')

    # Middle panel: boxplot of the same values.
    sns.boxplot(y=series, ax=box_ax, color='green')
    box_ax.set_title(f'Boxplot of {column}')

    # Right panel: filled kernel density estimate.
    sns.kdeplot(series, fill=True, color='red', ax=kde_ax)
    kde_ax.set_title(f'Density Plot of {column}')

    plt.tight_layout()
    plt.show()

    print(series.describe())
    print(f"\nSkewness: {series.skew():.4f}")
    print(f"Kurtosis: {series.kurt():.4f}")

def correlation_analysis(df):
    """Render a heatmap of the pairwise correlations of the numeric columns."""
    correlations = df.select_dtypes(include=['number']).corr()

    heatmap_options = {
        'annot': True,
        'cmap': 'coolwarm',
        'fmt': ".2f",
        'linewidths': 0.5,
    }

    plt.figure(figsize=(10, 6))
    sns.heatmap(correlations, **heatmap_options)
    plt.title("Correlation Matrix Heatmap")
    plt.show()

def time_series_analysis(df, column='value', period=24):
    """Performs time series decomposition and stationarity test.

    The selected column is averaged into hourly bins (empty bins are
    forward-filled), decomposed additively, plotted in four stacked panels,
    and then checked with an Augmented Dickey-Fuller test whose results are
    printed.

    Args:
        df: DataFrame with a ``datetime`` column and the column to analyse.
        column: Name of the series to analyse. Defaults to ``'value'``.
        period: Seasonal period, in hourly samples, passed to
            ``seasonal_decompose``. Defaults to 24.
    """
    # Resample only the analysed column: averaging the whole frame fails on
    # non-numeric columns and does work that is never used. The lowercase
    # 'h' alias replaces 'H', which is deprecated since pandas 2.2.
    series = df.set_index('datetime')[column].resample('h').mean().ffill()

    decomposition = seasonal_decompose(series, model='additive', period=period)

    # (data, plot keyword arguments) for each stacked panel, top to bottom.
    panels = [
        (series, {'label': 'Original Time Series'}),
        (decomposition.trend, {'label': 'Trend', 'color': 'green'}),
        (decomposition.seasonal, {'label': 'Seasonality', 'color': 'orange'}),
        (decomposition.resid, {'label': 'Residuals', 'color': 'red'}),
    ]

    plt.figure(figsize=(10, 8))
    for position, (data, plot_kwargs) in enumerate(panels, start=1):
        plt.subplot(len(panels), 1, position)
        plt.plot(data, **plot_kwargs)
        plt.legend()
    plt.tight_layout()
    plt.show()

    print("\nPerforming Augmented Dickey-Fuller Test...")
    adf_test = adfuller(series.dropna())
    print(f"ADF Statistic: {adf_test[0]:.4f}")
    print(f"p-value: {adf_test[1]:.4f}")
    for key, value in adf_test[4].items():
        print(f"Critical Value ({key}): {value:.4f}")
    # A p-value below 0.05 is reported as stationary.
    if adf_test[1] < 0.05:
        print("\nConclusion: The time series is stationary.")
    else:
        print("\nConclusion: The time series is NOT stationary.")

def main(file_path='cleaned_data.csv'):
    """Run the full exploratory analysis on one dataset.

    Loads the CSV, prints the statistical summary, plots the demand series,
    runs the univariate analysis for a fixed list of columns, then the
    correlation heatmap and the time series analysis.

    Args:
        file_path: Path of the CSV to analyse. Defaults to
            ``'cleaned_data.csv'``, the file this script was written for.
    """
    df = load_data(file_path)

    print("Statistical Summary:")
    print(compute_statistics(df))

    plot_time_series(df)

    # Columns missing from the dataset are reported by univariate_analysis
    # and skipped, so this list does not need to match the file exactly.
    numerical_columns = ['value', 'temperature_2m', 'hour', 'day', 'month', 'year', 'weekday', 'is_weekend']
    for col in numerical_columns:
        print(f"\n{'='*40}\nUnivariate Analysis: {col}\n{'='*40}")
        univariate_analysis(df, col)

    correlation_analysis(df)

    time_series_analysis(df)

if __name__ == "__main__":
    main()

Statistical Summary:
                          value  temperature_2m       hour        day  \
Mean                1556.937606       11.346059  11.502223  15.721857   
Median              1120.000000       10.808500  12.000000  16.000000   
Standard Deviation  1455.568629        8.095719   6.921747   8.807689   
Skewness               2.404376        0.179754  -0.000464   0.006666   
Kurtosis               5.927457       -0.607164  -1.203871  -1.194763   

                       month         year   weekday  is_weekend  \
Mean                6.517648  2023.002887  2.998708    0.286121   
Median              7.000000  2023.000000  3.000000    0.000000   
Standard Deviation  3.450933     0.818553  2.001674    0.451948   
Skewness           -0.008226     0.005073  0.001633    0.946477   
Kurtosis           -1.208967    -1.482262 -1.252435   -1.104181   

                    season_Winter  season_Spring  ...  subba_Millwood  \
Mean                     0.247825       0.251662  ...        0.0