In [1]:
import pandas as pd

In [2]:
data_path = "/data3/lsf/Pein/Power-Prediction/data/train_data.csv"
df = pd.read_csv(data_path)

In [3]:
# Convert the time column to datetime and set it as the index
df["time"] = pd.to_datetime(df["time"])
df.set_index("time", inplace=True)

# Ensure the index is uniformly spaced at 15 minutes
df = df.asfreq("15min")

# Fill missing values if any (e.g., using interpolation)
# df["power"].interpolate(method="time", inplace=True)

In [4]:
from statsmodels.tsa.seasonal import STL
import numpy as np
import matplotlib.pyplot as plt

In [5]:
def calculate_strengths(original, trend, seasonal, residual):
    total_var = np.var(original)
    trend_strength = 1 - (np.var(residual) / total_var)
    seasonal_strength = 1 - (np.var(residual + trend) / total_var)
    residual_strength = np.var(residual) / total_var

    return trend_strength, seasonal_strength, residual_strength


def apply_stl_decomposition(series, period, seasonal):
    stl = STL(series, period=period, seasonal=seasonal)
    result = stl.fit()

    # Extract the components
    trend = result.trend
    seasonal = result.seasonal
    residual = result.resid

    # Calculate strengths
    trend_strength, seasonal_strength, residual_strength = calculate_strengths(
        series, trend, seasonal, residual
    )

    return (
        trend_strength,
        seasonal_strength,
        residual_strength,
        trend,
        seasonal,
        residual,
    )


def apply_stl_decomposition_with_plots(series, period, title):
    trend_strength, seasonal_strength, residual_strength, trend, seasonal, residual = (
        apply_stl_decomposition(series, period)
    )

    # Plot the components
    plt.figure(figsize=(12, 8))

    plt.subplot(4, 1, 1)
    plt.plot(series.index, series, label="Original")
    plt.legend(loc="upper left")

    plt.subplot(4, 1, 2)
    plt.plot(trend.index, trend, label="Trend")
    plt.legend(loc="upper left")

    plt.subplot(4, 1, 3)
    plt.plot(seasonal.index, seasonal, label="Seasonal")
    plt.legend(loc="upper left")

    plt.subplot(4, 1, 4)
    plt.plot(residual.index, residual, label="Residual")
    plt.legend(loc="upper left")

    plt.tight_layout()
    plt.suptitle(title, y=1.02)
    plt.show()

    return trend_strength, seasonal_strength, residual_strength


def find_best_periods_and_seasonal(
    series, min_period=4, max_period=2880, top_n=3, seasonal_values=[7, 15, 31]
):
    results = []

    period_values = [4, 8, 16, 32, 64, 96, 192, 672, 2880]

    for period in period_values:
        for seasonal in seasonal_values:
            trend_strength, seasonal_strength, residual_strength, _, _, _ = (
                apply_stl_decomposition(series, period, seasonal)
            )
            results.append(
                (period, seasonal, trend_strength, seasonal_strength, residual_strength)
            )

    # Sort results by residual strength
    results.sort(key=lambda x: x[4])  # Sort by residual_strength
    return results[:top_n]


def analyze_series(series, top_n=3, plot=False):
    # Ensure the series has a uniform frequency and fill missing values
    series = series.resample("15T").mean()
    series.interpolate(method="time", inplace=True)

    # Find the top periods and seasonal values with the lowest residual strength
    top_periods_and_seasonal = find_best_periods_and_seasonal(series, top_n=top_n)

    # Plot the top_n periods and seasonal values
    for (
        period,
        seasonal,
        trend_strength,
        seasonal_strength,
        residual_strength,
    ) in top_periods_and_seasonal:
        print(
            f"Period: {period}, Seasonal: {seasonal}, Trend Strength: {trend_strength:.2f}, Seasonal Strength: {seasonal_strength:.2f}, Residual Strength: {residual_strength:.2f}"
        )
        if plot:
            apply_stl_decomposition_with_plots(
                series, period, seasonal, f"Period: {period}, Seasonal: {seasonal}"
            )

    return top_periods_and_seasonal

In [7]:
# Iterate over all columns except 'lead_hour' and analyze each one
results = {}
for column in df.columns:
    if column != "lead_hour" and column == "power":
        print(f"Analyzing column: {column}")
        specified_series = df[column]
        top_periods_and_seasonal = analyze_series(specified_series, top_n=10)
        results[column] = top_periods_and_seasonal

        # Print the results for each column
        for (
            period,
            seasonal,
            trend_strength,
            seasonal_strength,
            residual_strength,
        ) in top_periods_and_seasonal:
            print(
                f"Column: {column}, Period: {period}, Seasonal: {seasonal}, Trend Strength: {trend_strength:.2f}, Seasonal Strength: {seasonal_strength:.2f}, Residual Strength: {residual_strength:.2f}"
            )

Analyzing column: power


  series = series.resample("15T").mean()


Period: 4, Seasonal: 15, Trend Strength: 1.00, Seasonal Strength: 0.00, Residual Strength: 0.00
Period: 4, Seasonal: 31, Trend Strength: 1.00, Seasonal Strength: 0.00, Residual Strength: 0.00
Period: 4, Seasonal: 7, Trend Strength: 1.00, Seasonal Strength: 0.00, Residual Strength: 0.00
Period: 8, Seasonal: 31, Trend Strength: 0.99, Seasonal Strength: 0.00, Residual Strength: 0.01
Period: 8, Seasonal: 7, Trend Strength: 0.99, Seasonal Strength: 0.00, Residual Strength: 0.01
Period: 8, Seasonal: 15, Trend Strength: 0.99, Seasonal Strength: 0.00, Residual Strength: 0.01
Period: 16, Seasonal: 7, Trend Strength: 0.97, Seasonal Strength: 0.01, Residual Strength: 0.03
Period: 16, Seasonal: 15, Trend Strength: 0.97, Seasonal Strength: 0.00, Residual Strength: 0.03
Period: 16, Seasonal: 31, Trend Strength: 0.97, Seasonal Strength: 0.00, Residual Strength: 0.03
Period: 32, Seasonal: 31, Trend Strength: 0.94, Seasonal Strength: 0.01, Residual Strength: 0.06
Column: power, Period: 4, Seasonal: 15,