In [7]:
import pandas as pd
import numpy as np
import itertools
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import warnings
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from rich.console import Console
from rich.progress import Progress

# Suppress warnings
warnings.filterwarnings("ignore")
console = Console()


In [8]:
def load_data(filepath):
    """
    Load the dataset from a CSV file.
    """
    df = pd.read_csv(filepath)
    df["Period"] = pd.to_datetime(df["Period"], format="%m/%d/%Y")
    return df

In [9]:
def preprocess_data(df, company):
    """
    Preprocesses data for a given company.
    - Filters the dataset for the selected company.
    - Sorts values by the period.
    - Converts the Revenue column to numeric format to handle any non-numeric values.
      This ensures that calculations such as forecasting and error measurements work correctly.
    """
    company_df = df[df["Company"] == company].copy()
    company_df = company_df.sort_values(by="Period")
    company_df.set_index("Period", inplace=True)
    company_df["Revenue"] = pd.to_numeric(company_df["Revenue"], errors="coerce")
    return company_df["Revenue"].dropna()

In [10]:
def train_sarima(train, test, best_cfg):
    """
    Train the best SARIMA model and evaluate its performance.

    Args:
        train (pd.Series): Training data.
        test (pd.Series): Test data.
        best_cfg (tuple): Best SARIMA hyperparameters.

    Returns:
        dict: Model performance metrics.
    """
    best_p, best_d, best_q, best_P, best_D, best_Q = best_cfg
    S = 4  # Quarterly seasonality

    # Train SARIMA model with best configuration
    optimized_model = SARIMAX(
        train, order=(best_p, best_d, best_q),
        seasonal_order=(best_P, best_D, best_Q, S),
        enforce_stationarity=False, enforce_invertibility=False
    )
    optimized_model_fit = optimized_model.fit(disp=False)

    # Make predictions
    optimized_predictions = optimized_model_fit.forecast(steps=len(test))

    # Compute performance metrics
    optimized_mae = mean_absolute_error(test, optimized_predictions)
    optimized_rmse = np.sqrt(mean_squared_error(test, optimized_predictions))
    optimized_mape = mean_absolute_percentage_error(test, optimized_predictions)
    optimized_r2 = r2_score(test, optimized_predictions)

    # Compute MASE (Mean Absolute Scaled Error)
    naive_forecast = test.shift(1).dropna()
    test_aligned = test.loc[naive_forecast.index]  # Align lengths
    mase_denominator = mean_absolute_error(test_aligned, naive_forecast)
    optimized_mase = optimized_mae / mase_denominator if mase_denominator != 0 else np.nan

    return {
        "Best SARIMA Order": best_cfg,
        "MAE": optimized_mae,
        "RMSE": optimized_rmse,
        "MAPE": optimized_mape,
        "MASE": optimized_mase,
        "R² Score": optimized_r2
    }

def process_company(company, results):
    """
    Processes a single company's data:
    - Preprocess the data.
    - Find the best SARIMA model.
    - Train and evaluate the model.
    - Store the results.
    """
    console.print(f"🚀 [bold blue]Processing {company}...[/bold blue]")
    series = preprocess_data(df, company)

    if len(series) < 10:
        console.print(f"⚠️ [yellow]Skipping {company} due to insufficient data points.[/yellow]")
        return results

    # Split data into train/test
    train_size = int(len(series) * 0.8)
    train, test = series[:train_size], series[train_size:]

    # Find best SARIMA order
    best_cfg = find_best_sarima(train)
    console.print(f"📊 [bold green]Best SARIMA order for {company}: {best_cfg}[/bold green]")

    # Train and evaluate SARIMA model
    results[company] = train_sarima(train, test, best_cfg)

    return results


def find_best_sarima(train):
    """
    Find the best SARIMA model using grid search.
    
    Parameter Ranges:
    - `p_values` (Auto-Regressive terms): Typically, 0-2 is sufficient to capture most patterns.
    - `d_values` (Differencing terms): 0-1 is generally enough for stationarity.
    - `q_values` (Moving Average terms): 0-2 captures most real-world data behavior.
    - `P_values` (Seasonal AR terms): Usually, 0-1 suffices for seasonal effects.
    - `D_values` (Seasonal Differencing): Typically, 0-1 is sufficient for removing seasonal trends.
    - `Q_values` (Seasonal MA terms): 0-1 captures most seasonal variations.
    - `S`: Set to 4, assuming quarterly seasonality (adjust for different datasets).
    """
    p_values = range(0, 3)
    d_values = range(0, 2)
    q_values = range(0, 3)
    P_values = range(0, 2)
    D_values = range(0, 2)
    Q_values = range(0, 2)
    S = 4  # Quarterly seasonality

    best_score, best_cfg = float("inf"), None
    total_combinations = len(p_values) * len(d_values) * len(q_values) * len(P_values) * len(D_values) * len(Q_values)

    console.print(f"🔍 [bold cyan]Searching best SARIMA model... ({total_combinations} combinations)[/bold cyan]")

    with Progress() as progress:
        task = progress.add_task("[green]Optimizing SARIMA...", total=total_combinations)

        for p, d, q, P, D, Q in itertools.product(p_values, d_values, q_values, P_values, D_values, Q_values):
            if q == Q:
                continue
            try:
                model = SARIMAX(train, order=(p, d, q), seasonal_order=(P, D, Q, S),
                                enforce_stationarity=False, enforce_invertibility=False)
                model_fit = model.fit(disp=False)
                predictions = model_fit.forecast(steps=len(train))
                error = mean_absolute_error(train, predictions)
                if error < best_score:
                    best_score, best_cfg = error, (p, d, q, P, D, Q)
            except:
                continue
            progress.update(task, advance=1)

    console.print(f"✅ [bold green]Best SARIMA order found:[/bold green] {best_cfg} with MAE = {best_score:.2f}")
    return best_cfg

In [11]:
def main(filepath):
    """
    Main function to run the SARIMA model pipeline.
    
    Steps:
    1. Load the dataset.
    2. Iterate through each unique company in the dataset.
    3. Preprocess the data for each company.
    4. Optimize the SARIMA parameters using grid search.
    5. Validate the model using Walk-Forward Validation and Time-Series Cross-Validation.
    6. Store the results and display them as a dataframe.
    """
    global df
    df = load_data(filepath)
    results = {}
    for company in df["Company"].unique():
        results = process_company(company, results)
    results_df = pd.DataFrame.from_dict(results, orient='index')
    print(results_df)


In [12]:
# Run the script
main("../data/Top_12_German_Companies_Financial_Data.csv")


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

                      Best SARIMA Order           MAE          RMSE      MAPE  \
Volkswagen AG        (0, 0, 2, 0, 1, 0)  2.965775e+09  3.591599e+09  0.267049   
Siemens AG           (0, 1, 0, 0, 0, 1)  3.384833e+09  3.781294e+09  0.356424   
Allianz SE           (0, 1, 2, 0, 0, 1)  4.659838e+09  5.217301e+09  0.443738   
BMW AG               (2, 1, 1, 1, 0, 0)  3.908350e+09  4.430682e+09  0.398364   
BASF SE              (0, 1, 2, 1, 0, 0)  3.206611e+09  4.004590e+09  0.225905   
Deutsche Telekom AG  (2, 1, 1, 1, 0, 0)  4.798453e+09  5.774398e+09  0.618780   
Daimler AG           (2, 0, 2, 0, 1, 1)  4.064572e+09  4.735020e+09  0.505780   
SAP SE               (0, 1, 2, 1, 0, 0)  2.461732e+09  2.998258e+09  0.214557   
Bayer AG             (1, 1, 1, 0, 0, 0)  4.306329e+09  4.629554e+09  0.535126   
Deutsche Bank AG     (2, 0, 2, 1, 1, 1)  5.508653e+09  6.387944e+09  0.389402   
Porsche AG           (0, 1, 2, 1, 0, 1)  5.326442e+09  5.831061e+09  0.345923   
Merck KGaA           (1, 0, 