# Polynomial forecasting baseline
A simple baseline that forecasts every numeric feature for the next three years using polynomial regression on the `year` column. The notebook expects a table shaped like `data/df_pivot_filtered_numbers.csv`, in which one column contains the year (`rok` or `year`) and the remaining columns are the features to forecast. Note that the loading cell below currently reads this table from the Excel workbook `data/DATA_PKD_SPECIFIC.xlsx` (sheet `Arkusz1`).

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Render floats with three decimal places whenever pandas displays them.
pd.set_option("display.float_format", lambda v: f"{v:0.3f}")

In [None]:
# Load the dataset from Excel (DATA_PKD_SPECIFIC.xlsx)
data_path = Path("data/DATA_PKD_SPECIFIC.xlsx")
df_raw = pd.read_excel(data_path, sheet_name="Arkusz1")

# Locate the year column. Resolution order:
#   1. a column named exactly 'year' or 'rok';
#   2. the same names ignoring case and surrounding whitespace (e.g. 'Rok ');
#   3. the first column, if it holds numeric data.
possible_year_cols = ["year", "rok"]
year_col = next((c for c in possible_year_cols if c in df_raw.columns), None)

if year_col is None:
    year_col = next(
        (c for c in df_raw.columns if str(c).strip().lower() in possible_year_cols),
        None,
    )

# Guard on the column count so an empty sheet reaches the ValueError below
# instead of failing with an IndexError on columns[0].
if year_col is None and len(df_raw.columns) > 0:
    first_col = df_raw.columns[0]
    if pd.api.types.is_numeric_dtype(df_raw[first_col]):
        year_col = first_col

if year_col is None:
    raise ValueError("Year column not found; expected a 'year'/'rok' column or an unnamed first column with years.")

# Standardize the column name and coerce it to numbers; values that cannot be
# parsed become NaN.
df = (
    df_raw.rename(columns={year_col: "year"})
    .assign(year=lambda d: pd.to_numeric(d["year"], errors="coerce"))
)

# Rows without a usable year cannot be placed on the time axis, and a NaN year
# would break the regressions fitted on this column later, so drop them here
# and report how many were removed.
n_missing_year = int(df["year"].isna().sum())
if n_missing_year:
    print(f"Dropping {n_missing_year} row(s) whose year could not be parsed as a number.")

df = df.dropna(subset=["year"]).sort_values("year").reset_index(drop=True)

print(f"Loaded {df.shape[0]} rows and {df.shape[1] - 1} features from '{data_path.name}'; year column = '{year_col}'.")
df.head()


In [None]:
# Basic cleaning: fill missing values using a column-wise linear trend over years
# NOTE(review): the trend is fitted on every available year, including the
# years that are later held out for testing, so filled test values are not
# fully independent of the training data.
features = df.set_index("year").sort_index().apply(pd.to_numeric, errors="coerce")
years = features.index.to_numpy().reshape(-1, 1)

filled = features.copy()
for col in features.columns:
    series = features[col]
    if not series.isna().any():
        continue
    # Plain NumPy boolean mask: positional, so it indexes `years`, the raw
    # values and `.loc` rows without any label alignment.
    mask = series.notna().to_numpy()
    # With fewer than two observed points a trend line is undefined; fall back
    # to forward/backward fill. `.ffill()` / `.bfill()` replace the deprecated
    # `fillna(method=...)` form.
    if mask.sum() < 2:
        filled[col] = series.ffill().bfill()
        continue
    # Fit value ~ year on the observed points and predict only the gaps.
    model = LinearRegression()
    model.fit(years[mask], series.to_numpy()[mask])
    filled.loc[~mask, col] = model.predict(years[~mask])

df_clean = filled.reset_index()
print(f"After linear-trend filling: {df_clean.shape[0]} rows, {df_clean.shape[1] - 1} features. Remaining NaN: {df_clean.isna().sum().sum()}")
df_clean.head()


In [None]:
# Hold-out split by calendar year: fit on 2005-2021, evaluate on 2022-2024
train_years = (2005, 2021)
test_years = (2022, 2024)

# Build one boolean row mask per period (Series.between includes both ends
# by default).
year_values = df_clean["year"]
in_train = year_values.between(train_years[0], train_years[1])
in_test = year_values.between(test_years[0], test_years[1])

train = df_clean[in_train]
test = df_clean[in_test]

# The year is the only predictor; every other column is a target.
X_train = train[["year"]]
y_train = train.drop(columns=["year"])
X_test = test[["year"]]
y_test = test.drop(columns=["year"])

print(f"Train: {X_train.shape[0]} rows, Test: {X_test.shape[0]} rows, Targets: {y_train.shape[1]} features.")

In [None]:
# Fit a polynomial regression (multi-output)
from sklearn.preprocessing import StandardScaler

degree = 3  # adjust if you want more/less curve flexibility

# Raw calendar years (~2000) raised to the 2nd/3rd power are huge and almost
# perfectly collinear, which makes the least-squares problem badly conditioned.
# Standardizing the year first keeps the powers on a comparable scale. Because
# scaling is an affine transform of the year, the pipeline still represents
# exactly the same family of degree-`degree` polynomials in the year.
poly_reg = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=degree, include_bias=False),
    LinearRegression()
)

# One call fits all target columns at once (LinearRegression is multi-output).
poly_reg.fit(X_train, y_train)
print("Model fitted.")

In [None]:
# Evaluate on the hold-out years (2022-2024)
test_pred = pd.DataFrame(
    poly_reg.predict(X_test),
    columns=y_train.columns,
    index=test["year"],
)

# `test_pred` is indexed by year, while `y_test` still carries the row labels
# of `df_clean`. pandas aligns on index labels when subtracting, so without a
# shared index the difference would be all-NaN and every MAE would be NaN.
# Re-label the ground truth by year (row order is identical: both come from
# `test`).
y_true = y_test.copy()
y_true.index = test_pred.index

mae_per_feature = (test_pred - y_true).abs().mean()
overall_mae = mae_per_feature.mean()
# Variance-weighted R^2: per-feature scores are averaged with weights equal to
# each target's variance.
overall_r2 = r2_score(y_true, test_pred, multioutput="variance_weighted")

print(f"Overall MAE across features: {overall_mae:0.4f}")
print(f"Variance-weighted R^2: {overall_r2:0.4f}")
mae_per_feature.sort_values().head()

In [None]:
# Project every feature `horizon` years past the most recent year in the data
# NOTE(review): `poly_reg` was fitted on the training years only, so these
# forecasts do not use the hold-out years -- confirm whether a refit on all
# data is intended before relying on them.
horizon = 3
last_year = int(df_clean["year"].max())

# Consecutive years last_year + 1 ... last_year + horizon.
future_years = last_year + np.arange(1, horizon + 1)
future_X = pd.DataFrame({"year": future_years})

# Predict first, then wrap the raw array with feature names and year labels.
future_values = poly_reg.predict(future_X)
future_pred = pd.DataFrame(
    future_values,
    index=future_years,
    columns=y_train.columns,
)

print("Forecast for next years:")
future_pred.head()

In [None]:
# Persist the forecast table as CSV so later steps can pick it up
output_path = Path("data") / "polynomial_forecast_next3.csv"

# Naming the index "year" makes to_csv write that name as the header of the
# index column; rename_axis returns a copy, so `future_pred` is left untouched.
future_pred.rename_axis("year").to_csv(output_path)
print(f"Saved: {output_path.resolve()}")