# Polynomial forecasting baseline
Simple baseline to forecast every numeric feature for the next three years using a polynomial regression on the `year` column. The notebook expects a table shaped like `data/df_pivot_filtered_numbers.csv` where one column contains the year (`rok` or `year`) and the remaining columns are the features to forecast.

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

pd.set_option("display.float_format", lambda v: f"{v:0.3f}")

In [2]:
# Load the pivoted dataset (replace the path when the final dataset is ready)
data_path = Path("data/df_pivot_filtered_numbers.csv")
df_raw = pd.read_csv(data_path)

# Normalize the year column name
year_col = "year" if "year" in df_raw.columns else "rok"
if year_col not in df_raw.columns:
    raise ValueError("Year column not found; expected 'rok' or 'year'.")

df = df_raw.rename(columns={year_col: "year"}).sort_values("year").reset_index(drop=True)
print(f"Loaded {df.shape[0]} rows and {df.shape[1] - 1} features; year column = '{year_col}'.")
df.head()

Loaded 20 rows and 1110 features; year column = 'rok'.


Unnamed: 0,year,10.1_C,10.1_CF,10.1_DEPR,10.1_EN,10.1_GS,10.1_GS_I,10.1_INV,10.1_IO,10.1_IP,...,62.0_INV_to_STL,62.0_NWC_to_STL,62.0_Current_ratio_proxy,62.0_Quick_ratio_proxy,62.0_Intensywność inwestycyjna,62.0_CF_to_TC,62.0_OFE_to_GS,62.0_OFE_to_NP,62.0_PPO_to_GS_I,62.0_Udział firm nierentownych
0,2005,547.95,1554.29,668.35,929,32922.34,32460.46,1268.07,1138.03,183.26,...,0.087,0.545,1.685,1.598,0.025,0.143,0.012,0.162,,0.186
1,2006,710.61,1620.28,737.54,924,34313.98,33925.78,1239.35,1275.99,181.88,...,0.104,0.627,1.753,1.649,0.028,0.14,0.019,0.278,0.013,0.173
2,2007,643.22,1865.55,802.25,938,38517.27,38022.65,1480.76,1426.84,206.93,...,0.109,0.773,1.95,1.84,0.031,0.131,0.009,0.128,0.015,0.187
3,2008,667.13,1777.05,833.99,882,40249.57,39496.03,1614.03,1254.46,281.76,...,0.091,0.56,1.775,1.684,0.026,0.154,0.018,0.21,0.014,0.23
4,2009,786.08,2069.04,825.08,896,43931.09,43247.43,1595.45,874.28,232.9,...,0.101,0.636,1.883,1.782,0.022,0.148,0.009,0.102,0.017,0.215


In [3]:
# Basic cleaning: interpolate missing values per feature along the time axis
features = (
    df.set_index("year")
    .sort_index()
    .pipe(lambda d: d.interpolate(method="linear", limit_direction="both"))
)
if features.isna().any().any():
    features = features.fillna(method="ffill").fillna(method="bfill")

df_clean = features.reset_index()
print(f"After interpolation: {df_clean.shape[0]} rows, {df_clean.shape[1] - 1} features. Remaining NaN: {df_clean.isna().sum().sum()}")
df_clean.head()

After interpolation: 20 rows, 1110 features. Remaining NaN: 0


Unnamed: 0,year,10.1_C,10.1_CF,10.1_DEPR,10.1_EN,10.1_GS,10.1_GS_I,10.1_INV,10.1_IO,10.1_IP,...,62.0_INV_to_STL,62.0_NWC_to_STL,62.0_Current_ratio_proxy,62.0_Quick_ratio_proxy,62.0_Intensywność inwestycyjna,62.0_CF_to_TC,62.0_OFE_to_GS,62.0_OFE_to_NP,62.0_PPO_to_GS_I,62.0_Udział firm nierentownych
0,2005,547.95,1554.29,668.35,929,32922.34,32460.46,1268.07,1138.03,183.26,...,0.087,0.545,1.685,1.598,0.025,0.143,0.012,0.162,0.013,0.186
1,2006,710.61,1620.28,737.54,924,34313.98,33925.78,1239.35,1275.99,181.88,...,0.104,0.627,1.753,1.649,0.028,0.14,0.019,0.278,0.013,0.173
2,2007,643.22,1865.55,802.25,938,38517.27,38022.65,1480.76,1426.84,206.93,...,0.109,0.773,1.95,1.84,0.031,0.131,0.009,0.128,0.015,0.187
3,2008,667.13,1777.05,833.99,882,40249.57,39496.03,1614.03,1254.46,281.76,...,0.091,0.56,1.775,1.684,0.026,0.154,0.018,0.21,0.014,0.23
4,2009,786.08,2069.04,825.08,896,43931.09,43247.43,1595.45,874.28,232.9,...,0.101,0.636,1.883,1.782,0.022,0.148,0.009,0.102,0.017,0.215


In [4]:
# Train / test split: train on 2005-2021, test on 2022-2024
train_years = (2005, 2021)
test_years = (2022, 2024)

train = df_clean[df_clean["year"].between(*train_years)]
test = df_clean[df_clean["year"].between(*test_years)]

X_train, y_train = train[["year"]], train.drop(columns=["year"])
X_test, y_test = test[["year"]], test.drop(columns=["year"])

print(f"Train: {X_train.shape[0]} rows, Test: {X_test.shape[0]} rows, Targets: {y_train.shape[1]} features.")

Train: 17 rows, Test: 3 rows, Targets: 1110 features.


In [5]:
# Fit a polynomial regression (multi-output)
degree = 3  # adjust if you want more/less curve flexibility
poly_reg = make_pipeline(
    PolynomialFeatures(degree=degree, include_bias=False),
    LinearRegression()
)

poly_reg.fit(X_train, y_train)
print("Model fitted.")

Model fitted.


In [6]:
# Evaluate on the hold-out years (2022-2024)
test_pred = pd.DataFrame(
    poly_reg.predict(X_test),
    columns=y_train.columns,
    index=test["year"],
)

mae_per_feature = (test_pred - y_test).abs().mean()
overall_mae = mae_per_feature.mean()
overall_r2 = r2_score(y_test, test_pred, multioutput="variance_weighted")

print(f"Overall MAE across features: {overall_mae:0.4f}")
print(f"Variance-weighted R^2: {overall_r2:0.4f}")
mae_per_feature.sort_values().head()

Overall MAE across features: nan
Variance-weighted R^2: -1908.5933


10.1_C      NaN
10.1_CF     NaN
10.1_DEPR   NaN
10.1_EN     NaN
10.1_GS     NaN
dtype: float64

In [7]:
# Forecast the next 3 years beyond the latest available year
horizon = 3
last_year = int(df_clean["year"].max())
future_years = np.arange(last_year + 1, last_year + horizon + 1)
future_X = pd.DataFrame({"year": future_years})

future_pred = pd.DataFrame(
    poly_reg.predict(future_X),
    columns=y_train.columns,
    index=future_years,
)

print("Forecast for next years:")
future_pred.head()

Forecast for next years:


Unnamed: 0,10.1_C,10.1_CF,10.1_DEPR,10.1_EN,10.1_GS,10.1_GS_I,10.1_INV,10.1_IO,10.1_IP,10.1_LTC,...,62.0_INV_to_STL,62.0_NWC_to_STL,62.0_Current_ratio_proxy,62.0_Quick_ratio_proxy,62.0_Intensywność inwestycyjna,62.0_CF_to_TC,62.0_OFE_to_GS,62.0_OFE_to_NP,62.0_PPO_to_GS_I,62.0_Udział firm nierentownych
2025,1745.921,6487.306,2068.707,366.875,78801.725,76458.118,3275.951,1586.387,205.242,2573.202,...,0.126,1.039,1.998,1.872,0.02,0.199,0.005,-0.093,0.012,0.164
2026,1641.867,7045.558,2258.874,311.128,77600.685,74908.448,3099.965,1395.245,204.169,2072.981,...,0.149,1.056,1.95,1.801,0.022,0.227,0.005,-0.156,0.012,0.159
2027,1494.455,7652.004,2470.263,252.76,75752.877,72662.987,2864.468,1148.763,205.012,1431.941,...,0.178,1.066,1.89,1.712,0.024,0.26,0.005,-0.232,0.012,0.155


In [8]:
# Save forecasts for downstream use
output_path = Path("data/polynomial_forecast_next3.csv")
future_pred.to_csv(output_path, index_label="year")
print(f"Saved: {output_path.resolve()}")

Saved: C:\Users\wojte\Desktop\Hacknation2025\data\polynomial_forecast_next3.csv
