In [27]:
import os
import getpass
import numpy as np
import pandas as pd
from time import time
from functools import reduce
import matplotlib.pyplot as plt

# Sklearn
from sklearn.impute import SimpleImputer

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Own lasso Liberay


# PCR - Principal component regression

## Data processing


In [31]:
# Gå et niveau op og naviger til mappen "Data"
os.chdir('../Data')

### 1. Load data

In [34]:
# Import features
file_path = 'Merged_dataset.csv'
df = pd.read_csv("merged_dataset.csv", sep=";", parse_dates=["Date"])
df.set_index("Date", inplace=True)
df.shape

(156, 184)

In [36]:
y = df.iloc[:, 0]   # første kolonne = inflation
X = df.iloc[:, 1:]  # resten = forklarende variable

# Tjek dimensioner
print("y shape:", y.shape)
print("X shape:", X.shape)

y shape: (156,)
X shape: (156, 183)


### 2. Imputer manglende værdier

In [39]:
imputer = SimpleImputer(strategy="median")  # Eller "median"

X_imputed = imputer.fit_transform(X)  # Imputer NaN i hele datasættet

# Konverter tilbage til DataFrame med kolonnenavne
X = pd.DataFrame(X_imputed, columns=X.columns, index = df.index)

### 3. Standardiser data 

In [11]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Simple PCR

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [77]:
# PCA uden at specificere antal komponenter
pca = PCA().fit(X_train)

# Kumulativ forklaring
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Vælg hvor mange komponenter der skal til for fx 95% forklaring
n_component = np.argmax(cumulative_variance >= 0.95) + 1

print(f"Valgte {n_component} komponenter for at forklare mindst 95% af variansen")


Valgte 16 komponenter for at forklare mindst 95% af variansen


In [79]:
# 1. Definér antal hovedkomponenter (du kan vælge fx 10 eller bruge forklaring)
n_components = n_component

# 2. Lav PCR pipeline
pcr_model = Pipeline([
    ("pca", PCA(n_components=n_components)),
    ("reg", LinearRegression())
])

# 3. Træn modellen
pcr_model.fit(X_train, y_train)

In [81]:
# Forudsig
y_pred = pcr_model.predict(X_test)

# Evaluer
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R²: {r2:.2f}")

MSE: 25667226544477196.00
R²: -7734868040725910.00




In [168]:
# 1. Træk ud delene fra pcr_model
pca = pcr_model.named_steps["pca"]
reg = pcr_model.named_steps["reg"]

# 2. Beregn "vigtighed" af hver original feature
# → Multiplicer PCA-komponentvægtninger (loadings) med regressionens koefficienter
# Dette transformerer koefficienter tilbage til original feature space

# shape: (n_features,)
pseudo_importance = np.dot(pca.components_.T, reg.coef_)

# 3. Pak det i en DataFrame
importance_df = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": np.abs(pseudo_importance)  # evt. brug abs for styrke
}).sort_values(by="Importance", ascending=False)

# 4. Vis top 10
importance_df.head(10)


AttributeError: 'numpy.ndarray' object has no attribute 'columns'

# Simple Forecast

In [71]:
# Parametre
forecast_horizons = 12
n_components = 10  # <- hvor mange PCA-komponenter du vil bruge

pcr_models = {}

# Træn én model per horisont
for h in range(forecast_horizons):
    print(f"\n=== Horizon {h+1} ===")

    y_shifted = y.shift(-h).dropna()
    X_train = X_scaled[:len(y_shifted)]  # Brug skaleret X
    y_train = y_shifted

    # Byg PCR pipeline
    model = Pipeline([
        ("pca", PCA(n_components=n_components)),
        ("reg", LinearRegression())
    ])

    model.fit(X_train, y_train)
    pcr_models[h] = model

    print(f"Antal træningsobservationer: {len(y_train)}")
    print(f"PCA-komponenter brugt: {n_components}")

# Forudsig fra sidste observation
latest_data_df = X.iloc[[-1]]
latest_data_scaled = scaler.transform(latest_data_df)

pcr_forecasts = {}

for h in range(forecast_horizons):
    forecast = pcr_models[h].predict(latest_data_scaled)
    pcr_forecasts[h] = forecast[0]


=== Horizon 1 ===
Antal træningsobservationer: 156
PCA-komponenter brugt: 10

=== Horizon 2 ===
Antal træningsobservationer: 155
PCA-komponenter brugt: 10

=== Horizon 3 ===
Antal træningsobservationer: 154
PCA-komponenter brugt: 10

=== Horizon 4 ===
Antal træningsobservationer: 153
PCA-komponenter brugt: 10

=== Horizon 5 ===
Antal træningsobservationer: 152
PCA-komponenter brugt: 10

=== Horizon 6 ===
Antal træningsobservationer: 151
PCA-komponenter brugt: 10

=== Horizon 7 ===
Antal træningsobservationer: 150
PCA-komponenter brugt: 10

=== Horizon 8 ===
Antal træningsobservationer: 149
PCA-komponenter brugt: 10

=== Horizon 9 ===
Antal træningsobservationer: 148
PCA-komponenter brugt: 10

=== Horizon 10 ===
Antal træningsobservationer: 147
PCA-komponenter brugt: 10

=== Horizon 11 ===
Antal træningsobservationer: 146
PCA-komponenter brugt: 10

=== Horizon 12 ===
Antal træningsobservationer: 145
PCA-komponenter brugt: 10


In [73]:
pcr_forecasts

{0: 1.5535031951025622,
 1: 1.3920233485029356,
 2: 1.3061506173188564,
 3: 1.0662776900805242,
 4: 0.8363522177333527,
 5: 0.65610017293495,
 6: 0.5807405915986608,
 7: 0.6245245022053376,
 8: 0.6992221757379105,
 9: 0.8414234531595552,
 10: 0.9170721009413529,
 11: 1.1764592920092498}

# Forsøg 2

In [150]:
def PCR_Forecaster(X, y, forecast_horizon, last_observation_date, scaler, variance_threshold=0.95, verbose=True):
    """
    Forecast inflation using Principal Component Regression (PCR) per forecast horizon.
    
    Args:
        X: DataFrame of predictors
        y: Series of target
        forecast_horizon: int, number of months ahead
        last_observation_date: str or Timestamp, point to forecast from
        scaler: fitted StandardScaler on training data
        variance_threshold: float, % of explained variance to retain (default: 0.95)
        verbose: print training info
    """

    # Trim data til real-time
    X = X.loc[:last_observation_date]
    y = y.loc[:last_observation_date]

    X_scaled = scaler.transform(X)
    
    pcr_models = {}

    for h in range(forecast_horizon):
        if verbose:
            print(f"\n=== Horisont h={h} ===")

        y_shifted = y.shift(-h).dropna()
        X_train = X_scaled[:len(y_shifted)]
        y_train = y_shifted

        # Automatisk valg af antal komponenter
        pca = PCA().fit(X_train)
        cumvar = np.cumsum(pca.explained_variance_ratio_)
        n_components = np.argmax(cumvar >= variance_threshold) + 1

        if verbose:
            print(f"Forklaret varians (k={n_components}): {cumvar[n_components-1]:.2%}")
            
        # PCR pipeline
        model = Pipeline([
            ("pca", PCA(n_components=n_components)),
            ("reg", LinearRegression())
        ])

        model.fit(X_train, y_train)
        pcr_models[h] = model

        if verbose:
            print(f"Træningsobs: {len(y_train)} | Komponenter: {n_components}")

    # Forudsig fra X_t
    try:
        X_t = X.loc[[last_observation_date]]
    except KeyError:
        X_t = X.iloc[[-1]]
        if verbose:
            print(f"Dato {last_observation_date} ikke i X, bruger {X.index[-1]} i stedet.")

    X_t_scaled = scaler.transform(X_t)

    # Forudsig
    pcr_forecasts = {}
    for h in range(forecast_horizon):
        forecast = pcr_models[h].predict(X_t_scaled)
        pcr_forecasts[h] = forecast[0]

    # Lav datoer
    start_date = pd.to_datetime(last_observation_date) + pd.DateOffset(months=1)
    forecast_dates = [start_date + pd.DateOffset(months=h) for h in pcr_forecasts.keys()]

    #  Print datoer for de forudsagte måneder
    print("\nForudsagte måneder:")
    for date in forecast_dates:
        print(date.strftime("%Y-%m"))
        
    forecast_df = pd.DataFrame({
        "Dato": forecast_dates,
        "Inflationsforecast": list(pcr_forecasts.values()),
        "Horizon": [h + 1 for h in pcr_forecasts.keys()]
    })

    return forecast_df


In [164]:
# Lav forecast fra sidste kendte måned
forecast_df = PCR_Forecaster(
    X=X,
    y=y,
    forecast_horizon=12,
    last_observation_date="2024-12-01",
    scaler=scaler,
    variance_threshold=0.99
)



=== Horisont h=0 ===
Forklaret varians (k=35): 99.01%
Træningsobs: 156 | Komponenter: 35

=== Horisont h=1 ===
Forklaret varians (k=35): 99.01%
Træningsobs: 155 | Komponenter: 35

=== Horisont h=2 ===
Forklaret varians (k=35): 99.06%
Træningsobs: 154 | Komponenter: 35

=== Horisont h=3 ===
Forklaret varians (k=35): 99.07%
Træningsobs: 153 | Komponenter: 35

=== Horisont h=4 ===
Forklaret varians (k=35): 99.07%
Træningsobs: 152 | Komponenter: 35

=== Horisont h=5 ===
Forklaret varians (k=35): 99.07%
Træningsobs: 151 | Komponenter: 35

=== Horisont h=6 ===
Forklaret varians (k=35): 99.06%
Træningsobs: 150 | Komponenter: 35

=== Horisont h=7 ===
Forklaret varians (k=35): 99.06%
Træningsobs: 149 | Komponenter: 35

=== Horisont h=8 ===
Forklaret varians (k=35): 99.06%
Træningsobs: 148 | Komponenter: 35

=== Horisont h=9 ===
Forklaret varians (k=35): 99.06%
Træningsobs: 147 | Komponenter: 35

=== Horisont h=10 ===
Forklaret varians (k=35): 99.06%
Træningsobs: 146 | Komponenter: 35

=== Hori

In [166]:
forecast_df

Unnamed: 0,Dato,Inflationsforecast,Horizon
0,2025-01-01,1.891868,1
1,2025-02-01,2.364263,2
2,2025-03-01,2.7813,3
3,2025-04-01,2.760473,4
4,2025-05-01,2.659614,5
5,2025-06-01,2.698283,6
6,2025-07-01,2.573388,7
7,2025-08-01,2.7997,8
8,2025-09-01,2.77536,9
9,2025-10-01,2.812895,10
