In [2]:
# libraries
import os
import getpass
import numpy as np
import pandas as pd
from functools import reduce
import matplotlib.pyplot as plt

# Introduktion 
Formålet med denne kode ...

# Funktioner

## Header

In [6]:
def standardize_dates(df, date_column):
    """
    Standardizes a date column to a common monthly format (YYYY-MM-01).
    
    Parameters:
    df (pd.DataFrame): The dataframe containing the date column.
    date_column (str): The name of the column containing date values.
    Returns:
    pd.DataFrame: The dataframe with a standardized date column.
    """
    
    # Make a copy of the dataframe to avoid modifying the original
    df = df.copy()
    
    # Convert to string in case of mixed formats
    df[date_column] = df[date_column].astype(str)
    
    # Handle daily format (YYYY-MM-DD) → Convert directly
    if df[date_column].str.match(r"^\d{4}-\d{2}-\d{2}$").all():
        df["Date"] = pd.to_datetime(df[date_column]).dt.to_period("M").dt.to_timestamp()

    # Handle monthly format (YYYY-MM) → Convert directly
    elif df[date_column].str.match(r"^\d{4}-\d{2}$").all():
        df["Date"] = pd.to_datetime(df[date_column] + "-01")
        
    # Handle monthly format (YYYYMM) → Convert directly
    elif df[date_column].str.match(r"^\d{4}\d{2}$").all():
        df["Date"] = pd.to_datetime(df[date_column] + "-01")
        
    # Handle daliy format (YYYYM0DD)
    elif df[date_column].str.match(r"^\d{4}M0\d{2}$").all():
        df["Date"] = pd.to_datetime(df[date_column].str[:4] + "-" + df[date_column].str[5:6] + "-" + df[date_column].str[-2:])

    # Handle monthly format (YYYYMM) → Convert correctly
    elif df[date_column].str.match(r"^\d{4}M\d{2}$").all():
        df["Date"] = pd.to_datetime(df[date_column].str[:4] + "-" + df[date_column].str[5:] + "-01")
        
    # Handle yearly format (YYYY) → Convert to first day of the year
    elif df[date_column].str.match(r"^\d{4}$").all():
        df["Date"] = pd.to_datetime(df[date_column] + "-01-01")
        
    # Handle format 'YYYYMXXDXX' (e.g., '2001M01D02' → '2001-01-02')
    elif df[date_column].str.match(r"^\d{4}M\d{2}D\d{2}$").all():
        df["Date"] = pd.to_datetime(
            df[date_column].str[:4] + "-" +  # Extract year
            df[date_column].str[5:7] + "-" +  # Extract month (skip 'M')
            df[date_column].str[-2:]  # Extract day (skip 'D')
        )

    # Handle quarterly format (YYYYKX) → Convert to first month of quarter
    elif df[date_column].str.match(r"^\d{4}K[1-4]$").all():
        df["Date"] = pd.to_datetime(
            df[date_column].str[:4] + "-" + 
            df[date_column].str[5].replace({"1": "01", "2": "04", "3": "07", "4": "10"}) + "-01"
        )
        
    elif df[date_column].str.match(r"^\d{4}K[1-4]$").all():
        quarter_map = {"1": "01", "2": "04", "3": "07", "4": "10"}
        df["Date"] = pd.to_datetime(
            df[date_column].str[:4] + "-" +
            df[date_column].str[5].map(quarter_map) + "-01"
        )

    
    else:
        raise ValueError("Unrecognized date format. Ensure the column is in daily (YYYY-MM-DD), monthly (YYYY-MM), or quarterly (YYYYKX) format.")
    
    # Drop the original date column and set the new Date column as index
    df = df.drop(columns=[date_column]).set_index("Date")
    
    return df

def save_merged_data(df, filename="merged_dataset.csv"):
    """
    Gemmer en dataframe som en CSV-fil.
    
    Parameters:
    df (pd.DataFrame): DataFrame der skal gemmes.
    filename (str): Navnet på den gemte CSV-fil (standard: "merged_dataset.csv").
    """
    df.to_csv(filename, sep=";", na_rep="NaN", index=True, encoding="utf-8")
    print(f"✅ Filen '{filename}' er gemt!")

    
def check_missing_values(df):
    missing_values = df.isna().sum()  # Antal NaN per kolonne
    total_missing = missing_values.sum()  # Samlet antal NaN
    missing_percent = (missing_values / len(df)) * 100  # Procentvis manglende værdier
    
    # Udlad for nu
    missing_df = pd.DataFrame({
        "Missing Values": missing_values,
        "Percentage (%)": missing_percent
    })
    
    print(f"Total missing values in dataset: {total_missing}")
    return 

## Import headers

In [8]:
# Gå et niveau op og naviger til mappen "Data"
os.chdir('Data')

# Læs headeren fra "header.csv"
header_path = 'header.csv'
header = pd.read_csv(header_path, sep=';', encoding='ISO-8859-1').columns.tolist()

## Niveau 0

In [10]:
# Åbn CSV-filen "niveau_0" som er semikolon-separeret og med encoding 'ISO-8859-1'
file_path = 'niveau_0.csv'
niveau_0 = pd.read_csv(file_path, sep=';', encoding='ISO-8859-1',header=0)

niveau_0.replace("..", pd.NA, inplace=True)

# Kør standardiseringsfunktionen
df_niveau_0 = standardize_dates(niveau_0, "date")

# Se det nye dataset
df_niveau_0

Unnamed: 0_level_0,00 Forbrugerprisindekset i alt
Date,Unnamed: 1_level_1
2001-01-01,76.7
2001-02-01,77.3
2001-03-01,77.7
2001-04-01,78.0
2001-05-01,78.4
...,...
2024-10-01,119.6
2024-11-01,119.2
2024-12-01,118.9
2025-01-01,119.6


In [11]:
check_missing_values(df_niveau_0)

Total missing values in dataset: 0


## Niveau 1

In [13]:
# Åbn CSV-filen "niveau_1" som er semikolon-separeret og med encoding 'ISO-8859-1'
file_path = 'niveau_1_new.csv'
niveau_1 = pd.read_csv(file_path, sep=';', encoding='ISO-8859-1', header=0).dropna()
niveau_1.replace("..", pd.NA, inplace=True)


In [14]:
# Kør standardiseringsfunktionen
df_niveau_1 = standardize_dates(niveau_1, "date")
df_niveau_1

Unnamed: 0_level_0,01 Fødevarer og ikke-alkoholiske drikkevarer,02. Alkoholiske drikkevarer og tobak,03. Beklædning og fodtøj,"04. Boligbenyttelse, elektricitet og opvarmning","05. Møbler, husholdningsudstyr og husholdningstjenester",06. Sundhed,07. Transport,08. Kommunikation,09. Fritid og kultur,10. Uddannelse,11. Restauranter og hoteller,12. Andre varer og tjenester
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2001-01-01,75.2,76.3,88.9,69.4,84.7,81.2,77.7,130.4,96.7,49.8,70.4,68.1
2001-02-01,75.7,76.3,89.7,70.2,85.2,81.2,78.5,130.2,96.8,49.8,70.7,68.2
2001-03-01,76.2,76.4,97.4,70.1,85.5,81.2,78.4,129.7,96.9,49.8,70.8,68.5
2001-04-01,77.0,76.4,100.0,70.2,85.7,80.0,79.0,129.4,97.4,49.8,70.9,68.9
2001-05-01,77.5,76.5,100.1,70.2,85.8,80.0,80.0,129.1,98.3,49.8,71.2,69.4
...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-01,132.4,135.0,99.7,120.2,101.6,113.8,118.7,84.9,116.3,134.6,133.7,120.1
2024-11-01,132.4,134.8,98.6,121.0,103.7,112.4,117.7,84.3,115.0,134.6,128.6,119.9
2024-12-01,129.7,132.5,98.5,120.7,104.1,113.8,117.6,85.9,113.8,134.6,131.6,119.9
2025-01-01,132.6,136.8,91.8,121.3,102.7,113.7,119.1,85.0,114.4,135.2,131.7,122.5


In [15]:
check_missing_values(df_niveau_1)

Total missing values in dataset: 0


## Niveau 2

In [17]:
# First approch, I think it can be delete at some point...
""" # Åbn CSV-filen "niveau_2" som er semikolon-separeret og med encoding 'ISO-8859-1'
file_path = 'niveau_2_new.csv'
niveau_2 = pd.read_csv(file_path, sep=';', encoding='ISO-8859-1', header=0, decimal=",")
niveau_2.replace("..", pd.NA, inplace=True)
niveau_2.head()

# Kør standardiseringsfunktionen
df_niveau_2 = standardize_dates(niveau_2, "date")
df_niveau_2

# Check Missing values
check_missing_values(df_niveau_2)

# Fjern kolonnen "01.1 Fødevarer"
#df_niveau_2 = df_niveau_2.drop(columns=["01.1 Fødevarer"])


# Print all for each columns the sum of NaNs
print(df_niveau_2.isna().sum())


# Hvis en række har mere en 70 pct NaN slettes den
# Ændre til 100pct
df_niveau_2.dropna(axis=1, thresh=len(df_niveau_2) * 1, inplace=True) """ 

' # Åbn CSV-filen "niveau_2" som er semikolon-separeret og med encoding \'ISO-8859-1\'\nfile_path = \'niveau_2_new.csv\'\nniveau_2 = pd.read_csv(file_path, sep=\';\', encoding=\'ISO-8859-1\', header=0, decimal=",")\nniveau_2.replace("..", pd.NA, inplace=True)\nniveau_2.head()\n\n# Kør standardiseringsfunktionen\ndf_niveau_2 = standardize_dates(niveau_2, "date")\ndf_niveau_2\n\n# Check Missing values\ncheck_missing_values(df_niveau_2)\n\n# Fjern kolonnen "01.1 Fødevarer"\n#df_niveau_2 = df_niveau_2.drop(columns=["01.1 Fødevarer"])\n\n\n# Print all for each columns the sum of NaNs\nprint(df_niveau_2.isna().sum())\n\n\n# Hvis en række har mere en 70 pct NaN slettes den\n# Ændre til 100pct\ndf_niveau_2.dropna(axis=1, thresh=len(df_niveau_2) * 1, inplace=True) '

In [18]:
file_path = 'niveau_2.xlsx'
niveau_2 = pd.read_excel(file_path, header=0, decimal=",")

# Kør standardiseringsfunktionen
df_niveau_2 = standardize_dates(niveau_2, "date")

In [19]:
#df_niveau_2 = niveau_2.resample("MS").ffill()
df_niveau_2  

Unnamed: 0_level_0,01.1 Fødevarer,01.2 Ikke-alkoholiske drikkevarer,02.1 Alkoholiske drikkevarer,02.2 Tobak,03.1 Beklædning,03.2 Fodtøj,04.1 Faktisk husleje,04.2 Beregnet lejeværdi af bolig,04.3 Vedligeholdelse og reparation af bolig,04.4 Vandforsyning og andre tjenester i forbindelse med boligen,...,10.4 Videregående uddannelse,10.5 Undervisning uden for niveau,"11.1 Restauranter, cafeer og kantiner mv.",11.2 Overnatningsfaciliteter,12.1 Personlig pleje,12.3 Andre personlige effekter,12.4 Daginstitutioner og social forsorg,12.5 Forsikring,12.6 Finansielle tjenester,12.7 Andre tjenester
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-01-01,74.7,79.4,88.9,66.8,96.6,61.5,69.7,69.6,73.9,60.2,...,,,69.7,82.0,78.4,72.3,70.6,62.1,62.3,66.1
2001-02-01,75.0,80.6,89.0,66.8,98.0,60.9,71.1,71.0,75.0,60.2,...,,,70.1,82.0,78.4,72.8,70.6,62.1,62.3,66.1
2001-03-01,75.6,81.1,89.2,66.8,107.8,62.6,71.1,71.0,75.1,60.2,...,,,70.2,82.0,79.1,73.9,70.6,62.1,62.3,66.4
2001-04-01,76.6,80.7,89.2,66.8,110.9,63.8,71.1,71.0,75.1,60.2,...,,,70.3,82.0,79.4,74.1,70.6,62.1,65.9,67.7
2001-05-01,77.1,80.6,89.3,66.8,110.9,63.9,71.1,71.0,75.5,60.2,...,,,70.4,85.6,79.5,74.5,70.6,63.5,66.2,67.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-01,134.1,119.5,114.1,157.0,95.8,115.8,115.2,115.6,133.7,125.8,...,127.7,131.6,131.5,142.2,103.7,101.9,130.4,123.4,127.4,124.8
2024-11-01,134.3,118.5,113.8,157.0,94.6,115.5,115.6,116.0,134.4,125.8,...,127.7,131.6,131.8,111.2,103.0,101.9,130.4,123.4,127.4,124.9
2024-12-01,131.7,115.4,110.2,157.3,94.6,114.6,115.6,116.0,134.8,125.8,...,127.7,131.6,132.0,127.5,102.7,102.5,130.4,123.4,127.9,124.9
2025-01-01,133.9,122.1,115.8,157.6,87.8,108.8,115.6,116.0,134.9,132.4,...,129.1,133.7,132.5,125.1,103.4,100.1,138.1,128.3,128.0,125.4


## Inflation

In [21]:
file_path = 'cpi_inflation.xlsx'
cpi = pd.read_excel(file_path, header=0, decimal=",")
cpi

Unnamed: 0,date,cpi
0,2001M01,76.7
1,2001M02,77.3
2,2001M03,77.7
3,2001M04,78.0
4,2001M05,78.4
...,...,...
285,2024M10,119.6
286,2024M11,119.2
287,2024M12,118.9
288,2025M01,119.6


In [22]:
cpi = standardize_dates(cpi, "date")

In [23]:
# Beregn inflation som år-til-år ændring (i procent)
cpi["Inflation"] = ((cpi["cpi"] - cpi["cpi"].shift(12)) / cpi["cpi"].shift(12)) * 100

# Rund til 4 decimaler (valgfrit)
cpi["Inflation"] = cpi["Inflation"].round(3)

cpi = cpi.drop(columns=["cpi"])

df_inflation = cpi.copy()

In [24]:
df_inflation

Unnamed: 0_level_0,Inflation
Date,Unnamed: 1_level_1
2001-01-01,
2001-02-01,
2001-03-01,
2001-04-01,
2001-05-01,
...,...
2024-10-01,1.614
2024-11-01,1.620
2024-12-01,1.885
2025-01-01,1.528


### Lagged af inflation

In [26]:
# Antag, at din originale dataframe hedder 'df_inflation'
# og at kolonnen hedder som vist på billedet
colname = "Inflation"

# Hvor mange lags du vil have med
num_lags = 3

# Lav en kopi, så vi ikke ændrer det oprindelige
df_lagged_inflation = df_inflation[[colname]].copy()

# Lav laggede kolonner
for lag in range(1, num_lags + 1):
    df_lagged_inflation[f"inflation_lag{lag}"] = df_lagged_inflation[colname].shift(lag)
    
# Tilføj rullende gennemsnit (fx over de seneste 3 måneder)
df_lagged_inflation["inflation_rolling_mean3"] = df_inflation[colname].rolling(window=3).mean()

# Fjern rækker med NaN (de første pga. lags)

df_lagged_inflation = df_lagged_inflation.dropna()
# Fjern target-kolonnen
df_lagged_inflation = df_lagged_inflation.drop(columns=["Inflation"])


# Se resultat
df_lagged_inflation

Unnamed: 0_level_0,inflation_lag1,inflation_lag2,inflation_lag3,inflation_rolling_mean3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2002-04-01,2.574,2.458,2.477,2.532000
2002-05-01,2.564,2.574,2.458,2.393000
2002-06-01,2.041,2.564,2.574,2.258667
2002-07-01,2.171,2.041,2.564,2.129667
2002-08-01,2.177,2.171,2.041,2.217667
...,...,...,...,...
2024-10-01,1.278,1.359,1.097,1.417000
2024-11-01,1.614,1.278,1.359,1.504000
2024-12-01,1.620,1.614,1.278,1.706333
2025-01-01,1.885,1.620,1.614,1.677667


## Brutto ledige

In [28]:
file_path = 'ledige.xlsx'
antal_ledige = pd.read_excel(file_path, header=0, decimal=",")

df_antal_ledige = standardize_dates(antal_ledige, "date")
#df_antal_ledige = df_antal_ledige.resample("MS").ffill()
df_antal_ledige

Unnamed: 0_level_0,Bruttoledige,Ledige dagpengemodtagere,Ledige kontanthjælpsmodtagere
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2007-01-01,119018.0,90915.5,28102.5
2007-02-01,115165.1,87461.5,27703.5
2007-03-01,110045.9,83156.5,26889.5
2007-04-01,108612.7,82269.8,26343.1
2007-05-01,105135.8,79468.4,25667.4
...,...,...,...
2024-10-01,88010.4,75429.7,12580.7
2024-11-01,88116.1,75548.4,12567.7
2024-12-01,87942.8,75474.8,12468.0
2025-01-01,87034.0,74740.8,12293.2


## Ledige i pct ( skal måske slettes?)

In [30]:
file_path = 'ledige_procent.xlsx'
ledige_procent = pd.read_excel(file_path, header=0, decimal=",")
#ledige.set_index("date", inplace=True)
ledige_procent

Unnamed: 0,date,Bruttoledige (i pct. af arbejdsstyrken)
0,2007M01,4.3
1,2007M02,4.1
2,2007M03,3.9
3,2007M04,3.9
4,2007M05,3.8
...,...,...
213,2024M10,2.9
214,2024M11,2.9
215,2024M12,2.9
216,2025M01,2.9


In [31]:
df_ledige_procent = standardize_dates(ledige_procent, "date")

#df_ledige_procent = df_ledige_procent.resample("MS").ffill()
df_ledige_procent

Unnamed: 0_level_0,Bruttoledige (i pct. af arbejdsstyrken)
Date,Unnamed: 1_level_1
2007-01-01,4.3
2007-02-01,4.1
2007-03-01,3.9
2007-04-01,3.9
2007-05-01,3.8
...,...
2024-10-01,2.9
2024-11-01,2.9
2024-12-01,2.9
2025-01-01,2.9


## Antal af fødte børn

In [33]:
file_path = 'fødte_børn.csv'

In [34]:
børn = pd.read_csv(file_path, sep=';', encoding='ISO-8859-1', header=0)

df_børn = standardize_dates(børn, "year")

#df_børn = df_børn.resample("MS").ffill()
df_børn.shape

(24, 2)

In [35]:
# Tilføj rækker frem til 2025-02-01
#future_index = pd.date_range(start=df_børn.index.min(), end="2025-02-01", freq="MS")

# Reindex og ffill for at fylde manglende data
#df_børn = df_børn.reindex(future_index).ffill()
#df_børn.index.name = "Date"
df_børn

Unnamed: 0_level_0,F¿dte Drenge,F¿dte Piger
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2001-01-01,33497,31961
2002-01-01,32966,31109
2003-01-01,33158,31441
2004-01-01,33070,31539
2005-01-01,32823,31459
2006-01-01,33404,31580
2007-01-01,32815,31267
2008-01-01,33531,31507
2009-01-01,32261,30557
2010-01-01,32465,30946


## Econ indikator

In [37]:
file_path = 'econ_indikator.xlsx'
econ_indikator = pd.read_excel(file_path, header=0, decimal=",")
econ_indikator

Unnamed: 0,date,"B.1*g Bruttonationalprodukt, BNP",P.7 Import af varer og tjenester,P.71 Import af varer,P.72 Import af tjenester,Forsyning i alt,P.6 Eksport af varer og tjenester,P.61 Eksport af varer,P.62 Eksport af tjenester,P.31 Privatforbrug,...,N.112 Andre bygninger og anlæg,"N.113-N.115 Maskiner, transportmidler mv.",N.117 Intellektuelle rettigheder,P.52+P.53 Lagerforøgelser mv.,P.52 Lagerforøgelser,P.53 Anskaffelser minus afhændelser af værdigenstande,Endelig indenlandsk anvendelse,Endelig anvendelse i alt,Samlede præsterede timer (mio. timer),Samlet antal beskæftigede (1000 personer)
0,2001K1,337.7,131.1,88.8,42.3,468.8,159.1,104.4,54.7,158.2,...,18.2,25.9,12.0,0.6,0.2,0.4,310.2,469.3,1024.7,2781.6
1,2001K2,340.8,135.3,91.5,43.8,476.2,158.7,104.0,54.7,161.5,...,18.6,27.7,12.7,-0.7,-1.2,0.5,316.9,475.6,1029.1,2786.8
2,2001K3,344.9,131.3,89.5,41.8,476.2,154.2,103.7,50.5,161.8,...,19.3,26.0,12.8,3.5,3.0,0.5,322.5,476.7,1018.4,2777.8
3,2001K4,348.9,129.7,88.7,41.0,478.6,153.3,102.5,50.8,160.6,...,19.5,27.4,13.4,3.2,2.8,0.4,324.8,478.1,1014.4,2783.7
4,2002K1,348.7,138.4,91.9,46.6,487.1,160.4,106.0,54.3,163.8,...,18.5,28.0,12.9,1.5,1.0,0.6,325.8,486.2,1012.5,2780.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,2023K4,699.2,424.2,228.7,195.5,1123.5,481.9,280.8,201.1,328.9,...,39.4,28.6,43.6,1.1,0.1,1.0,640.1,1121.9,1107.3,3210.1
92,2024K1,711.3,420.9,228.7,192.2,1132.2,484.2,280.4,203.8,330.0,...,40.4,32.2,43.6,-7.4,-8.5,1.0,640.0,1124.2,1116.6,3221.9
93,2024K2,723.1,429.0,229.4,199.6,1152.1,502.3,287.0,215.2,331.6,...,41.6,33.6,44.1,-3.1,-4.1,1.0,653.9,1156.2,1108.6,3225.3
94,2024K3,748.6,430.6,231.6,198.9,1179.2,524.4,299.7,224.7,332.8,...,42.8,33.7,44.8,-3.9,-4.9,1.0,660.1,1184.5,1109.6,3228.5


In [38]:
df_econ_indikator = standardize_dates(econ_indikator, "date")

In [39]:
#df_econ_indikator = df_econ_indikator.resample("MS").ffill()
df_econ_indikator.shape

(96, 31)

In [40]:
# Tilføj rækker frem til 2025-02-01
#future_index = pd.date_range(start=df_econ_indikator.index.min(), end="2025-02-01", freq="MS")

# Reindex og ffill for at fylde manglende data
#df_econ_indikator = df_econ_indikator.reindex(future_index).ffill()
#df_econ_indikator.index.name = "Date"
df_econ_indikator

Unnamed: 0_level_0,"B.1*g Bruttonationalprodukt, BNP",P.7 Import af varer og tjenester,P.71 Import af varer,P.72 Import af tjenester,Forsyning i alt,P.6 Eksport af varer og tjenester,P.61 Eksport af varer,P.62 Eksport af tjenester,P.31 Privatforbrug,P.31 Husholdningernes forbrugsudgifter,...,N.112 Andre bygninger og anlæg,"N.113-N.115 Maskiner, transportmidler mv.",N.117 Intellektuelle rettigheder,P.52+P.53 Lagerforøgelser mv.,P.52 Lagerforøgelser,P.53 Anskaffelser minus afhændelser af værdigenstande,Endelig indenlandsk anvendelse,Endelig anvendelse i alt,Samlede præsterede timer (mio. timer),Samlet antal beskæftigede (1000 personer)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-01-01,337.7,131.1,88.8,42.3,468.8,159.1,104.4,54.7,158.2,152.9,...,18.2,25.9,12.0,0.6,0.2,0.4,310.2,469.3,1024.7,2781.6
2001-04-01,340.8,135.3,91.5,43.8,476.2,158.7,104.0,54.7,161.5,156.2,...,18.6,27.7,12.7,-0.7,-1.2,0.5,316.9,475.6,1029.1,2786.8
2001-07-01,344.9,131.3,89.5,41.8,476.2,154.2,103.7,50.5,161.8,156.4,...,19.3,26.0,12.8,3.5,3.0,0.5,322.5,476.7,1018.4,2777.8
2001-10-01,348.9,129.7,88.7,41.0,478.6,153.3,102.5,50.8,160.6,155.2,...,19.5,27.4,13.4,3.2,2.8,0.4,324.8,478.1,1014.4,2783.7
2002-01-01,348.7,138.4,91.9,46.6,487.1,160.4,106.0,54.3,163.8,158.5,...,18.5,28.0,12.9,1.5,1.0,0.6,325.8,486.2,1012.5,2780.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10-01,699.2,424.2,228.7,195.5,1123.5,481.9,280.8,201.1,328.9,318.4,...,39.4,28.6,43.6,1.1,0.1,1.0,640.1,1121.9,1107.3,3210.1
2024-01-01,711.3,420.9,228.7,192.2,1132.2,484.2,280.4,203.8,330.0,319.2,...,40.4,32.2,43.6,-7.4,-8.5,1.0,640.0,1124.2,1116.6,3221.9
2024-04-01,723.1,429.0,229.4,199.6,1152.1,502.3,287.0,215.2,331.6,320.6,...,41.6,33.6,44.1,-3.1,-4.1,1.0,653.9,1156.2,1108.6,3225.3
2024-07-01,748.6,430.6,231.6,198.9,1179.2,524.4,299.7,224.7,332.8,321.6,...,42.8,33.7,44.8,-3.9,-4.9,1.0,660.1,1184.5,1109.6,3228.5


### Valutakurser

In [42]:
file_path = 'valutakurser.xlsx'

In [43]:
Valutakurser = pd.read_excel(file_path, header=0, decimal=",")

In [44]:
df_valutakurser = standardize_dates(Valutakurser, "date")

In [45]:
#df_valutakurser = df_valutakurser.resample("MS").ffill()
df_valutakurser

Unnamed: 0_level_0,EUR Euro (Jan. 1999-),USD Amerikanske dollar (Jan. 1979-),GBP Britiske pund (Jan. 1981-),SEK Svenske kroner (Jan. 1980-)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001-01-01,746.4373,795.0623,1175.5700,83.8845
2001-02-01,746.3120,809.4930,1176.8350,83.1745
2001-03-01,746.4391,820.6772,1186.0940,81.7718
2001-04-01,746.3239,835.9222,1199.7760,81.8889
2001-05-01,746.1628,852.5195,1215.6630,82.3338
...,...,...,...,...
2024-10-01,745.9265,684.1126,893.3804,65.4065
2024-11-01,745.8276,701.7319,894.5195,64.3914
2024-12-01,745.8878,711.1756,900.8617,64.8328
2025-01-01,746.0895,720.6409,889.2100,64.9918


### Transaktioner med kort

In [47]:
file_path = 'transaktioner_medkort.xlsx'
transaktioner_medkort = pd.read_excel(file_path, header=0, decimal=",")
transaktioner_medkort.shape

(2253, 3)

In [48]:
df_transaktioner_medkort = standardize_dates(transaktioner_medkort, "date")

In [49]:
#df_transaktioner_medkort = df_transaktioner_medkort.resample("MS").ffill()
df_transaktioner_medkort

Unnamed: 0_level_0,Transaktioner - Værdi (kr.),Antal transaktioner med kort (stk.)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01,532818728,2324456
2019-01-02,1414651369,5400415
2019-01-03,1371597497,5210724
2019-01-04,1418872608,5370442
2019-01-05,1223540188,5398314
...,...,...
2025-02-26,1307597069,5286448
2025-02-27,1477954530,5722619
2025-02-28,2128094675,7265294
2025-03-01,2136274605,8309405


### antal_ejendomsalg

In [51]:
file_path = 'ejdendom_antalsalg.xlsx'
ejendomsalg = pd.read_excel(file_path, header=0, decimal=",")

# Kør standardiseringsfunktionen
df_ejendomsalg = standardize_dates(ejendomsalg, "date")

In [52]:
#df_ejendomsalg = df_ejendomsalg.resample("MS").ffill()
df_ejendomsalg

Unnamed: 0_level_0,Total salg af Enfamiliehuse,Total salg af Sommerhuse,Total salg af Ejerlejligheder
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2005-01-01,14398,2915,6104
2005-04-01,15155,3141,6602
2005-07-01,15217,2663,6518
2005-10-01,15047,2827,6031
2006-01-01,14427,2668,5664
...,...,...,...
2023-07-01,12127,2042,4371
2023-10-01,12488,2421,4863
2024-01-01,11822,1995,3171
2024-04-01,12834,2140,4049


## Medlemer af Folkekirken

In [54]:
file_path = 'medlemer_folkekirken.csv'
folkekirken_medlemmer = pd.read_csv(file_path, sep=';', encoding='ISO-8859-1', header=0)

# Kør standardiseringsfunktionen
df_folkekirken_medlemmer = standardize_dates(folkekirken_medlemmer, "date")

In [55]:
#df_folkekirken_medlemmer = df_folkekirken_medlemmer.resample("MS").ffill()
df_folkekirken_medlemmer

Unnamed: 0_level_0,Medlemmer af Folkekirken
Date,Unnamed: 1_level_1
2007-01-01,4499343
2007-04-01,4496926
2007-07-01,4497951
2007-10-01,4498876
2008-01-01,4494589
...,...
2024-01-01,4253575
2024-04-01,4246873
2024-07-01,4245853
2024-10-01,4241820


## Antal Totale Skilsmisser

In [57]:
file_path = 'total_skilsmisser.csv'
total_skilsmisser = pd.read_csv(file_path, sep=';', encoding='ISO-8859-1', header=0)

# Kør standardiseringsfunktionen
df_total_skilsmisser = standardize_dates(total_skilsmisser, "date")

In [58]:
#df_total_skilsmisser = df_total_skilsmisser.resample("MS").ffill()
df_total_skilsmisser

Unnamed: 0_level_0,Antal Skilsmisser
Date,Unnamed: 1_level_1
2001-01-01,14692
2002-01-01,15411
2003-01-01,15884
2004-01-01,15877
2005-01-01,15438
2006-01-01,14454
2007-01-01,14183
2008-01-01,14821
2009-01-01,15073
2010-01-01,14600


## Vielser

In [60]:
file_path = 'vielser.xlsx'
vielser = pd.read_excel(file_path, header=0, decimal=",")

# Kør standardiseringsfunktionen
df_vielser = standardize_dates(vielser, "year")

In [61]:
#df_vielser = df_vielser.resample("MS").ffill()
df_vielser

Unnamed: 0_level_0,Gennemsnitsalder for 1. gangsviede mænd,Gennemsnitsalder for 1. gangsviede kvinder,Vielser i alt
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001-01-01,32.8,30.3,36909
2002-01-01,33.2,30.8,37508
2003-01-01,33.5,31.2,35357
2004-01-01,33.8,31.5,38046
2005-01-01,33.9,31.5,36538
2006-01-01,34.0,31.6,36843
2007-01-01,34.2,31.9,37000
2008-01-01,34.4,32.0,37812
2009-01-01,34.1,31.6,33322
2010-01-01,34.2,31.7,31359


## total_hustande

In [63]:
file_path = 'total_hustande.csv'
total_hustande = pd.read_csv(file_path, sep=';', encoding='ISO-8859-1', header=0)

# Kør standardiseringsfunktionen
df_total_hustande = standardize_dates(total_hustande, "year")

In [64]:
#df_total_hustande = df_total_hustande.resample("MS").ffill()
df_total_hustande

Unnamed: 0_level_0,"Hustande, Hele landet"
Date,Unnamed: 1_level_1
2001-01-01,2448490
2002-01-01,2459203
2003-01-01,2469332
2004-01-01,2482150
2005-01-01,2499606
2006-01-01,2517137
2007-01-01,2532058
2008-01-01,2547377
2009-01-01,2563903
2010-01-01,2573417


## Fertilitetskvotienter

In [66]:
file_path = 'Fertilitetskvotienter.xlsx'
Fertilitetskvotienter = pd.read_excel(file_path, header=0, decimal=",")

# Kør standardiseringsfunktionen
df_fertilitetskvotienter = standardize_dates(Fertilitetskvotienter, "year")

In [67]:
#df_fertilitetskvotienter = df_fertilitetskvotienter.resample("MS").ffill()
df_fertilitetskvotienter

Unnamed: 0_level_0,Samlet fertilitet
Date,Unnamed: 1_level_1
2001-01-01,1745.0
2002-01-01,1724.9
2003-01-01,1759.9
2004-01-01,1785.2
2005-01-01,1802.1
2006-01-01,1847.6
2007-01-01,1843.5
2008-01-01,1888.8
2009-01-01,1839.6
2010-01-01,1871.2


## Implicit Lønindeks

In [69]:
file_path = 'Implicit_lønindeks.xlsx'
implicit_lønindeks = pd.read_excel(file_path, header=0, decimal=",")

# Kør standardiseringsfunktionen
df_implicit_lønindeks = standardize_dates(implicit_lønindeks, "date")

In [70]:
#df_implicit_lønindeks = df_implicit_lønindeks.resample("MS").ffill()

In [71]:
df_implicit_lønindeks

Unnamed: 0_level_0,Forskning og udvikling,P Undervisning,R Kultur og fritid,"2 Industri, råstofindvinding og forsyningsvirksomhed",3 Bygge og anlæg,4 Handel og transport mv.,5 Information og kommunikation,6 Finansiering og forsikring,7 Ejendomshandel og udlejning,8 Erhvervsservice,"10 Kultur, fritid og anden service"
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2005-01-01,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
2005-04-01,103.9,103.3,102.1,100.4,100.2,100.6,100.7,100.7,100.6,100.4,100.6
2005-07-01,103.7,102.7,102.3,101.3,101.2,101.4,101.2,101.7,101.6,101.1,101.2
2005-10-01,104.2,102.4,102.3,102.1,102.3,102.2,101.9,102.7,102.4,101.9,101.6
2006-01-01,104.5,102.6,102.9,102.9,103.3,102.8,102.7,103.5,103.0,102.7,102.4
...,...,...,...,...,...,...,...,...,...,...,...
2023-10-01,149.2,152.4,150.7,161.3,154.1,149.4,155.7,169.8,153.6,158.3,137.9
2024-01-01,149.6,152.8,151.2,162.8,155.0,150.6,157.3,171.4,154.8,159.3,139.0
2024-04-01,158.5,162.1,159.6,167.1,159.9,154.0,159.3,173.2,157.6,161.0,140.5
2024-07-01,159.3,162.4,161.1,167.0,158.9,154.1,160.3,174.8,158.0,162.5,141.1


## antal_beskæftigede

In [73]:
file_path = 'antal_beskæftigede.xlsx'
antal_beskæftigede = pd.read_excel(file_path, header=0, decimal=",")

# Kør standardiseringsfunktionen
df_antal_beskæftigede = standardize_dates(antal_beskæftigede, "year")

In [74]:
#df_antal_beskæftigede = antal_beskæftigede.resample("MS").ffill()
df_antal_beskæftigede  # Se de første 12 måneder

Unnamed: 0_level_0,Beskæftigede Mænd,Beskæftigede Kvinder
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2008-01-01,1493769,1351045
2009-01-01,1410785,1314939
2010-01-01,1404274,1300268
2011-01-01,1410582,1287836
2012-01-01,1404143,1280697
2013-01-01,1409449,1284499
2014-01-01,1426324,1293516
2015-01-01,1450460,1308949
2016-01-01,1477251,1327822
2017-01-01,1500292,1348041


## Forbrugerforventninger

In [76]:
file_path = 'Forbrugerforventninger.xlsx'
Forbrugerforventninger = pd.read_excel(file_path, header=0, decimal=",")

# Kør standardiseringsfunktionen
df_forbrugerforventninger = standardize_dates(Forbrugerforventninger, "date")

In [77]:
#df_forbrugerforventninger = df_forbrugerforventninger.resample("MS").ffill()
df_forbrugerforventninger  # Se de første 12 måneder

Unnamed: 0_level_0,Forbrugertillidsindikatoren,"Familiens økonomiske situation i dag, sammenlignet med for et år siden","Familiens økonomiske situation om et år, sammenlignet med i dag","Danmarks økonomiske situation i dag, sammenlignet med for et år siden","Danmarks økonomiske situation om et år, sammenlignet med i dag","Anskaffelse af større forbrugsgoder, fordelagtigt for øjeblikket","Priser i dag, sammenlignet med for et år siden","Priser om et år, sammenlignet med i dag","Arbejdsløsheden om et år, sammenlignet med i dag","Anskaffelse af større forbrugsgoder, inden for de næste 12 mdr.",Anser det som fornuftigt at spare op i den nuværende økonomiske situation,Regner med at kunne spare op i de kommende 12 måneder,Familiens økonomiske situation lige nu: kan spare/penge slår til/ bruger mere end man tjener
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2001-01-01,3.0,7.0,13.0,3.0,-3.0,-3.0,-13.0,-12.0,0.0,-7.0,69.0,25.0,25.0
2001-02-01,1.0,3.0,17.0,0.0,-4.0,-9.0,-12.0,-11.0,-2.0,-6.0,74.0,26.0,22.0
2001-03-01,0.0,3.0,13.0,-3.0,-8.0,-6.0,-15.0,-10.0,1.0,-9.0,76.0,32.0,27.0
2001-04-01,-2.0,4.0,14.0,-9.0,-9.0,-10.0,-14.0,-11.0,5.0,-6.0,72.0,23.0,24.0
2001-05-01,0.0,8.0,12.0,-5.0,-6.0,-9.0,-7.0,-7.0,3.0,-6.0,73.0,30.0,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-01,-8.9,-8.9,-1.0,-9.8,-8.5,-16.1,28.0,21.0,10.2,-8.0,65.3,26.1,26.9
2024-11-01,-9.3,-10.1,0.0,-12.5,-10.9,-13.1,27.4,23.2,13.7,-3.7,67.2,30.1,25.3
2024-12-01,-13.1,-11.2,-1.2,-17.2,-18.0,-18.0,31.6,29.6,13.0,-7.5,65.0,26.8,24.0
2025-01-01,-11.7,-5.3,-1.0,-15.9,-18.6,-17.8,32.8,29.2,10.1,-7.0,62.4,23.8,26.3


## Befolkningens udvikling

In [79]:
file_path = 'befolkningens_udvikling.xlsx'
befolkningens_udvikling = pd.read_excel(file_path, header=0, decimal=",")

# Kør standardiseringsfunktionen
befolkningens_udvikling = standardize_dates(befolkningens_udvikling, "date")

In [80]:
df_befolkningens_udvikling = befolkningens_udvikling.resample("MS").ffill()
df_befolkningens_udvikling  # Se de første 12 måneder

Unnamed: 0_level_0,Befolkningen ultimo forrige kvartal,Levendefødte,Døde,Fødselsoverskud,Tilflyttede,Fraflyttede,Nettotilflyttede,Indvandrede i alt,Indvandret i indeværende kvartal,Indvandret før indeværende kvartal,Udvandrede i alt,Udvandret i indeværende kvartal,Udvandret før indeværende kvartal,Nettoindvandrede,Korrektioner,Befolkningstilvækst,Befolkningen ultimo indeværende kvartal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2007-04-01,5451995,15867,13579,2288,61780,61780,0,12722,12178,544,9330,9330,0,3392,-253,5427,5457422
2007-05-01,5451995,15867,13579,2288,61780,61780,0,12722,12178,544,9330,9330,0,3392,-253,5427,5457422
2007-06-01,5451995,15867,13579,2288,61780,61780,0,12722,12178,544,9330,9330,0,3392,-253,5427,5457422
2007-07-01,5457422,17177,13037,4140,92588,92588,0,24161,23384,777,14099,14099,0,10062,-715,13487,5470909
2007-08-01,5457422,17177,13037,4140,92588,92588,0,24161,23384,777,14099,14099,0,10062,-715,13487,5470909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-01,5967824,14351,13580,771,71563,71563,0,19464,18235,1229,15394,11172,4222,4070,-245,4596,5972420
2024-07-01,5972420,14942,13701,1241,100951,100951,0,37612,35622,1990,21049,16009,5040,16563,-239,17565,5989985
2024-08-01,5972420,14942,13701,1241,100951,100951,0,37612,35622,1990,21049,16009,5040,16563,-239,17565,5989985
2024-09-01,5972420,14942,13701,1241,100951,100951,0,37612,35622,1990,21049,16009,5040,16563,-239,17565,5989985


## Antal på kontanthjælp

In [82]:
file_path = 'antal_kontanthjælp.xlsx'
antal_kontanthjælp = pd.read_excel(file_path, header=0, decimal=",")

# Kør standardiseringsfunktionen
df_antal_kontanthjælp = standardize_dates(antal_kontanthjælp, "date")

#df_antal_kontanthjælp = df_antal_kontanthjælp.resample("MS").ffill()
df_antal_kontanthjælp  # Se de første 12 måneder

Unnamed: 0_level_0,"Kontanthjælp, I alt"
Date,Unnamed: 1_level_1
2007-01-01,125820
2007-02-01,124115
2007-03-01,123046
2007-04-01,121673
2007-05-01,120474
...,...
2024-08-01,88837
2024-09-01,88768
2024-10-01,88742
2024-11-01,88370


## Antal overtrædelse

In [84]:
file_path = 'antal_overtrædelse.xlsx'
antal_overtrædelse = pd.read_excel(file_path, header=0, decimal=",")

# Kør standardiseringsfunktionen
df_antal_overtrædelse = standardize_dates(antal_overtrædelse, "date")

#df_antal_overtrædelse = df_antal_overtrædelse.resample("MS").ffill()
df_antal_overtrædelse  # Se de første 12 måneder

Unnamed: 0_level_0,Overtrædelsens art i alt,Straffelov i alt,Seksualforbrydelser i alt,Voldsforbrydelser i alt,Ejendomsforbrydelser i alt,Særlove i alt
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2001-01-01,127582,111675,627,3629,105565,15907
2001-04-01,132619,116253,637,3994,109886,16366
2001-07-01,138951,122327,820,4070,115715,16624
2001-10-01,137933,121822,609,3889,115478,16111
2002-01-01,133321,118399,643,3794,111839,14922
...,...,...,...,...,...,...
2023-10-01,104787,78042,2011,6703,66082,26745
2024-01-01,102639,74989,1720,6238,63160,27650
2024-04-01,116061,85050,2145,7346,71908,31011
2024-07-01,112860,83807,1938,6945,71727,29053


## PPI Samlet Dansk produktion

In [86]:
file_path = 'PPI_samlet_DK.xlsx'
PPI = pd.read_excel(file_path, header=0, decimal=",")

# Kør standardiseringsfunktionen
df_PPI_data = standardize_dates(PPI, "date")

#df_PPI_data = df_PPI_data.resample("MS").ffill()
df_PPI_data = df_PPI_data.add_prefix("PPI_domestic_")
df_PPI_data  # Se de første 12 måneder

Unnamed: 0_level_0,PPI_domestic_Unnamed: 1,PPI_domestic_Unnamed: 2,PPI_domestic_Unnamed: 3,PPI_domestic_Unnamed: 4,PPI_domestic_Unnamed: 5,PPI_domestic_Unnamed: 6,PPI_domestic_Unnamed: 7,PPI_domestic_Unnamed: 8,PPI_domestic_Unnamed: 9,PPI_domestic_Unnamed: 10,...,PPI_domestic_Unnamed: 30,PPI_domestic_Unnamed: 31,PPI_domestic_Unnamed: 32,PPI_domestic_Unnamed: 33,PPI_domestic_Unnamed: 34,PPI_domestic_Unnamed: 35,PPI_domestic_Unnamed: 36,PPI_domestic_Unnamed: 37,PPI_domestic_Unnamed: 38,PPI_domestic_Unnamed: 39
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-01,,,,,,,,,,,...,,,,,,,,,,
2005-02-01,1.3,0.7,5.4,0.3,-0.5,0.0,-5.1,-0.1,-0.1,0.1,...,0.0,,,0.0,0.1,-0.1,-0.2,5.7,,
2005-03-01,2.5,1.4,10.3,0.0,-0.3,0.6,-0.2,-0.8,-0.7,-0.8,...,-1.5,,,-0.1,0.0,-0.3,0.0,10.8,,
2005-04-01,0.8,1.0,5.5,0.4,-0.4,-0.5,-0.3,0.0,-0.7,-0.6,...,0.1,,,0.4,-0.1,-0.1,-0.2,3.5,,
2005-05-01,0.0,-0.3,-2.6,-0.1,0.4,-1.2,3.0,0.5,0.7,0.6,...,-0.7,,,0.4,0.3,0.1,0.1,-0.8,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11-01,3.5,2.4,2.5,2.3,0.2,,0.1,,1.0,0.2,...,0.0,16.0,0.0,0.7,0.3,0.1,4.5,10.2,2.6,6.3
2024-12-01,1.3,1.4,1.1,1.5,0.9,,1.6,,0.2,0.4,...,0.2,-0.3,-0.1,0.1,0.4,0.0,2.6,0.8,1.4,3.2
2025-01-01,-0.1,-0.6,1.7,-0.8,0.8,,2.7,,0.1,0.1,...,-0.9,4.7,5.5,0.4,0.2,0.8,-3.1,5.4,-1.3,-5.5
2025-02-01,0.9,-0.2,-0.1,-0.2,-0.2,,-2.0,,0.1,1.4,...,0.0,11.9,0.0,0.2,0.1,0.2,-0.2,5.5,0.1,0.0


## PPI Import

In [88]:
file_path = 'PPI_import.xlsx'
PPI_import = pd.read_excel(file_path, header=0, decimal=",")

# Kør standardiseringsfunktionen
Df_PPI_import_data = standardize_dates(PPI_import, "date")

#Df_PPI_import_data = Df_PPI_import_data.resample("MS").ffill()
Df_PPI_import_data = Df_PPI_import_data.add_prefix("PPI_import_")
Df_PPI_import_data  # Se de første 12 måneder

Unnamed: 0_level_0,"PPI_import_BCDE Råstofindvinding, Industri, Energi- og Vandforsyning",PPI_import_BC Råstofindvinding og industri,PPI_import_B Råstofindvinding,PPI_import_C Fremstillingsvirksomhed,PPI_import_10 Fremstilling af fødevarer,PPI_import_101 Forarbejdning og konservering af kød og produktion af kødprodukter,"PPI_import_102 Forarbejdning og konservering af fisk, krebsdyr og bløddyr",PPI_import_105 Fremstilling af mejeriprodukter,PPI_import_107 Fremstilling af bageri- og dejprodukter,PPI_import_108 Fremstilling af andre fødevarer,...,PPI_import_31 Fremstilling af møbler,PPI_import_32 Anden fremstillingsvirksomhed,"PPI_import_D El-, gas- og fjernvarmeforsyning",PPI_import_S1 Investeringsgodeindustri,PPI_import_S2 Mellemproduktindustri,PPI_import_S3 Fremstilling af varige forbrugsgoder,PPI_import_S4 Fremstilling af ikke-varige forbrugsgoder,PPI_import_S5 Fremstilling af energiprodukter og energiforsyning,"PPI_import_C33 Fremstillingsindustri ekskl. forarb. af kød og fisk samt ekskl. fremst. af tobak, koks og raffinerede mineral olieprodukter",PPI_import_S6 Fremstilling af forbrugsgoder ekskl. fødevarer
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-01,,,,,,,,,,,...,,,,,,,,,,
2005-02-01,0.5,0.5,3.4,0.1,1.3,,,,,,...,-0.1,0.3,20.5,0.4,0.0,0.2,0.6,4.4,,
2005-03-01,1.0,0.7,11.5,0.6,1.0,,,,,,...,1.4,0.4,16.8,-0.7,0.1,0.7,0.5,14.4,,
2005-04-01,-0.2,-0.2,0.4,-0.1,0.2,,,,,,...,0.0,-0.4,-7.2,-0.1,-0.3,0.0,0.0,-2.0,,
2005-05-01,0.0,0.0,-4.2,0.1,-0.6,,,,,,...,0.6,-0.1,-3.5,-0.7,0.5,0.4,-0.2,1.3,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11-01,0.9,0.6,2.2,0.6,0.4,-0.4,-0.3,2.1,-0.6,0.8,...,0.1,0.5,82.0,0.6,0.2,0.0,0.7,6.2,0.4,0.6
2024-12-01,0.2,0.3,0.2,0.2,0.0,0.1,1.4,0.8,-0.1,-0.7,...,0.2,0.6,-0.9,0.3,0.3,0.2,-0.2,0.7,0.2,-0.2
2025-01-01,0.8,0.6,4.9,0.3,0.8,-0.7,-0.4,0.9,1.6,1.7,...,2.2,0.1,5.2,-0.2,-0.1,0.5,0.6,5.0,0.2,0.4
2025-02-01,1.1,0.4,-2.4,0.7,-0.5,-0.7,1.1,2.8,-0.6,-1.9,...,0.7,0.1,24.7,-0.1,0.8,0.2,1.7,3.5,0.8,2.4


## Bil data

In [90]:
file_path = 'bil_data.xlsx'
bil_data = pd.read_excel(file_path, header=0, decimal=",")

# Kør standardiseringsfunktionen
df_bil_data = standardize_dates(bil_data, "date")

#df_bil_data = df_bil_data.resample("MS").ffill()
df_bil_data  # Se de første 12 måneder

Unnamed: 0_level_0,Personbiler i alt,"Tilgang af personbiler i alt, sæsonkorrigeret",Tilgang til husholdningerne,"Tilgang til husholdningerne, sæsonkorrigeret",Personbiler i husholdningerne,Privatleasing,Tilgang til erhvervene,"Tilgang til erhvervene, sæsonkorrigeret",Erhvervenes køb af personbiler,Erhvervsleasing,Leasingbiler i alt
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2007-01-01,11684,12154,8250,8033,8015,235,3434,4122,1955,1479,1714
2007-02-01,10810,12732,6974,7984,6760,214,3836,4748,2003,1833,2047
2007-03-01,14443,13020,9481,8155,9200,281,4962,4866,2762,2200,2481
2007-04-01,12274,12552,8286,8037,8077,209,3988,4515,2102,1886,2095
2007-05-01,14342,12854,9369,8272,9093,276,4973,4582,2850,2123,2399
...,...,...,...,...,...,...,...,...,...,...,...
2024-10-01,14385,15101,8745,9214,5222,3523,5640,5887,831,4809,8332
2024-11-01,14955,14924,8700,9122,5359,3341,6255,5802,750,5505,8846
2024-12-01,17431,14905,9728,8768,5701,4027,7703,6137,893,6810,10837
2025-01-01,10859,13440,6867,7744,4382,2485,3992,5695,738,3254,5739


## Konjunkturbarometer

In [92]:
file_path = 'Konjunkturbarometer.xlsx'
konjunkturbarometer = pd.read_excel(file_path, header=0, decimal=",")

# Kør standardiseringsfunktionen
df_konjunkturbarometer = standardize_dates(konjunkturbarometer, "date")

#df_konjunkturbarometer = df_konjunkturbarometer.resample("MS").ffill()
df_konjunkturbarometer  # Se de første 12 måneder

Unnamed: 0_level_0,Tillidsindikator for industri,Tillidsindikator for bygge og anlæg,Tillidsindikator for detailhandel,Tillidsindikator for serviceerhverv
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001-01-01,104.6,98.8,93.4,103.1
2001-02-01,103.7,96.8,93.3,102.1
2001-03-01,105.2,95.7,83.7,100.6
2001-04-01,96.1,97.9,89.6,98.2
2001-05-01,95.9,95.4,84.7,99.9
...,...,...,...,...
2024-12-01,104.8,104.3,103.6,104.8
2025-01-01,105.3,102.9,94.6,104.5
2025-02-01,107.3,101.6,102.5,107.0
2025-03-01,107.2,101.3,98.6,105.2


## total energi produktion

In [94]:
file_path = 'Energi_data.xlsx'
energi_data = pd.read_excel(file_path, header=0, decimal=",")

# Kør standardiseringsfunktionen
df_energi_data = standardize_dates(energi_data, "year")

#df_energi_data = df_energi_data.resample("MS").ffill()
df_energi_data  # Se de første 12 måneder

Unnamed: 0_level_0,Total,Oil,Natural Gas,Coal and Coke,"Waste, Non-renewable 2)",Renewable Energy 3),"Electricity Import, Net"
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2005-01-01,838813.500833,352523.131082,187220.096164,155255.0935,17006.34,121877.068925,4931.771162
2006-01-01,896399.742054,353742.709831,190738.46728,235590.2501,17291.62,124007.995231,-24971.300388
2007-01-01,868838.354325,351139.84554,170962.60692,194789.9195,17889.14,137476.842365,-3420.0
2008-01-01,844973.261392,338682.406276,171777.958261,171558.12393,18679.21,139041.882925,5233.68
2009-01-01,816141.824514,321700.60549,164980.522221,168780.04643,17705.01,141775.760373,1199.88
2010-01-01,847480.528111,319383.569972,186623.226986,160465.19714,17147.95,167946.950784,-4086.366772
2011-01-01,799565.067527,312328.56538,156505.124133,135577.79456,17292.31,173114.96288,4746.310574
2012-01-01,768603.57292,303115.772382,146177.803742,103547.46161,16804.98,180186.62,18770.935186
2013-01-01,776206.703527,295459.673363,137872.169843,135485.32084,16856.354296,186641.14,3892.045185
2014-01-01,743506.324491,292544.608468,118419.776131,109958.50405,17426.045584,194878.29,10279.100258


## Detailomsætningsindeks

In [96]:
file_path = 'Detailomsætningsindeks.xlsx'
detailomsætningsindeks = pd.read_excel(file_path, header=0, decimal=",")

# Kør standardiseringsfunktionen
df_detailomsætningsindeks = standardize_dates(detailomsætningsindeks, "date")

#df_detailomsætningsindeks = df_detailomsætningsindeks.resample("MS").ffill()
df_detailomsætningsindeks  # Se de første 12 måneder

Unnamed: 0_level_0,Detailhandel i alt
Date,Unnamed: 1_level_1
2001-01-01,86.0
2001-02-01,85.6
2001-03-01,85.1
2001-04-01,84.6
2001-05-01,85.7
...,...
2024-11-01,95.7
2024-12-01,95.8
2025-01-01,96.0
2025-02-01,96.8


## ECB data

In [98]:
# Læs filen som rå tekst først
with open('ECB_inflation.csv', 'r') as file:
    lines = file.readlines()

# Split hver linje ved komma
data = [line.strip().split(',') for line in lines]

# Konvertér til DataFrame
columns = [col.strip('"') for col in data[0]] # Første linje = kolonnenavne
rows = [[item.strip('"') for item in row] for row in data[1:]]


# Lav DataFrame
ECB = pd.DataFrame(rows, columns=columns)

# Konverter datatyper
ECB['DATE'] = pd.to_datetime(ECB['DATE'])
ECB[ECB.columns[2]] = ECB[ECB.columns[2]].astype(float)
ECB = ECB.drop(columns=['TIME PERIOD'])

ECB

Unnamed: 0,DATE,HICP - Overall index (ICP.M.U2.N.000000.4.ANR)
0,1997-01-31,2.0
1,1997-02-28,1.8
2,1997-03-31,1.6
3,1997-04-30,1.3
4,1997-05-31,1.4
...,...,...
335,2024-12-31,2.4
336,2025-01-31,2.5
337,2025-02-28,2.3
338,2025-03-31,2.2


In [99]:
# Kør standardiseringsfunktionen
df_ECB = standardize_dates(ECB, "DATE")

# Se det nye format
df_ECB.head()

Unnamed: 0_level_0,HICP - Overall index (ICP.M.U2.N.000000.4.ANR)
Date,Unnamed: 1_level_1
1997-01-01,2.0
1997-02-01,1.8
1997-03-01,1.6
1997-04-01,1.3
1997-05-01,1.4


## HPI

In [101]:
file_path = 'HPI.csv'
HPI = pd.read_csv(file_path, sep=';', encoding='ISO-8859-1', header=0)

# Kør standardiseringsfunktionen
df_hpi = standardize_dates(HPI, "date")

# Se det nye format
df_hpi.head()

Unnamed: 0_level_0,HPI
Date,Unnamed: 1_level_1
2002-10-01,59.8
2003-01-01,60.7
2003-04-01,62.0
2003-07-01,63.1
2003-10-01,63.3


In [102]:
# ME = Month-END
# MS = Month-Start
#df_hpi = df_hpi.resample("MS").ffill()
#df_hpi  # Se de første 12 måneder

# Merge dataframes

## Case 1 

In [105]:
dfs = [df_inflation, df_ledige_procent, df_børn, df_econ_indikator, df_valutakurser, 
       df_transaktioner_medkort, df_ejendomsalg, df_hpi, df_folkekirken_medlemmer, df_total_skilsmisser, 
       df_total_hustande, df_fertilitetskvotienter, df_implicit_lønindeks,
       df_antal_beskæftigede, df_forbrugerforventninger, df_antal_ledige, df_befolkningens_udvikling, df_antal_kontanthjælp, df_antal_overtrædelse,
       df_bil_data, df_energi_data, df_vielser, Df_PPI_import_data, df_PPI_data, df_konjunkturbarometer, df_detailomsætningsindeks, df_ECB,
       df_lagged_inflation, df_niveau_0, df_niveau_1, df_niveau_2] # "


# Merge 
df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['Date'],
                                            how='outer'), dfs)

# Vigtigt!!
#df_merged = df_merged.loc["2008-01-01":"2025-01-01"]
df_merged = df_merged.loc["2012-01-01":"2024-12-01"]

# check missing values
missing_info = check_missing_values(df_merged)  


# Delete the last row, to match inflation
#df_merged = df_merged.iloc[:-1]  # Fjerner sidste række

# Display
df_merged

Total missing values in dataset: 562547


Unnamed: 0_level_0,Inflation,Bruttoledige (i pct. af arbejdsstyrken),F¿dte Drenge,F¿dte Piger,"B.1*g Bruttonationalprodukt, BNP",P.7 Import af varer og tjenester,P.71 Import af varer,P.72 Import af tjenester,Forsyning i alt,P.6 Eksport af varer og tjenester,...,10.4 Videregående uddannelse,10.5 Undervisning uden for niveau,"11.1 Restauranter, cafeer og kantiner mv.",11.2 Overnatningsfaciliteter,12.1 Personlig pleje,12.3 Andre personlige effekter,12.4 Daginstitutioner og social forsorg,12.5 Forsikring,12.6 Finansielle tjenester,12.7 Andre tjenester
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-01,2.763,5.9,29785.0,28131.0,467.5,231.9,143.8,88.1,699.4,255.1,...,90.4,90.4,92.8,102.8,98.9,96.7,96.8,98.7,86.9,91.7
2012-02-01,2.731,6.0,,,,,,,,,...,90.4,90.4,93.7,101.7,98.9,98.5,96.8,98.7,86.9,91.7
2012-03-01,2.610,6.1,,,,,,,,,...,90.4,90.4,93.9,101.7,98.7,98.6,96.8,98.7,86.9,91.7
2012-04-01,2.183,6.2,,,472.9,232.9,144.9,88.1,705.9,264.6,...,90.4,90.4,94.1,101.7,99.3,99.5,96.8,98.7,87.5,93.9
2012-05-01,2.181,6.2,,,,,,,,,...,90.4,90.4,95.0,106.5,99.6,99.7,98.2,98.7,87.6,93.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11-27,,,,,,,,,,,...,,,,,,,,,,
2024-11-28,,,,,,,,,,,...,,,,,,,,,,
2024-11-29,,,,,,,,,,,...,,,,,,,,,,
2024-11-30,,,,,,,,,,,...,,,,,,,,,,


In [106]:
#df_inflation = df_inflation.loc[df_merged.index]

# MIDAS

In [108]:
from midas.mix import mix_freq, mix_freq2
from midas.adl import estimate, forecast, midas_adl, rmse, estimate2,forecast2, midas_adl2

In [109]:
def build_midas_matrix(df, y_col, horizon=1, y_lags=0, verbose=True):
    y = df[[y_col]].dropna()
    y.index = pd.to_datetime(y.index)

    X_list = []
    valid_cols = []

    for col in df.columns:
        if col == y_col:
            continue

        series = df[[col]].dropna()
        series.index = pd.to_datetime(series.index)

        freq = infer_column_frequency(series)
        if freq is None:
            continue

        num_lags = freq_to_lags.get(freq[0].upper(), None)
        if num_lags is None:
            continue

        try:
            _, _, X, _, _, _ = mix_freq(y[y_col], series[col], num_lags, y_lags, horizon)
            X_list.append(X)
            valid_cols.append(col)
            if verbose:
                print(f"Added: {col} (freq: {freq}, lags: {num_lags})")
        except Exception as e:
            if verbose:
                print(f"Skipped: {col} (error: {str(e)})")
            continue


    
    if X_list:
        # Merge alle features på fælles datoer
        from functools import reduce
        X_total = reduce(lambda l, r: l.join(r, how='inner'), X_list)

        # Fjern rækker med for mange NaNs (valgfrit)
        X_total = X_total.dropna(thresh=int(0.9 * X_total.shape[1]))

        y_aligned = y.loc[X_total.index][y_col]
        return X_total, y_aligned, valid_cols
    else:
        raise ValueError("Ingen gyldige variabler kunne bruges.")


In [110]:
def infer_column_frequency(series):
    """Infer frequency of a time series (e.g., D, M, Q, A)"""
    try:
        return pd.infer_freq(series.dropna().index)
    except:
        return None

In [111]:
freq_to_lags = {
    'D': 30,    # daglig → 30 lags
    'B': 22,    # business days → 22
    'M': 6,     # månedlig → 6
    'Q': 4,     # kvartalsvis → 4
    'A': 2      # årlig → 2
}

In [112]:
X, y_final, used_columns = build_midas_matrix(df_merged, y_col="Inflation", horizon=1)

Added: Bruttoledige (i pct. af arbejdsstyrken) (freq: MS, lags: 6)
Added: B.1*g Bruttonationalprodukt, BNP (freq: QS-OCT, lags: 4)
Added: P.7 Import af varer og tjenester (freq: QS-OCT, lags: 4)
Added: P.71 Import af varer (freq: QS-OCT, lags: 4)
Added: P.72 Import af tjenester (freq: QS-OCT, lags: 4)
Added: Forsyning i alt (freq: QS-OCT, lags: 4)
Added: P.6 Eksport af varer og tjenester (freq: QS-OCT, lags: 4)
Added: P.61 Eksport af varer (freq: QS-OCT, lags: 4)
Added: P.62 Eksport af tjenester (freq: QS-OCT, lags: 4)
Added: P.31 Privatforbrug (freq: QS-OCT, lags: 4)
Added: P.31 Husholdningernes forbrugsudgifter (freq: QS-OCT, lags: 4)
Added: Køb af køretøjer (freq: QS-OCT, lags: 4)
Added: Andre varer (freq: QS-OCT, lags: 4)
Added: Tjenester i alt inkl. turisme (freq: QS-OCT, lags: 4)
Added: Tjenester i alt (freq: QS-OCT, lags: 4)
Added: Turistudgifter (freq: QS-OCT, lags: 4)
Added: Turistindtægter (freq: QS-OCT, lags: 4)
Added: P.31 Forbrugsudgifter i non-profit institutioner rettet 

In [200]:
X

Unnamed: 0_level_0,0,1,2,3,4,5,0,1,2,3,...,2,3,4,5,0,1,2,3,4,5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


# Standardize

In [115]:
from sklearn.impute import SimpleImputer

In [116]:
imputer = SimpleImputer(strategy="median")  # Eller "median"

X_imputed = imputer.fit_transform(df_merged)  # Imputer NaN i hele datasættet

# Konverter tilbage til DataFrame med kolonnenavne
X_new = pd.DataFrame(X_imputed, columns=df_merged.columns, index = df_merged.index)
missing_info = check_missing_values(X_new)  

Total missing values in dataset: 0


In [117]:
# Save
save_merged_data(X_new, filename = "Merged_dataset.csv")

✅ Filen 'Merged_dataset.csv' er gemt!


In [118]:
print(f" {X_new.isna().sum().sum()}")


 0
