In [1]:
# imports
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
import pandas as pd

# data cleaning
# arbeitslosenquote
arbeitslosenquote = pd.read_csv(
    "./original data/Arbeitslosenquote.csv", sep=";", decimal=","
)

cols_to_clean_dots = ["Bestand Arbeitslose", "Zugang Arbeitslose", "Abgang Arbeitslose"]
for col in cols_to_clean_dots:
    arbeitslosenquote[col] = (
        arbeitslosenquote[col]
        .astype(str)
        .str.replace(".", "", regex=False)
        .replace("", "0")
        .astype(int)
    )

arbeitslosenquote["Arbeitslosenquote"] = (
    arbeitslosenquote["Arbeitslosenquote"]
    .astype(str)
    .str.replace(",", ".", regex=False)
    .replace("", "0")
    .astype(float)
)

month_map = {
    "Januar": "01",
    "Februar": "02",
    "März": "03",
    "April": "04",
    "Mai": "05",
    "Juni": "06",
    "Juli": "07",
    "August": "08",
    "September": "09",
    "Oktober": "10",
    "November": "11",
    "Dezember": "12",
}

def convert_to_date(berichtsmonat_str):
    try:
        month_name, year_str = berichtsmonat_str.split(" ")
        month_num = month_map.get(month_name)
        return f"{year_str}-{month_num}-01"
    except Exception:
        return pd.NaT


arbeitslosenquote["DATE"] = (
    arbeitslosenquote["Berichtsmonat"].astype(str).apply(convert_to_date)
)
arbeitslosenquote["DATE"] = pd.to_datetime(arbeitslosenquote["DATE"], errors="coerce")
arbeitslosenquote["DATE"] = arbeitslosenquote["DATE"] + pd.offsets.MonthEnd(0)

arbeitslosenquote = arbeitslosenquote.drop(columns=["Berichtsmonat"])
cols_order = ["DATE"] + [col for col in arbeitslosenquote.columns if col != "DATE"]
arbeitslosenquote = arbeitslosenquote[cols_order]

# IFO beschaeftigungsbarometer
beschaeftigungsbarometer = pd.read_csv("./original data/IFO_Beschäftigungsbarometer.csv")
beschaeftigungsbarometer["DATE"] = pd.to_datetime(beschaeftigungsbarometer["DATE"], errors="coerce") + pd.offsets.MonthEnd(0)
beschaeftigungsbarometer["DATE"] = beschaeftigungsbarometer["DATE"].dt.strftime("%Y-%m-%d")

# IFO geschäftsklima
geschaeftsklima = pd.read_csv("./original data/IFO_Geschäftsklima.csv")
# Strip whitespace from the DATE column
geschaeftsklima["DATE"] = geschaeftsklima["DATE"].str.strip()
# Convert "DATE" from MM/YYYY to YYYY-MM-DD (set to last day of month)
geschaeftsklima["DATE"] = pd.to_datetime("01/" + geschaeftsklima["DATE"], format="%d/%m/%Y") + pd.offsets.MonthEnd(0)
geschaeftsklima["DATE"] = geschaeftsklima["DATE"].dt.strftime("%Y-%m-%d")
geschaeftsklima["DATE"] = pd.to_datetime(geschaeftsklima["DATE"], errors="coerce") + pd.offsets.MonthEnd(0)
geschaeftsklima["DATE"] = geschaeftsklima["DATE"].dt.strftime("%Y-%m-%d")

# bmw absatzzahlen
bmw_absatzzahlen = pd.read_csv("./original data/BMW_Absatzzahlen.csv")
bmw_absatzzahlen["DATE"] = pd.to_datetime(bmw_absatzzahlen["DATE"], errors="coerce") + pd.offsets.MonthEnd(0)
bmw_absatzzahlen["DATE"] = bmw_absatzzahlen["DATE"].dt.strftime("%Y-%m-%d")

# # bruttoinlandsprodukt
# bruttoinlandsprodukt = pd.read_csv("./original data/Bruttoinlandsprodukt.csv", sep=";", decimal=",")
# # Remove specified columns
# bruttoinlandsprodukt = bruttoinlandsprodukt.drop([
#   "in jeweiligen Preisen, Mrd. EUR, saison- und kalenderbereinigte Werte nach X13 JDemetra+",
#   "preisbereinigt, Kettenindex (2020=100), saison- und kalenderbereinigte Werte nach X13 JDemetra+"
# ], axis=1)
# # Convert "Datum" to "DATE" in YYYY-MM-DD format (set to last day of quarter)
# bruttoinlandsprodukt["DATE"] = pd.to_datetime(bruttoinlandsprodukt["Datum"], format="%d/%m/%Y") + pd.offsets.MonthEnd(0)
# bruttoinlandsprodukt["DATE"] = bruttoinlandsprodukt["DATE"].dt.strftime("%Y-%m-%d")

# # Convert "Datum" to "DATE" in YYYY-MM-DD format
# bruttoinlandsprodukt["DATE"] = pd.to_datetime(bruttoinlandsprodukt["Datum"], format="%d/%m/%Y").dt.strftime("%Y-%m-%d")
# bruttoinlandsprodukt = bruttoinlandsprodukt.drop(columns=["Datum"])

# # Reorder columns to have "DATE" first
# cols = ["DATE"] + [col for col in bruttoinlandsprodukt.columns if col != "DATE"]
# bruttoinlandsprodukt = bruttoinlandsprodukt[cols]
# bruttoinlandsprodukt = bruttoinlandsprodukt.rename(columns={
#   "in jeweiligen Preisen, Mrd. EUR, Originalwert": "BIP",
#   "preisbereinigt, Kettenindex (2020=100), Originalwert": "BIP preisbereinigt"
# })

# euribor
euribor_1y = pd.read_csv("./original data/Euribor_1Y.csv", sep=",", decimal=".")

# Convert DATE column to datetime (assuming format is YYYY-MM-DD)
euribor_1y["DATE"] = pd.to_datetime(euribor_1y["DATE"], format="%Y-%m-%d", errors="coerce")
euribor_1y["DATE"] = euribor_1y["DATE"] + pd.offsets.MonthEnd(0)
euribor_1y["DATE"] = euribor_1y["DATE"].dt.strftime("%Y-%m-%d")

euribor_1y = euribor_1y.rename(
    columns={
        "Euribor 1-year - Historical close, average of observations through period (FM.M.U2.EUR.RT.MM.EURIBOR1YD_.HSTA)": "Euribor 1Y"
    }
)
euribor_1y = euribor_1y.drop(columns=["TIME PERIOD"])

euribor_3m = pd.read_csv("./original data/Euribor_3M.csv", sep=",", decimal=".")
# Convert DATE column to datetime (assuming format is YYYY-MM-DD)
euribor_3m["DATE"] = pd.to_datetime(euribor_3m["DATE"], format="%Y-%m-%d", errors="coerce")
euribor_3m["DATE"] = euribor_3m["DATE"] + pd.offsets.MonthEnd(0)
euribor_3m["DATE"] = euribor_3m["DATE"].dt.strftime("%Y-%m-%d")
euribor_3m = euribor_3m.rename(
    columns={
        "Euribor 3-month - Historical close, average of observations through period (FM.M.U2.EUR.RT.MM.EURIBOR3MD_.HSTA)": "Euribor 3M"
    }
)
euribor_3m = euribor_3m.drop(columns=["TIME PERIOD"])

euribor_6m = pd.read_csv("./original data/Euribor_6M.csv", sep=",", decimal=".")
# Convert DATE column to datetime (assuming format is YYYY-MM-DD)
euribor_6m["DATE"] = pd.to_datetime(euribor_6m["DATE"], format="%Y-%m-%d", errors="coerce")
euribor_6m["DATE"] = euribor_6m["DATE"] + pd.offsets.MonthEnd(0)
euribor_6m["DATE"] = euribor_6m["DATE"].dt.strftime("%Y-%m-%d")
euribor_6m = euribor_6m.rename(
    columns={
        "Euribor 6-month - Historical close, average of observations through period (FM.M.U2.EUR.RT.MM.EURIBOR6MD_.HSTA)": "Euribor 6M"
    }
)
euribor_6m = euribor_6m.drop(columns=["TIME PERIOD"])


# gebrauchtwagenpreisindex
gebrauchtwagenpreisindex = pd.read_csv("./original data/Gebrauchtwagenpreisindex.csv")
gebrauchtwagenpreisindex["DATE"] = pd.to_datetime(gebrauchtwagenpreisindex["DATE"], errors="coerce") + pd.offsets.MonthEnd(0)
gebrauchtwagenpreisindex["DATE"] = gebrauchtwagenpreisindex["DATE"].dt.strftime("%Y-%m-%d")

# google trends
gt_auto_finanzierung = pd.read_csv("./original data/GT_Auto_Finanzierung.csv")
gt_auto_leasing = pd.read_csv("./original data/GT_Auto_Leasing.csv")
gt_bmw_finanzierung = pd.read_csv("./original data/GT_BMW_Finanzierung.csv")
gt_bmw_leasing = pd.read_csv("./original data/GT_BMW_Leasing.csv")
gt_bmw = pd.read_csv("./original data/GT_BMW.csv")

# Convert "Monat" columns in Google Trends data to "DATE" in YYYY-MM-DD format (set to last day of month)
def convert_monat_to_date(df, monat_col="DATE"):
  df["DATE"] = pd.to_datetime(df[monat_col] + "-01", format="%Y-%m-%d") + pd.offsets.MonthEnd(0)
  df["DATE"] = df["DATE"].dt.strftime("%Y-%m-%d")
  return df

gt_auto_finanzierung = convert_monat_to_date(gt_auto_finanzierung)
gt_auto_leasing = convert_monat_to_date(gt_auto_leasing)
gt_bmw_finanzierung = convert_monat_to_date(gt_bmw_finanzierung)
gt_bmw_leasing = convert_monat_to_date(gt_bmw_leasing)
gt_bmw = convert_monat_to_date(gt_bmw)

# verbraucherpreisindex
verbraucherpreisindex = pd.read_csv("./original data/Verbraucherpreisindex.csv", sep=";", decimal=",")
# Convert "Datum" to datetime, then set to last day of month
verbraucherpreisindex["DATE"] = pd.to_datetime(verbraucherpreisindex["Datum"], format="%d/%m/%Y") + pd.offsets.MonthEnd(0)
verbraucherpreisindex["DATE"] = verbraucherpreisindex["DATE"].dt.strftime("%Y-%m-%d")
verbraucherpreisindex = verbraucherpreisindex.drop(columns=["Datum"])
cols = ["DATE"] + [col for col in verbraucherpreisindex.columns if col != "DATE"]
verbraucherpreisindex = verbraucherpreisindex[cols]



Unnamed: 0,DATE,BIP,BIP preisbereinigt
0,2000-07-01,537.70,82.51
1,2000-10-01,547.59,83.34
2,2001-01-01,536.83,82.36
3,2001-04-01,538.24,82.34
4,2001-07-01,553.54,83.66
...,...,...,...
93,2023-10-01,1078.64,105.57
94,2024-01-01,1064.83,104.90
95,2024-04-01,1060.79,103.12
96,2024-07-01,1078.22,105.10


In [7]:
# Merge all dataframes on "DATE"

# List of all DataFrames to merge
from functools import reduce


dfs = [
    bmw_absatzzahlen,
    gt_auto_leasing,
    gt_auto_finanzierung,
    gt_bmw_leasing,
    gt_bmw_finanzierung,
    gt_bmw,
    euribor_3m,
    euribor_6m,
    euribor_1y,
    gebrauchtwagenpreisindex,
    arbeitslosenquote,
    beschaeftigungsbarometer,
    geschaeftsklima,
    # bruttoinlandsprodukt,
    verbraucherpreisindex
]

# Ensure all DATE columns are in datetime format
for df_item in dfs:
    if "DATE" in df_item.columns:
        df_item["DATE"] = pd.to_datetime(df_item["DATE"])

# Full outer join on the DATE column for all DataFrames
merged = reduce(lambda left, right: pd.merge(left, right, on="DATE", how="outer"), dfs)
merged.to_csv("./cleaned data/all_data.csv", index=False)
merged

Unnamed: 0,DATE,SALES,Auto Leasing,Auto Finanzierung,BMW Leasing,BMW Finanzierung,BMW,Euribor 3M,Euribor 6M,Euribor 1Y,...,Geschäftsklima,Geschäftslage,Geschäftserwartungen,Konjunkturampel,Verbraucherpreisindex,Index der Einzelhandelspreise,Index der Erzeugerpreise gewerblicher Produkte,Index der Growsshandelsverkaufspreise,Index der Einfuhrpreise,Index der Ausfuhrpreise
0,1994-01-31,,,,,,,6.9100,6.6700,6.3400,...,,,,,,,,,,
1,1994-02-28,,,,,,,6.8600,6.6800,6.4200,...,,,,,,,,,,
2,1994-03-31,,,,,,,6.7500,6.6500,6.5500,...,,,,,,,,,,
3,1994-04-30,,,,,,,6.5700,6.5100,6.4600,...,,,,,,,,,,
4,1994-05-31,,,,,,,6.2400,6.2100,6.2500,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372,2025-01-31,13541.0,67.0,93.0,80.0,65.0,23.0,2.7031,2.6121,2.5221,...,85.2,86.0,84.3,26.9,120.3,122.8,1282,117.4,1152,1163
373,2025-02-28,12601.0,75.0,100.0,85.0,67.0,24.0,2.5249,2.4596,2.4069,...,85.3,85.0,85.6,30.2,120.8,123.1,1280,118.1,1155,1167
374,2025-03-31,14930.0,72.0,94.0,96.0,68.0,27.0,2.4424,2.3854,2.3984,...,86.7,85.7,87.7,69.9,121.2,123.7,1271,117.9,1143,1162
375,2025-04-30,16113.0,65.0,96.0,85.0,68.0,26.0,2.2482,2.2010,2.1424,...,86.9,86.4,87.4,70.6,121.7,123.9,.,117.8,.,.


In [10]:
# descriptive statistics
df = pd.read_csv('./cleaned data/all_data.csv')
df.columns = df.columns.str.strip()  # Strip whitespace from column names


df_ds = df.describe()
df_ds = df_ds.transpose()
df_ds

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SALES,124.0,18390.33,3214.064116,10913.0,15769.0,18571.5,20843.5,25536.0
Auto Leasing,125.0,48.688,16.397982,24.0,37.0,45.0,61.0,100.0
Auto Finanzierung,125.0,68.792,11.744797,42.0,62.0,66.0,74.0,100.0
BMW Leasing,125.0,70.68,11.457214,46.0,61.0,70.0,80.0,100.0
BMW Finanzierung,125.0,61.336,13.619213,39.0,51.0,59.0,71.0,100.0
BMW,125.0,23.976,7.334699,18.0,21.0,24.0,25.0,100.0
Euribor 3M,376.0,2.255088,2.170576,-0.582,0.093475,2.14585,3.937325,7.58
Euribor 6M,376.0,2.343862,2.14682,-0.5446,0.268775,2.19575,4.002225,7.74
Euribor 1Y,376.0,2.472024,2.137245,-0.5047,0.480125,2.33155,4.106,8.02
Gebrauchtwagenpreisindex,124.0,126.7952,18.987954,100.0,110.075,118.55,139.625,171.6


In [12]:
# seasonal adjustment
# data by IFO-Institut already seasonally adjusted

from statsmodels.tsa.seasonal import STL

# List of columns to be seasonally adjusted
seasonal_cols = [
    'SALES',
    'Auto Leasing',
    'Auto Finanzierung',
    'BMW Leasing',
    'BMW Finanzierung',
    'BMW',
    'Gebrauchtwagenpreisindex',
    'Bestand Arbeitslose',
    'Zugang Arbeitslose',
    'Abgang Arbeitslose',
    'Arbeitslosenquote',
    'Verbraucherpreisindex',
    'Index der Einzelhandelspreise',
    'Index der Erzeugerpreise gewerblicher Produkte',
    'Index der Grosshandelsverkaufspreise',
    'Index der Einfuhrpreise',
    'Index der Ausfuhrpreise'
]

df_sa = df.copy()
df_sa['DATE'] = pd.to_datetime(df_sa['DATE'])
df_sa = df_sa.set_index('DATE')

for col in seasonal_cols:
  stl = STL(df_sa[col], period=12, robust=True)
  res = stl.fit()
  df_sa[col] = res.trend + res.resid  # seasonally adjusted

df_sa = df_sa.reset_index()

df_sa

ValueError: could not convert string to float: '90,6'

In [None]:
# feature engineering
df_fe = df_sa.copy()
df_fe["DATE"] = pd.to_datetime(df_fe["DATE"])

# Create lagged features
absatz_idx = df_fe.columns.get_loc('Absatzzahlen')
for lag in range(1, 18):
  col_name = f'Absatzzahlen_t{lag}'
  df_fe.insert(absatz_idx + lag, col_name, df_fe['Absatzzahlen'].shift(lag))

#  Create growth rate features
for col in seasonal_cols:
  if col != 'Absatzzahlen':
    df_fe[f'{col}_growth'] = df_fe[col].pct_change()
df_fe

In [None]:
# modeling

# correlation
# Define target columns (Absatzzahlen and its lags)
target_cols = ['Absatzzahlen'] + [f'Absatzzahlen_t{i}' for i in range(1, 18)]

# Define other columns (all columns except DATE and target_cols)
other_cols = [col for col in df_fe.columns if col not in target_cols + ['DATE']]

correlation_methods = ['pearson', 'spearman']

for method in correlation_methods:
    # Calculate the correlation matrix between target_cols and other_cols
    correlation_matrix = df_fe[target_cols + other_cols].corr(method=method)

    # Select the relevant part of the correlation matrix
    # This shows the correlation of each 'other_col' with each 'target_col'
    correlation_subset = correlation_matrix.loc[other_cols, target_cols]

    # Plot the heatmap
    plt.figure(figsize=(16, 8))
    sns.heatmap(correlation_subset, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
    plt.title(f'{method.capitalize()} Correlation Heatmap: Absatzzahlen (and lags) vs. Other Variables')
    plt.show()

    print(f"\n{method.capitalize()} Correlation Subset:")
    display(correlation_subset)

In [None]:
# indexing
df_sa["DATE"] = pd.to_datetime(df_sa["DATE"])
df_sa = df_sa.set_index("DATE")

columns_to_index = [
    "Absatzzahlen",
    "BMW Finanzierung",
    "BMW Leasing",
    "Auto Finanzierung",
    "Auto Leasing",
    "Gebrauchtwagenindex",
    "Geschäftsklima",
    "Geschäftslage",
    "Geschäftserwartungen",
    "Verbraucherpreisindex",
    "Arbeitslosenquote",
    "Bestand Arbeitslose",
    "Zugang Arbeitslose",
    "Abgang Arbeitslose",
]

In [None]:
# plotting

In [None]:
# multiple linear regression analysis

In [None]:
# regression model