In [1]:
import pandas as pd

In [2]:
# Load data
# Paths of the two input tables (ministry-level budget and pension insurance).
ministry_file = 'project_chart4_ministries_budget.csv'
pension_file = 'project_chart4_pension_insurance.csv'

# Read both CSVs in a fixed order: ministry table first, pension table second.
ministry_df, pension_df = (pd.read_csv(path) for path in (ministry_file, pension_file))

# Row counts as a quick sanity check of what was loaded.
print(f"{len(ministry_df)} ministry-level records")
print(f"{len(pension_df)} pension insurance records")

310 ministry-level records
13 pension insurance records


In [4]:
# Remove irrelevant columns
# errors='ignore' means a column that is absent from a frame is simply skipped.
columns_to_remove = ['Budgetnummer', 'Relativer_Wert_zu_Eltern_Prozent']
ministry_df, pension_df = (
    frame.drop(columns=columns_to_remove, errors='ignore')
    for frame in (ministry_df, pension_df)
)

In [7]:
# Convert ID column to numeric
# errors='coerce' turns any value that cannot be parsed as a number into NaN.
for frame in (ministry_df, pension_df):
    frame['ID'] = pd.to_numeric(frame['ID'], errors='coerce')

In [8]:
# Get years
# Distinct values of the 'Jahr' column of the ministry table, in ascending order.
years = sorted(pd.unique(ministry_df['Jahr']))

In [9]:
# STEP 1: Calculate Total Federal Budget
# Builds one summary row per year holding the sum of 'Wert_Euro' over all
# ministry rows of that year.
total_rows = []
for year in years:
    in_year = ministry_df['Jahr'] == year
    total_budget = ministry_df.loc[in_year, 'Wert_Euro'].sum()

    # ID 0 identifies the total row; its share is fixed at 100.0.
    total_rows.append({
        'Jahr': year,
        'ID': 0,
        'Kategorie': 'Total Federal Budget',
        'Wert_Euro': total_budget,
        'Relativer_Wert_Prozent': 100.0,
    })
    print(f"  Year {year}: {total_budget:,.0f} €")

totals_df = pd.DataFrame(total_rows)

  Year 2014: 296,500,000,000 €
  Year 2015: 306,900,000,000 €
  Year 2016: 316,900,000,000 €
  Year 2017: 329,100,000,000 €
  Year 2018: 343,600,000,000 €
  Year 2019: 356,400,000,000 €
  Year 2020: 508,529,758,000 €
  Year 2021: 572,725,714,000 €
  Year 2022: 495,791,475,000 €
  Year 2023: 461,211,782,000 €
  Year 2024: 476,807,656,000 €
  Year 2025: 502,546,135,000 €
  Year 2026: 524,540,138,000 €


In [11]:
# STEP 2: Calculate Other Federal Budget (exclude 11, 14, 17)
# Everything whose ID is not in excluded_ids is summed per year into a single
# "Other Federal Budget" row.
excluded_ids = [11, 14, 17]
is_excluded = ministry_df['ID'].isin(excluded_ids)
other_ministries_df = ministry_df[~is_excluded]

total_other_rows = []
for year in years:
    in_year = other_ministries_df['Jahr'] == year
    total_other = other_ministries_df.loc[in_year, 'Wert_Euro'].sum()

    # ID 99 identifies this aggregate row; 0.0 is only a placeholder share here.
    total_other_rows.append({
        'Jahr': year,
        'ID': 99,
        'Kategorie': 'Other Federal Budget',
        'Wert_Euro': total_other,
        'Relativer_Wert_Prozent': 0.0,
    })
    print(f"  Year {year}: {total_other:,.0f} €")

total_other_df = pd.DataFrame(total_other_rows)

  Year 2014: 134,125,806,000 €
  Year 2015: 138,780,337,000 €
  Year 2016: 143,619,496,000 €
  Year 2017: 144,989,521,000 €
  Year 2018: 155,674,521,000 €
  Year 2019: 157,463,613,000 €
  Year 2020: 278,573,128,000 €
  Year 2021: 347,668,631,000 €
  Year 2022: 271,705,706,000 €
  Year 2023: 231,295,688,000 €
  Year 2024: 235,306,925,000 €
  Year 2025: 235,698,610,000 €
  Year 2026: 227,847,772,000 €


In [12]:
# STEP 3: Calculate Other Labour & Social (Ministry 11 minus Pension 1102)
def _first_or_zero(values, label, year):
    """Return the first element of ``values``, or 0 when ``values`` is empty.

    A missing row (fallback to 0) or additional rows (only the first is used)
    are reported with a printed warning. Without the warning the difference
    computed below would be distorted with no visible sign.

    Args:
        values: 1-D array of candidate 'Wert_Euro' values for one year.
        label: Name of the data source, used only in the warning text.
        year: Year the lookup refers to, used only in the warning text.

    Returns:
        ``values[0]`` if at least one value exists, otherwise 0.
    """
    if len(values) == 0:
        print(f"  WARNING Year {year}: no {label} row found, using 0")
        return 0
    if len(values) > 1:
        print(f"  WARNING Year {year}: {len(values)} {label} rows found, using the first")
    return values[0]


other_11_rows = []
for year in years:
    # Get Labour & Social ministry total (row with ID 11 for this year)
    labour_social = ministry_df[(ministry_df['Jahr'] == year) & (ministry_df['ID'] == 11)]['Wert_Euro'].values
    labour_social_value = _first_or_zero(labour_social, 'Ministry 11', year)

    # Get Pension Insurance subcategory (pension_df is filtered by year only)
    pension = pension_df[pension_df['Jahr'] == year]['Wert_Euro'].values
    pension_value = _first_or_zero(pension, 'pension insurance', year)

    # Calculate difference
    other_value = labour_social_value - pension_value

    # ID 98 identifies this derived row; 0.0 is only a placeholder share here.
    other_11_row = {
        'Jahr': year,
        'ID': 98,
        'Kategorie': 'Other Labour & Social Expenses',
        'Wert_Euro': other_value,
        'Relativer_Wert_Prozent': 0.0
    }
    other_11_rows.append(other_11_row)
    print(f"  Year {year}: {other_value:,.0f} € (Ministry 11: {labour_social_value:,.0f} - Pension: {pension_value:,.0f})")

other_11_df = pd.DataFrame(other_11_rows)

  Year 2014: 33,550,830,000 € (Ministry 11: 121,979,310,000 - Pension: 88,428,480,000)
  Year 2015: 35,943,656,000 € (Ministry 11: 126,309,918,000 - Pension: 90,366,262,000)
  Year 2016: 36,617,812,000 € (Ministry 11: 129,888,984,000 - Pension: 93,271,172,000)
  Year 2017: 39,330,170,000 € (Ministry 11: 137,582,419,000 - Pension: 98,252,249,000)
  Year 2018: 39,037,233,000 € (Ministry 11: 139,179,759,000 - Pension: 100,142,526,000)
  Year 2019: 39,931,302,000 € (Ministry 11: 145,260,251,000 - Pension: 105,328,949,000)
  Year 2020: 60,780,426,000 € (Ministry 11: 170,682,386,000 - Pension: 109,901,960,000)
  Year 2021: 50,251,075,000 € (Ministry 11: 164,920,480,000 - Pension: 114,669,405,000)
  Year 2022: 44,295,820,000 € (Ministry 11: 161,080,980,000 - Pension: 116,785,160,000)
  Year 2023: 45,179,870,000 € (Ministry 11: 166,229,393,000 - Pension: 121,049,523,000)
  Year 2024: 48,374,468,000 € (Ministry 11: 175,675,498,000 - Pension: 127,301,030,000)
  Year 2025: 55,918,513,000 € (Minis

In [15]:
# STEP 4: Build final_df
# Ministries 14 and 17 are carried over as their own rows.
ministry_ids = [14, 17]
selected_ministries = ministry_df.loc[ministry_df['ID'].isin(ministry_ids)].copy()

# Stack the derived rows, the pension rows and the selected ministries
# into one table with a fresh 0..n-1 index.
parts = [
    other_11_df,
    pension_df,
    total_other_df,
    totals_df,
    selected_ministries,
]
final_df = pd.concat(parts, ignore_index=True)

print(f"  ✓ Final dataset has {len(final_df)} total records")

# Order rows by year, then by ID within each year
final_df = final_df.sort_values(by=['Jahr', 'ID'])

  ✓ Final dataset has 78 total records


In [17]:
# STEP 5: Translate to English
# Exact-match lookup from German category names to English labels.
# Both name variants of ministry 17 map to the same English label.
translation_map = {
    '14 Bundesministerium der Verteidigung': 'Defense',
    '17 Bundesministerium für Bildung, Familie, Senioren, Frauen und Jugend': 'Education & Family',
    '17 Bundesministerium für Familie, Senioren, Frauen und Jugend': 'Education & Family'
}

# Categories without an entry in the map are left unchanged.
translated = final_df['Kategorie'].replace(to_replace=translation_map)
final_df['Kategorie'] = translated
print(f"  ✓ Translated categories")

  ✓ Translated categories


In [19]:
# STEP 6: Calculate Relativer_Wert_Prozent
# Each row's share is its 'Wert_Euro' divided by the 'Total Federal Budget'
# value of the same year, times 100.

# Year -> total budget lookup. If a year has several total rows, the first
# one is used.
totals_by_year = (
    final_df.loc[final_df['Kategorie'] == 'Total Federal Budget']
    .drop_duplicates(subset='Jahr')
    .set_index('Jahr')['Wert_Euro']
)

# Fail with an explicit message when a year has no total row to divide by.
years_without_total = [
    year for year in final_df['Jahr'].unique()
    if year not in totals_by_year.index
]
if years_without_total:
    raise ValueError(
        f"No 'Total Federal Budget' row for year(s): {years_without_total}"
    )

# Vectorised: look up every row's yearly total, then divide once.
final_df['Relativer_Wert_Prozent'] = (
    final_df['Wert_Euro'] / final_df['Jahr'].map(totals_by_year) * 100
)

print("  ✓ Calculated all percentages")

  ✓ Calculated all percentages


In [20]:
# STEP 7: Validation - Check if percentages sum to 100%
# For each year, the shares of all rows except the total row (ID 0) are summed.
# The sums are printed and then enforced, so an inconsistent dataset stops the
# notebook instead of passing unnoticed.
tolerance_pct = 0.01  # matches the two decimals shown in the printed report
inconsistent_years = []
for year in sorted(final_df['Jahr'].unique()):
    year_data = final_df[(final_df['Jahr'] == year) & (final_df['ID'] != 0)]
    total_share = year_data['Relativer_Wert_Prozent'].sum()
    print(f"Year {year}: {total_share:.2f}%")
    # Written as "not (<=)" so that a NaN sum is also flagged.
    if not (abs(total_share - 100) <= tolerance_pct):
        inconsistent_years.append(year)

# Show final summary
print(final_df.groupby('Kategorie')['Wert_Euro'].count())

# Checked last so the full report above is printed even when the check fails.
assert not inconsistent_years, (
    f"Shares do not sum to 100% for year(s): {inconsistent_years}"
)

Year 2014: 100.00%
Year 2015: 100.00%
Year 2016: 100.00%
Year 2017: 100.00%
Year 2018: 100.00%
Year 2019: 100.00%
Year 2020: 100.00%
Year 2021: 100.00%
Year 2022: 100.00%
Year 2023: 100.00%
Year 2024: 100.00%
Year 2025: 100.00%
Year 2026: 100.00%
Kategorie
Defense                           13
Education & Family                13
Other Federal Budget              13
Other Labour & Social Expenses    13
Subsidies Pension Insurance       13
Total Federal Budget              13
Name: Wert_Euro, dtype: int64


In [21]:
# STEP 8: Export to CSV
# Write the combined table without the DataFrame index column.
output_file = 'project_chart4_federal_budget.csv'
final_df.to_csv(output_file, index=False)
print(f"\n✓ Success! Data saved to '{output_file}'")
print(f"  Total records: {len(final_df)}")

# Show sample data: the rows of year 2014, restricted to the report columns
sample_columns = ['Jahr', 'Kategorie', 'Wert_Euro', 'Relativer_Wert_Prozent']
print(final_df.loc[final_df['Jahr'] == 2014, sample_columns])


✓ Success! Data saved to 'project_chart4_federal_budget.csv'
  Total records: 78
    Jahr                       Kategorie     Wert_Euro  Relativer_Wert_Prozent
39  2014            Total Federal Budget  2.965000e+11              100.000000
52  2014                         Defense  3.243538e+10               10.939419
53  2014              Education & Family  7.959508e+09                2.684488
0   2014  Other Labour & Social Expenses  3.355083e+10               11.315626
26  2014            Other Federal Budget  1.341258e+11               45.236360
13  2014     Subsidies Pension Insurance  8.842848e+10               29.824108


In [22]:
# Combining Pension Spending & Tax Revenue

In [26]:
# Load the tax revenue table and show its first 11 rows
tax_revenue_file = 'project_chart4_federal_tax_revenue.csv'
tax_revenue_df = pd.read_csv(tax_revenue_file)
print(f"✓ Loaded tax revenue data: {len(tax_revenue_df)} records")
print(tax_revenue_df.iloc[:11])

✓ Loaded tax revenue data: 11 records
    time      value value_unit
0   2014  270746283   Tsd. EUR
1   2015  281607698   Tsd. EUR
2   2016  289017935   Tsd. EUR
3   2017  309361188   Tsd. EUR
4   2018  322358667   Tsd. EUR
5   2019  329052167   Tsd. EUR
6   2020  283114831   Tsd. EUR
7   2021  313667447   Tsd. EUR
8   2022  337209479   Tsd. EUR
9   2023  356041655   Tsd. EUR
10  2024  374948728   Tsd. EUR


In [27]:
# Get only Pension Insurance from final_df
# Keep year and amount of the 'Subsidies Pension Insurance' rows and give the
# amount column a name that stays unambiguous after merging.
is_pension = final_df['Kategorie'] == 'Subsidies Pension Insurance'
pension_only = (
    final_df.loc[is_pension, ['Jahr', 'Wert_Euro']]
    .rename(columns={'Wert_Euro': 'Pension_Spending_Euro'})
)

In [28]:
# Prepare tax revenue data
# Keep only year and value, renamed so the year column matches 'Jahr' in the
# budget tables.
tax_clean = tax_revenue_df.loc[:, ['time', 'value']].rename(
    columns={'time': 'Jahr', 'value': 'Tax_Revenue_Tsd_Euro'}
)

In [31]:
# Convert from Tsd. EUR to EUR
# The factor 1000 is only correct when every source row is reported in
# thousand euros, so the unit column is checked before scaling.
unexpected_units = (
    set(tax_revenue_df['value_unit'].astype(str).str.strip().unique()) - {'Tsd. EUR'}
)
if unexpected_units:
    raise ValueError(
        f"Tax revenue contains units other than 'Tsd. EUR': {sorted(unexpected_units)}"
    )

# Add the euro column and drop the thousand-euro column it was derived from.
tax_clean = (
    tax_clean
    .assign(Tax_Revenue_Euro=tax_clean['Tax_Revenue_Tsd_Euro'] * 1000)
    .drop(columns=['Tax_Revenue_Tsd_Euro'])
)

In [33]:
# Merge both
# Inner join on 'Jahr': only years present in both tables are kept.
combined_df = tax_clean.merge(pension_only, on='Jahr', how='inner')

In [40]:
# Convert to Billions for better readability
# Each euro column gets a companion column in billions, rounded to 2 decimals.
euros_per_billion = 1_000_000_000
for euro_col, billion_col in (
    ('Tax_Revenue_Euro', 'Tax_Revenue_Billion'),
    ('Pension_Spending_Euro', 'Pension_Spending_Billion'),
):
    combined_df[billion_col] = (combined_df[euro_col] / euros_per_billion).round(2)

In [42]:
# Calculate pension as percentage of tax revenue
# Computed from the unrounded euro columns, then rounded to 1 decimal.
pension_share = combined_df['Pension_Spending_Euro'] / combined_df['Tax_Revenue_Euro'] * 100
combined_df['Pension_Percent_of_Tax'] = pension_share.round(1)

print(f"✓ Combined data: {len(combined_df)} years")
report_columns = ['Jahr', 'Tax_Revenue_Billion', 'Pension_Spending_Billion', 'Pension_Percent_of_Tax']
print(combined_df[report_columns])

✓ Combined data: 11 years
    Jahr  Tax_Revenue_Billion  Pension_Spending_Billion  \
0   2014               270.75                     88.43   
1   2015               281.61                     90.37   
2   2016               289.02                     93.27   
3   2017               309.36                     98.25   
4   2018               322.36                    100.14   
5   2019               329.05                    105.33   
6   2020               283.11                    109.90   
7   2021               313.67                    114.67   
8   2022               337.21                    116.79   
9   2023               356.04                    121.05   
10  2024               374.95                    127.30   

    Pension_Percent_of_Tax  
0                     32.7  
1                     32.1  
2                     32.3  
3                     31.8  
4                     31.1  
5                     32.0  
6                     38.8  
7                     36.6  
8   

In [50]:
# Create LONG FORMAT for Vega-Lite grouped bar chart
# The two billion-euro columns are stacked into Category/Amount pairs, while
# 'Jahr' and 'Pension_Percent_of_Tax' are repeated on every resulting row.
wide_columns = ['Jahr', 'Tax_Revenue_Billion', 'Pension_Spending_Billion', 'Pension_Percent_of_Tax']
long_df = combined_df[wide_columns].melt(
    id_vars=['Jahr', 'Pension_Percent_of_Tax'],
    value_vars=['Tax_Revenue_Billion', 'Pension_Spending_Billion'],
    var_name='Category',
    value_name='Amount_Billion_Euro',
)

In [51]:
# Rename for display
# Swap the internal column names stored in 'Category' for readable labels.
category_labels = {
    'Tax_Revenue_Billion': 'Tax Revenue',
    'Pension_Spending_Billion': 'Pension Spending',
}
long_df['Category'] = long_df['Category'].replace(to_replace=category_labels)

In [52]:
# Set Pension_Percent_of_Tax to None for Tax Revenue rows
# After this, the percentage is only present on the Pension Spending rows.
is_tax_row = long_df['Category'].eq('Tax Revenue')
long_df.loc[is_tax_row, 'Pension_Percent_of_Tax'] = None

print("\n✓ Long format for chart:")
print(long_df)


✓ Long format for chart:
    Jahr  Pension_Percent_of_Tax          Category  Amount_Billion_Euro
0   2014                     NaN       Tax Revenue               270.75
1   2015                     NaN       Tax Revenue               281.61
2   2016                     NaN       Tax Revenue               289.02
3   2017                     NaN       Tax Revenue               309.36
4   2018                     NaN       Tax Revenue               322.36
5   2019                     NaN       Tax Revenue               329.05
6   2020                     NaN       Tax Revenue               283.11
7   2021                     NaN       Tax Revenue               313.67
8   2022                     NaN       Tax Revenue               337.21
9   2023                     NaN       Tax Revenue               356.04
10  2024                     NaN       Tax Revenue               374.95
11  2014                    32.7  Pension Spending                88.43
12  2015                    32.1  Pens

In [53]:
# Save for chart
# Write the long-format table without the DataFrame index column.
output_file = 'project_chart4_tax_vs_pension.csv'
long_df.to_csv(path_or_buf=output_file, index=False)
print(f"\n✓ Saved to: {output_file}")


✓ Saved to: project_chart4_tax_vs_pension.csv
