In [2]:
import io
import os
import zipfile

import numpy as np
import pandas as pd
import requests

In [7]:
BASE_URL = 'https://www-genesis.destatis.de/genesisWS/rest/2020/'

# SECURITY: credentials are read from the environment instead of being stored
# in the notebook. The API token that used to be hardcoded here has been
# exposed to everyone with access to this file and should be revoked/rotated.
USERNAME = os.environ.get('GENESIS_USERNAME', '')
PASSWORD = os.environ.get('GENESIS_PASSWORD', '')
if not USERNAME:
    raise RuntimeError(
        "Set the GENESIS_USERNAME environment variable (user name or API token) "
        "before running this cell. GENESIS_PASSWORD is optional and defaults to "
        "an empty string, as in the previous token-based setup."
    )

headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'username': USERNAME,
    'password': PASSWORD
}

# (connect, read) timeouts in seconds so a stalled connection cannot hang the kernel.
REQUEST_TIMEOUT = (10, 300)

# Column names of the flat-file CSV that the rest of this cell relies on.
DISTRICT_CODE_COL = '1_variable_attribute_code'
DISTRICT_NAME_COL = '1_variable_attribute_label'
GENDER_COL = '2_variable_attribute_label'
AGE_GROUP_COL = '3_variable_attribute_label'
REFERENCE_DATE = '2024-12-31'


def _flatten_district_pivot(wide):
    """Return a district-indexed wide table as a flat frame.

    The two district index levels become regular columns named
    ``district_code`` and ``district_name`` and the leftover name of the
    column axis is cleared so it does not show up in printed/saved output.
    """
    flat = wide.reset_index()
    flat.columns.name = None
    return flat.rename(columns={
        DISTRICT_CODE_COL: 'district_code',
        DISTRICT_NAME_COL: 'district_name'
    })


# Download the data
responseTable = requests.post(
    BASE_URL + 'data/tablefile',
    headers=headers,
    data={
        'name': '12411-0018',
        'area': 'all',
        'compress': 'false',
        'transpose': 'false',
        'startyear': '2024',
        'endyear': '2024',
        'regionalvariable': 'KREISE',
        'regionalkey': '',
        'classifyingvariable1': 'GES',
        'classifyingkey1': '',
        'classifyingvariable2': 'ALTX20',
        'classifyingkey2': '',
        'format': 'ffcsv',
        'job': 'false',
        'stand': '',
        'language': 'de'
    },
    timeout=REQUEST_TIMEOUT
)

if responseTable.status_code == 200:
    print("File downloaded successfully")

    # The payload is expected to be a zip archive. Anything else (for example
    # an error document delivered with HTTP 200) is reported with the start of
    # the body instead of an opaque "File is not a zip file".
    archive_bytes = io.BytesIO(responseTable.content)
    if not zipfile.is_zipfile(archive_bytes):
        raise zipfile.BadZipFile(
            "Response has status 200 but is not a zip archive. Body starts with: "
            + responseTable.text[:500]
        )

    # Extract and read the data; the archive is closed again before processing.
    with zipfile.ZipFile(archive_bytes) as z:
        csv_filename = z.namelist()[0]

        with z.open(csv_filename) as csv_file:
            # Read the raw data. District keys are kept as strings so that
            # leading zeros (e.g. "01001") survive the round trip.
            df = pd.read_csv(csv_file,
                             sep=';',
                             encoding='utf-8',
                             low_memory=False,
                             dtype={DISTRICT_CODE_COL: str})

    print(f"Original data shape: {df.shape}")
    print(f"Number of rows: {len(df)}")

    # Filter for 2024 data only (should already be filtered, but just to be sure)
    df_2024 = df[df['time'] == REFERENCE_DATE].copy()
    if df_2024.empty:
        raise ValueError(
            f"No rows with time == {REFERENCE_DATE!r} in the downloaded table; "
            "check the requested years and the format of the 'time' column."
        )

    # Target layout of the pivot table
    # Rows: Districts (Kreise)
    # Columns: Gender + Age groups
    # Values: Population count

    # First, let's see what we have
    print("\nUnique districts:", df_2024[DISTRICT_CODE_COL].nunique())
    print("Unique genders:", df_2024[GENDER_COL].unique())
    print("Unique age groups:", df_2024[AGE_GROUP_COL].nunique())

    # Create combined column name for Gender + Age group
    df_2024['gender_age'] = df_2024[GENDER_COL] + '_' + df_2024[AGE_GROUP_COL]

    # Pivot the data and flatten the district index into regular columns
    pivot_df = _flatten_district_pivot(
        df_2024.pivot_table(
            index=[DISTRICT_CODE_COL, DISTRICT_NAME_COL],
            columns='gender_age',
            values='value',
            aggfunc='first'  # Take first value if there are duplicates
        )
    )

    print(f"\nReshaped data shape: {pivot_df.shape}")
    print(f"Districts (rows): {len(pivot_df)}")
    print(f"Age-Gender combinations (columns): {len(pivot_df.columns) - 2}")

    # Show sample
    print("\nFirst few rows and columns:")
    print(pivot_df.iloc[:5, :8])

    # Save the reshaped data
    pivot_df.to_csv("population_2024_by_district_age_gender.csv",
                    index=False, encoding='utf-8-sig')
    print("\n✓ Saved to: population_2024_by_district_age_gender.csv")

    # Also create a version with totals (all genders combined)
    df_total = df_2024[df_2024[GENDER_COL] == 'Insgesamt'].copy()

    if len(df_total) > 0:
        # The table ships ready-made totals: use them as they are.
        wide_total = df_total.pivot_table(
            index=[DISTRICT_CODE_COL, DISTRICT_NAME_COL],
            columns=AGE_GROUP_COL,
            values='value',
            aggfunc='first'
        )
    else:
        # No 'Insgesamt' rows (the gender breakdown only lists the individual
        # genders), so build the totals by adding the genders up. Entries that
        # are not numbers are treated as missing, and a district/age cell
        # without any numeric value stays NaN instead of becoming a fake 0.
        print("\nNo 'Insgesamt' rows found - totals are computed by summing all genders")
        df_total = df_2024.assign(
            value=pd.to_numeric(df_2024['value'], errors='coerce')
        )
        wide_total = (
            df_total
            .groupby([DISTRICT_CODE_COL, DISTRICT_NAME_COL, AGE_GROUP_COL])['value']
            .sum(min_count=1)
            .unstack(AGE_GROUP_COL)
        )

    pivot_total = _flatten_district_pivot(wide_total)

    print(f"\n\nTotal population (all genders) shape: {pivot_total.shape}")
    print("\nFirst few rows:")
    print(pivot_total.head())

    pivot_total.to_csv("population_2024_by_district_age_total.csv",
                       index=False, encoding='utf-8-sig')
    print("\n✓ Saved to: population_2024_by_district_age_total.csv")

    # Print column names for reference
    print("\n\nAll column names in the gender-specific file:")
    for i, col in enumerate(pivot_df.columns, 1):
        print(f"{i}. {col}")

else:
    print(f"Error: Status code {responseTable.status_code}")

File downloaded successfully
Original data shape: (16184, 21)
Number of rows: 16184

Unique districts: 476
Unique genders: ['männlich' 'weiblich']
Unique age groups: 17

Reshaped data shape: (476, 36)
Districts (rows): 476
Age-Gender combinations (columns): 34

First few rows and columns:
   district_code                 district_name männlich_10 bis unter 15 Jahre  \
0           1001   Flensburg, kreisfreie Stadt                           2043   
1           1002        Kiel, kreisfreie Stadt                           5107   
2           1003      Lübeck, kreisfreie Stadt                           4794   
3           1004  Neumünster, kreisfreie Stadt                           1818   
4           1051       Dithmarschen, Landkreis                           3056   

  männlich_15 bis unter 18 Jahre männlich_18 bis unter 20 Jahre  \
0                           1359                            991   
1                           3181                           2533   
2                     

In [8]:
# Read back the per-district gender/age table written by the download cell.
df = pd.read_csv("population_2024_by_district_age_gender.csv")

In [9]:
# Preview the first five rows of the reloaded table.
df.head(5)

Unnamed: 0,district_code,district_name,männlich_10 bis unter 15 Jahre,männlich_15 bis unter 18 Jahre,männlich_18 bis unter 20 Jahre,männlich_20 bis unter 25 Jahre,männlich_25 bis unter 30 Jahre,männlich_3 bis unter 6 Jahre,männlich_30 bis unter 35 Jahre,männlich_35 bis unter 40 Jahre,...,weiblich_35 bis unter 40 Jahre,weiblich_40 bis unter 45 Jahre,weiblich_45 bis unter 50 Jahre,weiblich_50 bis unter 55 Jahre,weiblich_55 bis unter 60 Jahre,weiblich_6 bis unter 10 Jahre,weiblich_60 bis unter 65 Jahre,weiblich_65 bis unter 75 Jahre,weiblich_75 Jahre und mehr,weiblich_unter 3 Jahre
0,1001,"Flensburg, kreisfreie Stadt",2043,1359,991,3564,4294,1310,3980,3475,...,3066,2841,2603,2802,3496,1624,3170,5089,5800,1144
1,1002,"Kiel, kreisfreie Stadt",5107,3181,2533,9742,12416,3212,11067,9410,...,8187,7647,6804,7258,8932,4173,8514,12978,14886,3022
2,1003,"Lübeck, kreisfreie Stadt",4794,2983,2199,6141,7646,2701,7443,7115,...,7058,6796,6501,6860,8762,3540,8360,13233,16449,2316
3,1004,"Neumünster, kreisfreie Stadt",1818,1251,962,2430,2761,1042,2740,2711,...,2466,2341,2256,2523,3233,1326,3082,4794,5686,933
4,1051,"Dithmarschen, Landkreis",3056,1955,1309,3405,3749,1725,3849,3943,...,3632,3652,3685,4334,5968,2330,5835,9321,9848,1500


In [10]:
# List every column label of the reloaded table.
list(df.columns)

['district_code',
 'district_name',
 'männlich_10 bis unter 15 Jahre',
 'männlich_15 bis unter 18 Jahre',
 'männlich_18 bis unter 20 Jahre',
 'männlich_20 bis unter 25 Jahre',
 'männlich_25 bis unter 30 Jahre',
 'männlich_3 bis unter 6 Jahre',
 'männlich_30 bis unter 35 Jahre',
 'männlich_35 bis unter 40 Jahre',
 'männlich_40 bis unter 45 Jahre',
 'männlich_45 bis unter 50 Jahre',
 'männlich_50 bis unter 55 Jahre',
 'männlich_55 bis unter 60 Jahre',
 'männlich_6 bis unter 10 Jahre',
 'männlich_60 bis unter 65 Jahre',
 'männlich_65 bis unter 75 Jahre',
 'männlich_75 Jahre und mehr',
 'männlich_unter 3 Jahre',
 'weiblich_10 bis unter 15 Jahre',
 'weiblich_15 bis unter 18 Jahre',
 'weiblich_18 bis unter 20 Jahre',
 'weiblich_20 bis unter 25 Jahre',
 'weiblich_25 bis unter 30 Jahre',
 'weiblich_3 bis unter 6 Jahre',
 'weiblich_30 bis unter 35 Jahre',
 'weiblich_35 bis unter 40 Jahre',
 'weiblich_40 bis unter 45 Jahre',
 'weiblich_45 bis unter 50 Jahre',
 'weiblich_50 bis unter 55 Jahre',
 

In [11]:
# Report the table dimensions; each row is one district.
print(
    f"Original data shape: {df.shape}",
    f"Number of districts: {len(df)}",
    sep="\n",
)

Original data shape: (476, 36)
Number of districts: 476


In [12]:
# Reload the district x (gender, age group) table produced earlier in the notebook
df = pd.read_csv("population_2024_by_district_age_gender.csv")

print(f"Original data shape: {df.shape}")
print(f"Number of districts: {len(df)}")

# Every column except the two district identifiers holds a count; coerce them
# to numbers so that anything unparseable turns into NaN.
id_cols = ('district_code', 'district_name')
numeric_cols = [name for name in df.columns if name not in id_cols]
df = df.assign(**{
    name: pd.to_numeric(df[name], errors='coerce') for name in numeric_cols
})

# Age bracket definitions (OPTION C): bracket label -> source age-group labels
age_bracket_mapping = {
    '0-20': ['unter 3 Jahre', '3 bis unter 6 Jahre', '6 bis unter 10 Jahre',
             '10 bis unter 15 Jahre', '15 bis unter 18 Jahre',
             '18 bis unter 20 Jahre'],
    '20-55': ['20 bis unter 25 Jahre', '25 bis unter 30 Jahre',
              '30 bis unter 35 Jahre', '35 bis unter 40 Jahre',
              '40 bis unter 45 Jahre', '45 bis unter 50 Jahre',
              '50 bis unter 55 Jahre'],
    '55-75': ['55 bis unter 60 Jahre', '60 bis unter 65 Jahre',
              '65 bis unter 75 Jahre'],
    '75+': ['75 Jahre und mehr'],
}

# Start the result from the district identifiers only
result_df = df.loc[:, ['district_code', 'district_name']].copy()

# Per bracket: add up the male and female columns of every member age group.
# Columns that are absent from the file are skipped and NaN counts as 0.
gender_prefixes = ('männlich', 'weiblich')
for bracket_name, age_groups in age_bracket_mapping.items():
    present_cols = [
        f'{gender}_{age_group}'
        for age_group in age_groups
        for gender in gender_prefixes
        if f'{gender}_{age_group}' in df.columns
    ]
    zero_start = pd.Series(0.0, index=df.index, dtype=float)
    result_df[bracket_name] = sum(
        (df[source_col].fillna(0) for source_col in present_cols),
        zero_start,
    )

# District total = sum of the four brackets
result_df['total_population'] = result_df[list(age_bracket_mapping)].sum(axis=1)

# Share of each bracket in the district total, in percent with two decimals
for bracket_name in age_bracket_mapping:
    share = result_df[bracket_name].div(result_df['total_population']).mul(100)
    result_df[f'{bracket_name}_percentage'] = share.round(2)

# Switch to English column names (absolute counts and their percentage twins)
english_names = {
    '0-20': 'age_0_20',
    '20-55': 'age_20_55',
    '55-75': 'age_55_75',
    '75+': 'age_75_plus',
}
column_renames = dict(english_names)
column_renames.update({
    f'{old}_percentage': f'{new}_pct' for old, new in english_names.items()
})
result_df = result_df.rename(columns=column_renames)

# District codes as 5-character strings, left-padded with zeros
result_df = result_df.assign(
    district_code=result_df['district_code'].astype(str).str.zfill(5)
)

# Display results
banner = "="*70
pct_cols = ['age_0_20_pct', 'age_20_55_pct', 'age_55_75_pct', 'age_75_plus_pct']

print("\n" + banner)
print("TRANSFORMED DATA")
print(banner)
print(f"\nTransformed data shape: {result_df.shape}")
print("\nColumn names:")
for position, column in enumerate(result_df.columns, start=1):
    print(f"{position}. {column}")

print("\nFirst 10 rows:")
print(result_df.head(10))

print("\nSummary statistics for percentage shares:")
print(result_df[pct_cols].describe())

# Count missing values per column
print("\nMissing values check:")
print(result_df.isnull().sum())

# Confirm the zero-padding produced uniform 5-character codes
print(f"\nAll district codes are 5 digits: {(result_df['district_code'].str.len() == 5).all()}")

# Persist the full result (counts, total and percentages)
result_df.to_csv("population_2024_age_brackets_for_map.csv",
                 index=False, encoding='utf-8-sig')
print("\n✓ Saved to: population_2024_age_brackets_for_map.csv")

# Slim version for the map: identifiers, total and percentage shares only
map_df = result_df[['district_code', 'district_name', 'total_population',
                    *pct_cols]].copy()

map_df.to_csv("population_2024_percentages_for_map_corrected.csv",
              index=False, encoding='utf-8-sig')
print("✓ Saved simplified version to: population_2024_percentages_for_map_corrected.csv")

# Headline figures per bracket: districts with the extreme shares and the mean
print("\n" + banner)
print("KEY INSIGHTS FOR YOUR STORYLINE")
print(banner)

age_brackets = {
    'age_0_20_pct': '0-20 years (Youth)',
    'age_20_55_pct': '20-55 years (Working age)',
    'age_55_75_pct': '55-75 years (Retiring generation)',
    'age_75_plus_pct': '75+ years (Elderly)'
}

for bracket_col, bracket_label in age_brackets.items():
    shares = result_df[bracket_col]
    print(f"\n{bracket_label}:")
    top_district = result_df.loc[shares.idxmax(), 'district_name']
    print(f"  Highest: {top_district}: {shares.max():.2f}%")
    bottom_district = result_df.loc[shares.idxmin(), 'district_name']
    print(f"  Lowest:  {bottom_district}: {shares.min():.2f}%")
    print(f"  Average: {shares.mean():.2f}%")

Original data shape: (476, 36)
Number of districts: 476

TRANSFORMED DATA

Transformed data shape: (476, 11)

Column names:
1. district_code
2. district_name
3. age_0_20
4. age_20_55
5. age_55_75
6. age_75_plus
7. total_population
8. age_0_20_pct
9. age_20_55_pct
10. age_55_75_pct
11. age_75_plus_pct

First 10 rows:
  district_code                   district_name  age_0_20  age_20_55  \
0         01001     Flensburg, kreisfreie Stadt   17040.0    46504.0   
1         01002          Kiel, kreisfreie Stadt   42329.0   126950.0   
2         01003        Lübeck, kreisfreie Stadt   36624.0    95546.0   
3         01004    Neumünster, kreisfreie Stadt   14490.0    34325.0   
4         01051         Dithmarschen, Landkreis   23515.0    51642.0   
5         01053  Herzogtum Lauenburg, Landkreis   40525.0    83257.0   
6         01054        Nordfriesland, Landkreis   29920.0    66802.0   
7         01055          Ostholstein, Landkreis   32010.0    74139.0   
8         01056            Pinnebe