In [118]:
import pandas as pd
import os

In [119]:
# Read the original CSV files with SEMICOLON delimiter
df_male = pd.read_csv('german_population_male.csv', sep=';', decimal=',')
df_female = pd.read_csv('german_population_female.csv', sep=';', decimal=',')

In [120]:
df_male

Unnamed: 0,year,gender,total_population,0,1,2,3,4,5,6,...,90,91,92,93,94,95,96,97,98,99
0,1950,m,31962.0,542.0,518.0,483.0,460.0,404.0,379.0,512.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1951,m,32198.0,541.0,535.0,520.0,486.0,460.0,408.0,370.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1952,m,32409.0,544.0,536.0,533.0,520.0,485.0,458.0,409.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1953,m,32635.0,540.0,540.0,535.0,532.0,520.0,485.0,458.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1954,m,32846.0,549.0,535.0,539.0,535.0,532.0,519.0,484.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1404,2066,m,37243.9,450.3,450.1,448.9,446.6,443.3,439.4,435.1,...,146.3,121.2,101.9,83.1,71.3,60.6,46.9,36.7,27.1,19.0
1405,2067,m,37210.5,449.2,450.2,450.1,448.8,446.5,443.3,439.4,...,151.7,126.2,102.4,84.1,66.9,55.9,46.2,34.9,26.6,19.1
1406,2068,m,37177.8,447.3,449.2,450.2,450.1,448.8,446.5,443.3,...,155.3,131.0,106.7,84.6,67.8,52.5,42.8,34.5,25.3,18.8
1407,2069,m,37144.4,444.7,447.2,449.2,450.2,450.1,448.8,446.5,...,159.7,134.3,110.9,88.3,68.3,53.3,40.2,31.9,25.1,17.9


In [121]:
# Remove 'gender' column if it exists
if 'gender' in df_male.columns:
    df_male = df_male.drop(columns=['gender'])
if 'gender' in df_female.columns:
    df_female = df_female.drop(columns=['gender'])

In [122]:
# Multiply times 1000 to adjust format
def convert_to_persons(df):
    for col in df.columns:
        if col != 'year':
            df[col] = pd.to_numeric(df[col], errors='coerce')
            # Multipliziere mit 1000 um echte Personenzahl zu bekommen
            df[col] = df[col] * 1000
    return df

df_male = convert_to_persons(df_male)
df_female = convert_to_persons(df_female)

print(f"\nMale data shape: {df_male.shape}")
print(f"Female data shape: {df_female.shape}")


Male data shape: (1409, 102)
Female data shape: (1409, 102)


In [123]:
# Example values after conversion
print("\n")
if '18' in df_male.columns:
    print(df_male[df_male['year'].isin([2021, 2022, 2023])][['year', '18', '19', '20']])
else:
    print("column not found", df_male.columns.tolist()[:10])



    year        18        19        20
71  2021  399000.0  413000.0  432000.0
72  2022  406500.0  401200.0  416400.0
73  2023  414800.0  425200.0  419700.0


In [124]:
# Filter to years 1950-2070
df_male_filtered = df_male[(df_male['year'] >= 1950) & (df_male['year'] <= 2070)].copy()
df_female_filtered = df_female[(df_female['year'] >= 1950) & (df_female['year'] <= 2070)].copy()

print(f"Male rows after filtering: {len(df_male_filtered)}")
print(f"Female rows after filtering: {len(df_female_filtered)}")

Male rows after filtering: 1409
Female rows after filtering: 1409


In [135]:
# Get all age columns (should be 0, 1, 2, ..., 99)
age_columns = [col for col in df_male_filtered.columns if col != 'year']
print(f"\nAge columns found: {len(age_columns)}")
print(f"First 10 age columns: {age_columns[:100]}")


Age columns found: 101
First 10 age columns: ['total_population', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98']


In [136]:
# Transform MALE data to long format
df_male_long = pd.melt(
    df_male_filtered,
    id_vars=['year'],
    value_vars=age_columns,
    var_name='age',
    value_name='population'
)

# Convert age to integer
df_male_long = df_male_long[df_male_long['age'] != 'total_population']
df_male_long = df_male_long[df_male_long['age'] != 'Bev']
df_male_long['age'] = pd.to_numeric(df_male_long['age'], errors='coerce')
df_male_long = df_male_long.dropna(subset=['age'])
df_male_long['age'] = df_male_long['age'].astype(int)
df_male_long = df_male_long.drop_duplicates(subset=['year', 'age'], keep='first')
df_male_long = df_male_long.sort_values(['year', 'age']).reset_index(drop=True)
print(f"Male long format: {len(df_male_long)} rows")
print(df_male_long.head(10))

Male long format: 12100 rows
   year  age  population
0  1950    0    542000.0
1  1950    1    518000.0
2  1950    2    483000.0
3  1950    3    460000.0
4  1950    4    404000.0
5  1950    5    379000.0
6  1950    6    512000.0
7  1950    7    523000.0
8  1950    8    529000.0
9  1950    9    633000.0


In [137]:
df_male_long

Unnamed: 0,year,age,population
0,1950,0,542000.0
1,1950,1,518000.0
2,1950,2,483000.0
3,1950,3,460000.0
4,1950,4,404000.0
...,...,...,...
12095,2070,95,54200.0
12096,2070,96,41100.0
12097,2070,97,30200.0
12098,2070,98,23400.0


In [152]:
# Transform FEMALE data to long format
df_female_long = pd.melt(
    df_female_filtered,
    id_vars=['year'],
    value_vars=age_columns,
    var_name='age',
    value_name='population'
)

# Convert age to integer
df_female_long = df_female_long[df_female_long['age'] != 'total_population']
df_female_long = df_female_long[df_female_long['age'] != 'Bev']
df_female_long['age'] = pd.to_numeric(df_female_long['age'], errors='coerce')
df_female_long = df_female_long.dropna(subset=['age'])
df_female_long['age'] = df_female_long['age'].astype(int)
df_female_long = df_female_long.drop_duplicates(subset=['year', 'age'], keep='first')
df_female_long = df_female_long.sort_values(['year', 'age']).reset_index(drop=True)
print(f"Female long format: {len(df_female_long)} rows")
print(df_female_long.head(-10))

Female long format: 12100 rows
       year  age  population
0      1950    0    512000.0
1      1950    1    492000.0
2      1950    2    459000.0
3      1950    3    439000.0
4      1950    4    387000.0
...     ...  ...         ...
12085  2070   85    363800.0
12086  2070   86    342100.0
12087  2070   87    320900.0
12088  2070   88    301800.0
12089  2070   89    274300.0

[12090 rows x 3 columns]


In [153]:
df_female_long

Unnamed: 0,year,age,population
0,1950,0,512000.0
1,1950,1,492000.0
2,1950,2,459000.0
3,1950,3,439000.0
4,1950,4,387000.0
...,...,...,...
12095,2070,95,96200.0
12096,2070,96,77200.0
12097,2070,97,59700.0
12098,2070,98,48300.0


In [154]:
# Create COMBINED data (male + female)
df_both_long = df_male_long.copy()
df_both_long['population'] = df_male_long['population'] + df_female_long['population']

print(f"Combined long format: {len(df_both_long)} rows")
print(df_both_long.head(10))

Combined long format: 12100 rows
   year  age  population
0  1950    0   1054000.0
1  1950    1   1010000.0
2  1950    2    942000.0
3  1950    3    899000.0
4  1950    4    791000.0
5  1950    5    748000.0
6  1950    6   1004000.0
7  1950    7   1023000.0
8  1950    8   1036000.0
9  1950    9   1239000.0


In [155]:
# Verify the addition
print("\nVerification (year 2021, age 18):")
try:
    male_val = df_male_long[(df_male_long['year'] == 2021) & (df_male_long['age'] == 18)]['population'].values[0]
    female_val = df_female_long[(df_female_long['year'] == 2021) & (df_female_long['age'] == 18)]['population'].values[0]
    both_val = df_both_long[(df_both_long['year'] == 2021) & (df_both_long['age'] == 18)]['population'].values[0]
    print(f"Male: {male_val:,.0f}")
    print(f"Female: {female_val:,.0f}")
    print(f"Both: {both_val:,.0f}")
    print(f"Check: {male_val + female_val:,.0f} (should equal {both_val:,.0f})")
except:
    print("Year 2021 / Age 18 nicht gefunden")


Verification (year 2021, age 18):
Male: 399,000
Female: 374,000
Both: 773,000
Check: 773,000 (should equal 773,000)


In [130]:
# Additional verification for 2022 and beyond
print("\nVerification (year 2022, age 18):")
try:
    male_val_2022 = df_male_long[(df_male_long['year'] == 2022) & (df_male_long['age'] == 18)]['population'].values[0]
    female_val_2022 = df_female_long[(df_female_long['year'] == 2022) & (df_female_long['age'] == 18)]['population'].values[0]
    both_val_2022 = df_both_long[(df_both_long['year'] == 2022) & (df_both_long['age'] == 18)]['population'].values[0]
    print(f"Male: {male_val_2022:,.0f}")
    print(f"Female: {female_val_2022:,.0f}")
    print(f"Both: {both_val_2022:,.0f}")
except:
    print("Year 2022 / Age 18 nicht gefunden")


Verification (year 2022, age 18):
Male: 406,500
Female: 379,400
Both: 785,900


In [156]:
# Check for reasonable values
print(f"Max male population value: {df_male_long['population'].max():,.0f}")
print(f"Max female population value: {df_female_long['population'].max():,.0f}")
print(f"Max both population value: {df_both_long['population'].max():,.0f}")
print(f"Min male population value: {df_male_long['population'].min():,.0f}")
print(f"Min female population value: {df_female_long['population'].min():,.0f}")
print(f"Min both population value: {df_both_long['population'].min():,.0f}")

Max male population value: 771,000
Max female population value: 716,000
Max both population value: 1,485,000
Min male population value: 0
Min female population value: 0
Min both population value: 0


In [132]:
# Save as JSON (compact, no indentation)
df_male_long.to_json('cc10_chart1_german_population_male_long_final.json', orient='records')
df_female_long.to_json('cc10_chart1_german_population_female_long_final.json', orient='records')
df_both_long.to_json('cc10_chart1_german_population_both_long_final.json', orient='records')

In [133]:
# Save as CSV with COMMA delimiter (standard for web use)
df_male_long.to_csv('cc10_chart1_german_population_male_long_final.csv', index=False)
df_female_long.to_csv('cc10_chart1_german_population_female_long_final.csv', index=False)
df_both_long.to_csv('cc10_chart1_german_population_both_long_final.csv', index=False)