In [27]:
import pandas as pd
import numpy as np

In [28]:
# Load the data
df = pd.read_csv('project_chart1_life_expectancy_male.csv', sep=';')

In [29]:
df.head()

Unnamed: 0.1,Unnamed: 0,1960,1961,Unnamed: 3,1962,Unnamed: 5,1963,Unnamed: 7,1964,Unnamed: 9,...,2019,Unnamed: 117,2020,Unnamed: 119,2021,Unnamed: 121,2022,Unnamed: 123,2023,Unnamed: 125
0,0 Jahre,74.07,74.43,e,74.84,e,75.22,e,75.57,e,...,80.69,e,80.73,e,80.73,e,80.75,e,80.76,e
1,1 Jahr,76.04,76.19,e,76.39,e,76.61,e,76.76,e,...,79.98,e,79.98,e,80.0,e,80.01,e,80.02,e
2,2 Jahre,75.23,75.36,e,75.56,e,75.76,e,75.91,e,...,78.99,e,79.0,e,79.01,e,79.02,e,79.03,e
3,3 Jahre,74.33,74.46,e,74.66,e,74.85,e,74.99,e,...,78.0,e,78.01,e,78.02,e,78.03,e,78.04,e
4,4 Jahre,73.41,73.53,e,73.72,e,73.92,e,74.06,e,...,77.01,e,77.02,e,77.03,e,77.04,e,77.05,e


In [30]:
# Remove unnamed columns EXCEPT "Unnamed: 0" (which contains the age groups)
# Keep "Unnamed: 0" and all columns that don't start with "Unnamed:"
cols_to_keep = [col for col in df.columns if col == 'Unnamed: 0' or not col.startswith('Unnamed:')]
df_cleaned = df[cols_to_keep]

print(f"\nAfter removing unnamed columns (except Unnamed: 0): {df_cleaned.shape}")
print("Remaining columns:")
print(df_cleaned.columns.tolist())
print("\nCleaned data preview:")
print(df_cleaned.head())


After removing unnamed columns (except Unnamed: 0): (101, 64)
Remaining columns:
['Unnamed: 0', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']

Cleaned data preview:
  Unnamed: 0   1960   1961   1962   1963   1964   1965   1966   1967   1968  \
0    0 Jahre  74.07  74.43  74.84  75.22  75.57  75.86  76.04  76.23  76.38   
1     1 Jahr  76.04  76.19  76.39  76.61  76.76  76.93  77.07  77.21  77.32   
2    2 Jahre  75.23  75.36  75.56  75.76  75.91  76.07  76.21  76.34  76.46   
3    3 Jahre  74.33  74.46  74.66  74.85  74.99  75.15  75.2

In [31]:
# Filter for the row where the first column (age group) is "65 Jahre"
# The first column should be "Unnamed: 0"
age_col = df_cleaned.columns[0]
df_65 = df_cleaned[df_cleaned[age_col].astype(str).str.strip() == '65 Jahre'].copy()

print(f"\nFiltered for '65 Jahre': {df_65.shape}")
print("\nData for 65 Jahre:")
print(df_65)


Filtered for '65 Jahre': (1, 64)

Data for 65 Jahre:
   Unnamed: 0   1960   1961   1962   1963   1964  1965   1966   1967  1968  \
65   65 Jahre  18.22  18.26  18.29  18.33  18.36  18.4  18.43  18.47  18.5   

    ...   2014   2015   2016   2017   2018   2019   2020   2021   2022   2023  
65  ...  19.25  19.25  19.25  19.25  19.25  19.25  19.25  19.25  19.25  19.25  

[1 rows x 64 columns]


In [32]:
if len(df_65) == 0:
    print("\nWARNING: No row found with '65 Jahre'. Available age values:")
    print(df_cleaned[age_col].unique()[:20])  # Show first 20 values
else:
    # Drop the age column and transpose to long format
    df_65 = df_65.drop(columns=[age_col])

    # Melt to long format
    df_long = df_65.melt(var_name='year', value_name='remaining_life_expectancy')

    # Convert year to integer
    df_long['year'] = pd.to_numeric(df_long['year'], errors='coerce').astype('Int64')

    # Convert remaining life expectancy to numeric
    df_long['remaining_life_expectancy'] = pd.to_numeric(df_long['remaining_life_expectancy'], errors='coerce')

    # Add 65 to get total life expectancy at age 65
    df_long['life_expectancy'] = df_long['remaining_life_expectancy'] + 65

    # Keep only year and total life expectancy
    df_final = df_long[['year', 'life_expectancy']].copy()

    # Sort by year
    df_final = df_final.sort_values('year').reset_index(drop=True)

    # Remove any rows with missing values
    df_final = df_final.dropna()

    print("\n" + "="*70)
    print("FINAL TRANSFORMED DATA")
    print("="*70)
    print(f"\nShape: {df_final.shape}")
    print("\nFirst 10 rows:")
    print(df_final.head(10))
    print("\nLast 10 rows:")
    print(df_final.tail(10))
    print("\nSummary statistics:")
    print(df_final.describe())

    # Save to CSV with UTF-8 encoding
    df_final.to_csv('life_expectancy_male_65.csv', index=False, encoding='utf-8-sig')
    print("\n✓ Saved to: life_expectancy_male_65.csv")


FINAL TRANSFORMED DATA

Shape: (63, 2)

First 10 rows:
   year  life_expectancy
0  1960            83.22
1  1961            83.26
2  1962            83.29
3  1963            83.33
4  1964            83.36
5  1965            83.40
6  1966            83.43
7  1967            83.47
8  1968            83.50
9  1969            83.53

Last 10 rows:
    year  life_expectancy
53  2014            84.25
54  2015            84.25
55  2016            84.25
56  2017            84.25
57  2018            84.25
58  2019            84.25
59  2020            84.25
60  2021            84.25
61  2022            84.25
62  2023            84.25

Summary statistics:
              year  life_expectancy
count         63.0        63.000000
mean   1991.634921        83.982063
std      18.736975         0.328089
min         1960.0        83.220000
25%         1975.5        83.755000
50%         1992.0        84.150000
75%         2007.5        84.250000
max         2023.0        84.250000

✓ Saved to: life_expec