In [1]:
import pandas as pd

# load dataset
df = pd.read_csv("../data/raw/health/life-expectancy.csv")

# preview the data
df.head()


Unnamed: 0,Entity,Code,Year,Period life expectancy at birth
0,Afghanistan,AFG,1950,28.1563
1,Afghanistan,AFG,1951,28.5836
2,Afghanistan,AFG,1952,29.0138
3,Afghanistan,AFG,1953,29.4521
4,Afghanistan,AFG,1954,29.6975


In [3]:
# rename columns
df.columns = ['country', 'iso_code', 'year', 'life_expectancy']

# ensure correct data types
df['year'] = df['year'].astype(int)
df['life_expectancy'] = pd.to_numeric(df['life_expectancy'], errors='coerce')

# preview
df.head()


Unnamed: 0,country,iso_code,year,life_expectancy
0,Afghanistan,AFG,1950,28.1563
1,Afghanistan,AFG,1951,28.5836
2,Afghanistan,AFG,1952,29.0138
3,Afghanistan,AFG,1953,29.4521
4,Afghanistan,AFG,1954,29.6975


In [4]:
# missing value check
print(" Missing values (%):")
print((df.isna().mean() * 100).round(2))

# descriptive statistics
print("\nSummary statistics:")
print(df['life_expectancy'].describe())

# find outliers or invalid entries
outliers = df[(df['life_expectancy'] < 20) | (df['life_expectancy'] > 100)]
print(f"\nUnusual values found: {outliers.shape[0]}")
outliers.head()


 Missing values (%):
country            0.00
iso_code           9.07
year               0.00
life_expectancy    0.00
dtype: float64

Summary statistics:
count    21565.000000
mean        61.942238
std         12.925909
min         10.989100
25%         52.703000
50%         64.479900
75%         71.978900
max         86.372400
Name: life_expectancy, dtype: float64

Unusual values found: 18


Unnamed: 0,country,iso_code,year,life_expectancy
2969,Cambodia,KHM,1975,12.7845
2970,Cambodia,KHM,1976,11.6323
2971,Cambodia,KHM,1977,11.2952
2972,Cambodia,KHM,1978,11.5726
3412,Central African Republic,CAF,2009,14.6655


In [7]:
# fill missing iso codes using the most frequent code for each country
df['iso_code'] = df.groupby('country')['iso_code'].transform(lambda x: x.ffill().bfill())

# recheck missing iso codes
missing_iso = df['iso_code'].isna().sum()
print(f" Remaining missing ISO codes: {missing_iso}")

# round life expectancy to 2 decimal places
df['life_expectancy'] = df['life_expectancy'].round(2)

# sort by country and year for clean order
df = df.sort_values(['iso_code', 'year']).reset_index(drop=True)

# quick check
df.head()


 Remaining missing ISO codes: 1956


  df['iso_code'] = df.groupby('country')['iso_code'].transform(lambda x: x.ffill().bfill())


Unnamed: 0,country,iso_code,year,life_expectancy
0,Aruba,ABW,1950,57.99
1,Aruba,ABW,1951,58.73
2,Aruba,ABW,1952,59.45
3,Aruba,ABW,1953,60.12
4,Aruba,ABW,1954,60.82


In [None]:
# check initial missingness
print(" Missing values before interpolation:", df['life_expectancy'].isna().sum())

# interpolate within each country
df['life_expectancy'] = (
    df.groupby('iso_code', group_keys=False)['life_expectancy']
      .transform(lambda g: g.interpolate(method='linear', limit_direction='both'))
)

# verify again
print(" Missing values after interpolation:", df['life_expectancy'].isna().sum())


 Missing values before interpolation: 0
 Missing values after interpolation: 1956


In [17]:
# find rows where iso_code is missing or invalid
missing_iso_rows = df[df['iso_code'].isna()]

print("Rows with missing iso_code after interpolation:")
print(missing_iso_rows['country'].unique())

print(f"\nTotal rows affected: {missing_iso_rows.shape[0]}")


Rows with missing iso_code after interpolation:
[]

Total rows affected: 0


In [12]:
# remove rows without ISO codes (regional aggregates)
df = df.dropna(subset=['iso_code']).reset_index(drop=True)

# confirm
print("Remaining missing iso_code entries:", df['iso_code'].isna().sum())
print("New dataset shape:", df.shape)


Remaining missing iso_code entries: 0
New dataset shape: (19609, 4)


In [16]:
# find rows where iso_code is missing or invalid
missing_iso_rows = df[df['iso_code'].isna()]

print("Rows with missing iso_code after interpolation:")
print(missing_iso_rows['country'].unique())

print(f"\nTotal rows affected: {missing_iso_rows.shape[0]}")

Rows with missing iso_code after interpolation:
[]

Total rows affected: 0


In [15]:
# quick validation summary
print(df['life_expectancy'].describe())

# look for any invalid or extreme values
invalid = df[(df['life_expectancy'] < 20) | (df['life_expectancy'] > 100)]
print(f"Unusual life expectancy entries: {invalid.shape[0]}")

# save cleaned dataset
import os
os.makedirs("../data/interim", exist_ok=True)
df.to_csv("../data/interim/life_expectancy_clean.csv", index=False)

print("Cleaned file saved as '../data/interim/life_expectancy_clean.csv'")


count    19609.000000
mean        62.090270
std         13.052871
min         10.990000
25%         52.950000
50%         64.810000
75%         72.160000
max         86.370000
Name: life_expectancy, dtype: float64
Unusual life expectancy entries: 18
Cleaned file saved as '../data/interim/life_expectancy_clean.csv'
