In [1]:
import pandas as pd
import numpy as np
import os

# Raw infant-mortality CSV that the loading cell reads (relative path).
file_in = "../data/raw/health/infant-mortality.csv"
# Location of the cleaned CSV; this is the same path the export cell writes to.
file_out = "../data/interim/infant_mortality_clean.csv"

In [2]:
# load dataset
df = pd.read_csv(file_in)

# check structure and preview
print("Initial shape:", df.shape)
# Only the last expression of a cell is rendered, so a bare `df.head()` here
# would be evaluated and silently discarded. `display` (provided by the
# IPython/Jupyter kernel) renders the preview explicitly.
display(df.head())

# inspect columns and data types (info() prints its report itself)
df.info()

# check for missing values -- last expression, rendered as the cell output
df.isna().sum()


Initial shape: (13577, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13577 entries, 0 to 13576
Data columns (total 4 columns):
 #   Column                                                                    Non-Null Count  Dtype  
---  ------                                                                    --------------  -----  
 0   Entity                                                                    13577 non-null  object 
 1   Code                                                                      13001 non-null  object 
 2   Year                                                                      13577 non-null  int64  
 3   Infant mortality rate of babies aged under one year, per 100 live births  13577 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 424.4+ KB


Entity                                                                        0
Code                                                                        576
Year                                                                          0
Infant mortality rate of babies aged under one year, per 100 live births      0
dtype: int64

In [3]:
# standardize column names
# The rate column is addressed by position (last column) because its raw
# header is a long descriptive label.
df = df.rename(columns={
    'Entity': 'country',
    'Code': 'iso_code',
    'Year': 'year',
    df.columns[-1]: 'infant_mortality_rate'  # last column should be the mortality rate
})

# drop regional aggregates (rows without ISO code)
# NOTE(review): an aggregate row that still carries a non-null code would pass
# this filter -- confirm against the raw file.
has_iso_code = df['iso_code'].notna()

# limit the time range to match other datasets (both bounds inclusive)
in_time_range = df['year'].between(1990, 2022)

# Take an explicit copy of the filtered rows: later cells assign to columns of
# `df`, and doing that on a boolean-filtered slice of the loaded frame is what
# triggers pandas' SettingWithCopyWarning. Original index labels are kept.
df = df.loc[has_iso_code & in_time_range].copy()

print("After filtering:", df.shape)
df.head()


After filtering: (6633, 4)


Unnamed: 0,country,iso_code,year,infant_mortality_rate
33,Afghanistan,AFG,1990,14.451245
34,Afghanistan,AFG,1991,14.010447
35,Afghanistan,AFG,1992,13.589371
36,Afghanistan,AFG,1993,13.199313
37,Afghanistan,AFG,1994,12.831206


In [4]:
# Coerce the rate column to numbers; entries that cannot be parsed (for
# example strings containing commas) become NaN instead of raising.
rate = pd.to_numeric(df['infant_mortality_rate'], errors='coerce')

# Keep only strictly positive rates; zero and negative values are treated as
# invalid and blanked out (existing NaNs stay NaN).
df['infant_mortality_rate'] = rate.where(rate > 0)

# check range and missingness
print(df['infant_mortality_rate'].describe())
print("\nMissing values:", df['infant_mortality_rate'].isna().sum())


count    6633.000000
mean        3.219120
std         3.102527
min         0.141364
25%         0.865503
50%         2.007520
75%         4.705395
max        27.403477
Name: infant_mortality_rate, dtype: float64

Missing values: 0


In [5]:
# Round the rate to two decimals. At this point the values are still
# expressed per 100 live births; the unit conversion happens in a later cell.
rate_col = 'infant_mortality_rate'
rounded_rate = df[rate_col].round(decimals=2)
df[rate_col] = rounded_rate

# Preview the identifying columns next to the rounded rate.
df.loc[:, ['country', 'year', rate_col]].head()


Unnamed: 0,country,year,infant_mortality_rate
33,Afghanistan,1990,14.45
34,Afghanistan,1991,14.01
35,Afghanistan,1992,13.59
36,Afghanistan,1993,13.2
37,Afghanistan,1994,12.83


In [6]:
def _fill_gaps(series):
    """Linearly interpolate NaNs in one country's rate series.

    `limit_direction='both'` allows gaps to be filled in both directions.
    """
    return series.interpolate(method='linear', limit_direction='both')


# Order rows by country and then year, and renumber the index from 0, so each
# country's series is contiguous and chronological before interpolating.
df = df.sort_values(by=['iso_code', 'year'], ignore_index=True)

# Apply the gap filler separately to each country's series.
rate_by_country = df.groupby('iso_code', group_keys=False)['infant_mortality_rate']
df['infant_mortality_rate'] = rate_by_country.transform(_fill_gaps)

# check remaining missing values
print("Remaining missing values:", df['infant_mortality_rate'].isna().sum())


Remaining missing values: 0


In [8]:
# convert from 'per 100 live births' to 'per 1,000 live births'
#
# This cell overwrites the column in place, so running it a second time would
# multiply by 10 again and silently produce per-10,000 values. A flag stored in
# `df.attrs` records that the conversion has been applied; a freshly loaded
# frame has no flag and is converted as usual.
# NOTE(review): `DataFrame.attrs` is documented as experimental -- pandas
# carries it through most operations, but confirm on the pandas version in use.
if not df.attrs.get('rate_per_1000', False):
    # The rate was rounded to 2 decimals (per 100) earlier in the notebook,
    # i.e. 1 decimal per 1,000. Re-rounding after scaling removes binary
    # floating-point noise such as 2.01 * 10 == 20.099999999999998, which
    # would otherwise be written verbatim into the exported CSV.
    df['infant_mortality_rate'] = (df['infant_mortality_rate'] * 10).round(1)
    df.attrs['rate_per_1000'] = True

# quick sanity check
print(df['infant_mortality_rate'].describe())


count    6633.000000
mean       32.191075
std        31.025687
min         1.400000
25%         8.700000
50%        20.100000
75%        47.100000
max       274.000000
Name: infant_mortality_rate, dtype: float64


In [9]:
# Summary statistics of the rate column.
rate = df['infant_mortality_rate']
print(rate.describe())

# Flag rows whose rate is strictly below 1 or strictly above 300.
too_low = rate.lt(1)
too_high = rate.gt(300)
unusual = df.loc[too_low | too_high]
print(f"Unusual entries found: {len(unusual)}")
unusual.head()


count    6633.000000
mean       32.191075
std        31.025687
min         1.400000
25%         8.700000
50%        20.100000
75%        47.100000
max       274.000000
Name: infant_mortality_rate, dtype: float64
Unusual entries found: 0


Unnamed: 0,country,iso_code,year,infant_mortality_rate


In [14]:
# export cleaned dataset
# Write to `file_out` from the configuration cell instead of repeating the
# path literal, so the destination is defined in exactly one place.
out_dir = os.path.dirname(file_out)
if out_dir:
    # create the target folder on a fresh checkout; no-op if it already exists
    os.makedirs(out_dir, exist_ok=True)
df.to_csv(file_out, index=False)

print(f"Cleaned file saved as '{file_out}'")


Cleaned file saved as '../data/interim/infant_mortality_clean.csv'
