In [1]:
import pandas as pd
import numpy as np
import os

# Raw input CSV; loaded with pd.read_csv in the next cell.
file_in = "../data/raw/health/mortality-from-ncds-sdgs.csv"
# Location of the cleaned CSV; the final cell of this notebook saves to this same path.
file_out = "../data/interim/mortality_ncds_clean.csv"

In [2]:
# Load the raw CSV and replace its header with short, consistent column names.
# The assignment is positional, so the file is expected to hold exactly four columns.
column_names = ['country', 'iso_code', 'year', 'ncd_mortality']
df = pd.read_csv(file_in)
df.columns = column_names

# Enforce dtypes: years as integers; mortality parsed as numeric, with any
# unparseable entry turned into NaN rather than raising.
df = df.assign(
    year=df['year'].astype(int),
    ncd_mortality=pd.to_numeric(df['ncd_mortality'], errors='coerce'),
)

# Show the first rows as a sanity check.
df.head()


Unnamed: 0,country,iso_code,year,ncd_mortality
0,Afghanistan,AFG,2000,43.2
1,Afghanistan,AFG,2001,43.5
2,Afghanistan,AFG,2002,43.1
3,Afghanistan,AFG,2003,42.5
4,Afghanistan,AFG,2004,42.3


In [3]:
# Percentage of missing entries in each column.
print("Missing values (%):")
missing_pct = df.isna().mean().mul(100).round(2)
print(missing_pct)

# Descriptive statistics for the mortality column.
print("\nSummary statistics:")
mortality_stats = df['ncd_mortality'].describe()
print(mortality_stats)

# Flag rows whose mortality value falls outside [0, 100].
# NaN values compare False on both sides, so they are not flagged here.
below_range = df['ncd_mortality'].lt(0)
above_range = df['ncd_mortality'].gt(100)
outliers = df[below_range | above_range]
print(f"\nUnusual entries found: {len(outliers)}")
outliers.head()


Missing values (%):
country          0.00
iso_code         6.06
year             0.00
ncd_mortality    0.00
dtype: float64

Summary statistics:
count    4356.000000
mean       21.054538
std         7.291540
min         6.900000
25%        15.600000
50%        20.950000
75%        25.300000
max        45.300000
Name: ncd_mortality, dtype: float64

Unusual entries found: 0


Unnamed: 0,country,iso_code,year,ncd_mortality


In [4]:
# Keep only rows that carry an ISO code. Rows without one are presumably
# regional/income-group aggregates rather than countries - confirm against the source.
has_iso_code = df['iso_code'].notna()
df = df.loc[has_iso_code].reset_index(drop=True)

# Confirm that the filter worked and report the resulting size.
remaining_missing = df['iso_code'].isna().sum()
print("Remaining missing iso_code entries:", remaining_missing)
print("Dataset shape after cleanup:", df.shape)


Remaining missing iso_code entries: 0
Dataset shape after cleanup: (4092, 4)


In [5]:
# check missing values before interpolation
print("Missing values before interpolation:", df['ncd_mortality'].isna().sum())

# Put each country's rows in chronological order before filling gaps.
# Linear interpolation works along row order (it ignores the index and the
# 'year' values), so unsorted rows would be filled from the wrong neighbours.
df = df.sort_values(['iso_code', 'year']).reset_index(drop=True)

# Interpolate linearly within each country timeline.
# limit_direction='both' lets gaps be filled in both directions, so missing
# values at the start or end of a country's series are filled as well.
# NOTE: method='linear' treats consecutive rows as equally spaced; a multi-year
# hole between two rows is not weighted by the number of years it spans.
df['ncd_mortality'] = (
    df.groupby('iso_code', group_keys=False)['ncd_mortality']
      .transform(lambda g: g.interpolate(method='linear', limit_direction='both'))
)

# verify missing values after interpolation
print("Missing values after interpolation:", df['ncd_mortality'].isna().sum())


Missing values before interpolation: 0
Missing values after interpolation: 0


In [6]:
# round mortality values to 2 decimal places
df['ncd_mortality'] = df['ncd_mortality'].round(2)

# sort data neatly: one contiguous, chronological run of rows per country
df = df.sort_values(['iso_code', 'year']).reset_index(drop=True)

# Save to the output path configured in the first cell (file_out) instead of
# repeating the literal path, so changing the config changes the destination.
# The parent directory is derived from that path and created if missing;
# the guard skips makedirs when file_out has no directory component.
out_dir = os.path.dirname(file_out)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df.to_csv(file_out, index=False)

print(f"Cleaned file saved as '{file_out}'")


Cleaned file saved as '../data/interim/mortality_ncds_clean.csv'
