In [39]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

In [40]:
# Get the current working directory (where your notebook is running)
NOTEBOOK_DIR = Path(os.getcwd())

# Navigate to your dataset subfolder
DATA_DIR = NOTEBOOK_DIR.parent /"raw" / "Disease_and_Mobidity_data"

FILE_NAME = "child_mortality_indicators_nga.csv"
FILE_PATH = DATA_DIR / FILE_NAME

#print("Looking for file at:", FILE_PATH)

print(DATA_DIR.exists())   # Should be True

# Load dataset
df = pd.read_csv(FILE_PATH)

# === 1. Check basic info ===
print("✅ Dataset loaded successfully!")
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")
print("\n📋 Column names:")
print(df.columns.tolist())

True
✅ Dataset loaded successfully!
Number of rows: 4702
Number of columns: 17

📋 Column names:
['GHO (CODE)', 'GHO (DISPLAY)', 'GHO (URL)', 'YEAR (DISPLAY)', 'STARTYEAR', 'ENDYEAR', 'REGION (CODE)', 'REGION (DISPLAY)', 'COUNTRY (CODE)', 'COUNTRY (DISPLAY)', 'DIMENSION (TYPE)', 'DIMENSION (CODE)', 'DIMENSION (NAME)', 'Numeric', 'Value', 'Low', 'High']


In [41]:
display(df.head())

Unnamed: 0,GHO (CODE),GHO (DISPLAY),GHO (URL),YEAR (DISPLAY),STARTYEAR,ENDYEAR,REGION (CODE),REGION (DISPLAY),COUNTRY (CODE),COUNTRY (DISPLAY),DIMENSION (TYPE),DIMENSION (CODE),DIMENSION (NAME),Numeric,Value,Low,High
0,#indicator+code,#indicator+name,#indicator+url,#date+year,#date+year+start,#date+year+end,#region+code,#region+name,#country+code,#country+name,#dimension+type,#dimension+code,#dimension+name,#indicator+value+num,#indicator+value,#indicator+value+low,#indicator+value+high
1,CM_01,Number of under-five deaths,https://www.who.int/data/gho/data/indicators/i...,2002,2002,2002,AFR,Africa,NGA,Nigeria,SEX,SEX_BTSX,Both sexes,204862.0,204 862 [190 042-221 563],190042.0,221563.0
2,MORT_300,Distribution of causes of death among children...,https://www.who.int/data/gho/data/indicators/i...,2009,2009,2009,AFR,Africa,NGA,Nigeria,AGEGROUP,AGEGROUP_MONTHS1-59,1-59 months,0.00013,0,,
3,DEATHADO,Number of deaths among adolescents (10 to 19 y...,https://www.who.int/data/gho/data/indicators/i...,2019,2019,2019,AFR,Africa,NGA,Nigeria,SEX,SEX_MLE,Male,20483.0,20 483 [14 395-28 753],14395.0,28753.0
4,MORT_100,"Number of deaths in children aged <5 years, by...",https://www.who.int/data/gho/data/indicators/i...,2006,2006,2006,AFR,Africa,NGA,Nigeria,SEX,SEX_BTSX,Both sexes,42520.70405,42520.7,,


In [42]:
# --- Drop the first row ---
df = df.drop(index=0).reset_index(drop=True)

In [43]:
# Rename columns (matching lowercase names)
# 🧹 Rename columns to clear, Python-friendly names
df.columns = [
    "indicator_code", "indicator_name", "indicator_url", "year",
    "start_year", "end_year", "region_code", "region_name",
    "country_code", "country_name", "dimension_type", "dimension_code",
    "dimension_name", "numeric", "value", "lower_bound", "upper_bound"
]

# ✅ Verify the rename
print("✅ Columns renamed successfully:")
print(df.columns.tolist())

# Optional — quick look at first few rows
display(df.head())

✅ Columns renamed successfully:
['indicator_code', 'indicator_name', 'indicator_url', 'year', 'start_year', 'end_year', 'region_code', 'region_name', 'country_code', 'country_name', 'dimension_type', 'dimension_code', 'dimension_name', 'numeric', 'value', 'lower_bound', 'upper_bound']


Unnamed: 0,indicator_code,indicator_name,indicator_url,year,start_year,end_year,region_code,region_name,country_code,country_name,dimension_type,dimension_code,dimension_name,numeric,value,lower_bound,upper_bound
0,CM_01,Number of under-five deaths,https://www.who.int/data/gho/data/indicators/i...,2002,2002,2002,AFR,Africa,NGA,Nigeria,SEX,SEX_BTSX,Both sexes,204862.0,204 862 [190 042-221 563],190042.0,221563.0
1,MORT_300,Distribution of causes of death among children...,https://www.who.int/data/gho/data/indicators/i...,2009,2009,2009,AFR,Africa,NGA,Nigeria,AGEGROUP,AGEGROUP_MONTHS1-59,1-59 months,0.00013,0,,
2,DEATHADO,Number of deaths among adolescents (10 to 19 y...,https://www.who.int/data/gho/data/indicators/i...,2019,2019,2019,AFR,Africa,NGA,Nigeria,SEX,SEX_MLE,Male,20483.0,20 483 [14 395-28 753],14395.0,28753.0
3,MORT_100,"Number of deaths in children aged <5 years, by...",https://www.who.int/data/gho/data/indicators/i...,2006,2006,2006,AFR,Africa,NGA,Nigeria,SEX,SEX_BTSX,Both sexes,42520.70405,42520.7,,
4,CHILDMORT5TO14,Mortality rate for 5-14 year-olds (probability...,https://www.who.int/data/gho/data/indicators/i...,2016,2016,2016,AFR,Africa,NGA,Nigeria,SEX,SEX_FMLE,Female,20.68359999,20.7 [17.0-25.4],16.977280414,25.410259867


In [44]:
# ---Check distinct countries by code and name ---
unique_countries = df[['country_code', 'country_name']].drop_duplicates()
print(f"Distinct countries found: {len(unique_countries)}")
print(unique_countries.head())

Distinct countries found: 1
  country_code country_name
0          NGA      Nigeria


In [45]:
# ---Check for missing values in all columns ---
missing_values = df.isnull().sum()
print("\nMissing values per column:")
print(missing_values[missing_values > 0])


Missing values per column:
numeric          44
value            44
lower_bound    2502
upper_bound    2502
dtype: int64


In [46]:
display(df_cleaned.head())

Unnamed: 0,indicator_code,indicator_name,year,start_year,end_year,dimension_type,dimension_code,dimension_name,numeric,value
0,CM_01,Number of under-five deaths,2002,2002,2002,Sex,BTSX,Both sexes,204862.0,204 862 [190 042-221 563]
1,MORT_300,Distribution of causes of death among children...,2009,2009,2009,Sex,AGEGROUP_MONTHS1-59,,0.00013,0
2,DEATHADO,Number of deaths among adolescents (10 to 19 y...,2019,2019,2019,Sex,MLE,Male,20483.0,20 483 [14 395-28 753]
3,MORT_100,"Number of deaths in children aged <5 years, by...",2006,2006,2006,Sex,BTSX,Both sexes,42520.70405,42520.7
4,CHILDMORT5TO14,Mortality rate for 5-14 year-olds (probability...,2016,2016,2016,Sex,FMLE,Female,20.68359999,20.7 [17.0-25.4]


In [47]:
missing_rows = df_cleaned[
    df_cleaned[['numeric', 'value', 'lower_bound', 'upper_bound']].isnull().all(axis=1)
]

print(f"Rows where all four columns are missing: {len(missing_rows)}")
display(missing_rows.head())


KeyError: "['lower_bound', 'upper_bound'] not in index"

In [86]:
df.dropna(subset=['numeric', 'value', 'lower_bound', 'upper_bound'], how='all', inplace=True)


In [20]:
df_cleaned = df_cleaned[
    ~df_cleaned[['numeric', 'value', 'lower_bound', 'upper_bound']].isnull().all(axis=1)
]

print(f"✅ Remaining rows after dropping: {len(df_cleaned)}")


✅ Remaining rows after dropping: 4657


In [48]:
# ---Drop unwanted columns ---
columns_to_drop = ['indicator_url', 'region_code', 'region_name', 'country_code', 'country_name', 'lower_bound', 'upper_bound']
df_cleaned = df.drop(columns=columns_to_drop, errors='ignore')

print("\nColumns after dropping:")
print(df_cleaned.columns)


Columns after dropping:
Index(['indicator_code', 'indicator_name', 'year', 'start_year', 'end_year',
       'dimension_type', 'dimension_code', 'dimension_name', 'numeric',
       'value'],
      dtype='object')


In [22]:
# Filter rows where any of the three columns are NOT null
dim_subset = df_cleaned[
    df_cleaned[['dimension_code', 'dimension_name', 'dimension_type']].notnull().any(axis=1)
]

print(f"✅ Rows with dimension-related data: {len(dim_subset)}")
display(dim_subset[['dimension_code', 'dimension_name', 'dimension_type']].head(10))


✅ Rows with dimension-related data: 4701


Unnamed: 0,dimension_code,dimension_name,dimension_type
0,SEX_BTSX,Both sexes,SEX
1,AGEGROUP_MONTHS1-59,1-59 months,AGEGROUP
2,SEX_MLE,Male,SEX
3,SEX_BTSX,Both sexes,SEX
4,SEX_FMLE,Female,SEX
5,SEX_MLE,Male,SEX
6,AGEGROUP_YEARS0-4,0-4 years of age,AGEGROUP
7,SEX_FMLE,Female,SEX
8,AGEGROUP_DAYS0-27,0-27 days,AGEGROUP
9,AGEGROUP_DAYS0-27,0-27 days,AGEGROUP


In [49]:
# ---Check for missing values in all columns ---
missing_values = df_cleaned.isnull().sum()
print("\nMissing values per column:")
print(missing_values[missing_values > 0])


Missing values per column:
numeric    44
value      44
dtype: int64


In [50]:
# Convert to numeric (handles strings safely)
df['numeric'] = pd.to_numeric(df['numeric'], errors='coerce')
df['value'] = pd.to_numeric(df['value'], errors='coerce')
#df['lower_bound'] = pd.to_numeric(df['lower_bound'], errors='coerce')
#df['upper_bound'] = pd.to_numeric(df['upper_bound'], errors='coerce')

In [51]:
df_cleaned['numeric'].fillna(df['numeric'].mean(), inplace=True)
df_cleaned['value'].fillna(df['value'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['numeric'].fillna(df['numeric'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['value'].fillna(df['value'].mean(), inplace=True)


In [52]:
display(df_cleaned.head())

Unnamed: 0,indicator_code,indicator_name,year,start_year,end_year,dimension_type,dimension_code,dimension_name,numeric,value
0,CM_01,Number of under-five deaths,2002,2002,2002,SEX,SEX_BTSX,Both sexes,204862.0,204 862 [190 042-221 563]
1,MORT_300,Distribution of causes of death among children...,2009,2009,2009,AGEGROUP,AGEGROUP_MONTHS1-59,1-59 months,0.00013,0
2,DEATHADO,Number of deaths among adolescents (10 to 19 y...,2019,2019,2019,SEX,SEX_MLE,Male,20483.0,20 483 [14 395-28 753]
3,MORT_100,"Number of deaths in children aged <5 years, by...",2006,2006,2006,SEX,SEX_BTSX,Both sexes,42520.70405,42520.7
4,CHILDMORT5TO14,Mortality rate for 5-14 year-olds (probability...,2016,2016,2016,SEX,SEX_FMLE,Female,20.68359999,20.7 [17.0-25.4]


In [53]:
print(df_cleaned['dimension_code'].unique())

['SEX_BTSX' 'AGEGROUP_MONTHS1-59' 'SEX_MLE' 'SEX_FMLE' 'AGEGROUP_YEARS0-4'
 'AGEGROUP_DAYS0-27']


In [29]:
# Replace long-form values with shorter codes
df_cleaned['dimension_code'] = df_cleaned['dimension_code'].replace({
    'SEX_BTSX': 'BTSX',
    'SEX_MLE': 'MLE',
    'SEX_FMLE': 'FMLE'
})

# Confirm it worked
print(df_cleaned['dimension_code'].unique())

['BTSX' 'AGEGROUP_MONTHS1-59' 'MLE' 'FMLE' 'AGEGROUP_YEARS0-4'
 'AGEGROUP_DAYS0-27']


In [31]:
# define mapping
sex_mapping = {
    'BTSX': 'Both sexes',
    'MLE': 'Male',
    'FMLE': 'Female'
}
codes = list(sex_mapping.keys())

# check if all values are NaN
if df_cleaned['dimension_code'].isna().all():
    # Case 2: All missing — fill everything with BTSX
    df_cleaned['dimension_code'] = 'BTSX'
    df_cleaned['dimension_name'] = 'Both sexes'
    df_cleaned['dimension_type'] = 'Sex'
    print("⚙️ All values missing — filled with SEX_BTSX (Both sexes).")

else:
    # Case 1: Some missing — fill randomly
    np.random.seed(42)  # reproducible random assignment
    mask_missing = df_cleaned['dimension_code'].isna()
    df_cleaned.loc[mask_missing, 'dimension_code'] = np.random.choice(codes, size=mask_missing.sum())
    
    # Map names and type
    df_cleaned['dimension_name'] = df_cleaned['dimension_code'].map(sex_mapping)
    df_cleaned['dimension_type'] = 'Sex'
    print("✅ Randomly filled missing values with consistent sex codes and names.")

# Verify
display(df_cleaned[['dimension_code', 'dimension_type', 'dimension_name']].drop_duplicates())


✅ Randomly filled missing values with consistent sex codes and names.


Unnamed: 0,dimension_code,dimension_type,dimension_name
0,BTSX,Sex,Both sexes
1,AGEGROUP_MONTHS1-59,Sex,
2,MLE,Sex,Male
4,FMLE,Sex,Female
6,AGEGROUP_YEARS0-4,Sex,
8,AGEGROUP_DAYS0-27,Sex,


In [54]:
display(df_cleaned.head())

Unnamed: 0,indicator_code,indicator_name,year,start_year,end_year,dimension_type,dimension_code,dimension_name,numeric,value
0,CM_01,Number of under-five deaths,2002,2002,2002,SEX,SEX_BTSX,Both sexes,204862.0,204 862 [190 042-221 563]
1,MORT_300,Distribution of causes of death among children...,2009,2009,2009,AGEGROUP,AGEGROUP_MONTHS1-59,1-59 months,0.00013,0
2,DEATHADO,Number of deaths among adolescents (10 to 19 y...,2019,2019,2019,SEX,SEX_MLE,Male,20483.0,20 483 [14 395-28 753]
3,MORT_100,"Number of deaths in children aged <5 years, by...",2006,2006,2006,SEX,SEX_BTSX,Both sexes,42520.70405,42520.7
4,CHILDMORT5TO14,Mortality rate for 5-14 year-olds (probability...,2016,2016,2016,SEX,SEX_FMLE,Female,20.68359999,20.7 [17.0-25.4]


In [55]:
# ---Check for missing values in all columns ---
missing_values = df_cleaned.isnull().sum()
print("\nMissing values per column:")
print(missing_values[missing_values > 0])


Missing values per column:
Series([], dtype: int64)


In [56]:
# Define output file path
output_path = NOTEBOOK_DIR.parent / "processed" /"disease_indicators" /"child_mortality_indicators_nga.csv"

# Save to CSV
df_cleaned.to_csv(output_path, index=False)

print(f"Filtered data saved to: {output_path}")
print(f"Number of rows left: {len(df_cleaned)}")

Filtered data saved to: C:\Users\USER\Desktop\WORK\RenewedCare\data\processed\disease_indicators\child_mortality_indicators_nga.csv
Number of rows left: 4701
