In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

In [40]:
# Get the current working directory (where your notebook is running)
NOTEBOOK_DIR = Path(os.getcwd())

# Navigate to your dataset subfolder
DATA_DIR = NOTEBOOK_DIR.parent /"raw" / "Disease_and_Mobidity_data"

FILE_NAME = "maternal_and_reproductive_health_indicators_nga.csv"
FILE_PATH = DATA_DIR / FILE_NAME

#print("Looking for file at:", FILE_PATH)

print(DATA_DIR.exists())   # Should be True

# Load dataset
df = pd.read_csv(FILE_PATH)

# === 1. Check basic info ===
print("✅ Dataset loaded successfully!")
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")
print("\n📋 Column names:")
print(df.columns.tolist())

True
✅ Dataset loaded successfully!
Number of rows: 214
Number of columns: 17

📋 Column names:
['GHO (CODE)', 'GHO (DISPLAY)', 'GHO (URL)', 'YEAR (DISPLAY)', 'STARTYEAR', 'ENDYEAR', 'REGION (CODE)', 'REGION (DISPLAY)', 'COUNTRY (CODE)', 'COUNTRY (DISPLAY)', 'DIMENSION (TYPE)', 'DIMENSION (CODE)', 'DIMENSION (NAME)', 'Numeric', 'Value', 'Low', 'High']


In [41]:
display(df.head())

Unnamed: 0,GHO (CODE),GHO (DISPLAY),GHO (URL),YEAR (DISPLAY),STARTYEAR,ENDYEAR,REGION (CODE),REGION (DISPLAY),COUNTRY (CODE),COUNTRY (DISPLAY),DIMENSION (TYPE),DIMENSION (CODE),DIMENSION (NAME),Numeric,Value,Low,High
0,#indicator+code,#indicator+name,#indicator+url,#date+year,#date+year+start,#date+year+end,#region+code,#region+name,#country+code,#country+name,#dimension+type,#dimension+code,#dimension+name,#indicator+value+num,#indicator+value,#indicator+value+low,#indicator+value+high
1,NUTRITION_ANAEMIA_REPRODUCTIVEAGE_NUM,Number of women of reproductive age (aged 15-4...,https://www.who.int/data/gho/data/indicators/i...,2004,2004,2004,AFR,Africa,NGA,Nigeria,SEX,SEX_FMLE,Female,2086.2,2086.2 [1565.9-2579.7],1565.9,2579.7
2,NUTRITION_ANAEMIA_REPRODUCTIVEAGE_PREV,Prevalence of anaemia in women of reproductive...,https://www.who.int/data/gho/data/indicators/i...,2017,2017,2017,AFR,Africa,NGA,Nigeria,SEX,SEX_FMLE,Female,40.7,40.7 [30.7-51.0],30.7,51.0
3,NUTRITION_ANAEMIA_REPRODUCTIVEAGE_PREV,Prevalence of anaemia in women of reproductive...,https://www.who.int/data/gho/data/indicators/i...,2015,2015,2015,AFR,Africa,NGA,Nigeria,SEX,SEX_FMLE,Female,41.8,41.8 [30.9-53.4],30.9,53.4
4,MDG_0000000003,Adolescent birth rate (per 1000 women),https://www.who.int/data/gho/data/indicators/i...,2003,2003,2003,AFR,Africa,NGA,Nigeria,SEX,SEX_FMLE,Female,152.0,152.0,,


In [42]:
# --- Drop the first row ---
df = df.drop(index=0).reset_index(drop=True)

In [43]:
# Rename columns (matching lowercase names)
# 🧹 Rename columns to clear, Python-friendly names
df.columns = [
    "indicator_code", "indicator_name", "indicator_url", "year",
    "start_year", "end_year", "region_code", "region_name",
    "country_code", "country_name", "sex_type", "sex_code",
    "sex_name", "numeric", "value", "lower_bound", "upper_bound"
]

# ✅ Verify the rename
print("✅ Columns renamed successfully:")
print(df.columns.tolist())

# Optional — quick look at first few rows
display(df.head())

✅ Columns renamed successfully:
['indicator_code', 'indicator_name', 'indicator_url', 'year', 'start_year', 'end_year', 'region_code', 'region_name', 'country_code', 'country_name', 'sex_type', 'sex_code', 'sex_name', 'numeric', 'value', 'lower_bound', 'upper_bound']


Unnamed: 0,indicator_code,indicator_name,indicator_url,year,start_year,end_year,region_code,region_name,country_code,country_name,sex_type,sex_code,sex_name,numeric,value,lower_bound,upper_bound
0,NUTRITION_ANAEMIA_REPRODUCTIVEAGE_NUM,Number of women of reproductive age (aged 15-4...,https://www.who.int/data/gho/data/indicators/i...,2004,2004,2004,AFR,Africa,NGA,Nigeria,SEX,SEX_FMLE,Female,2086.2,2086.2 [1565.9-2579.7],1565.9,2579.7
1,NUTRITION_ANAEMIA_REPRODUCTIVEAGE_PREV,Prevalence of anaemia in women of reproductive...,https://www.who.int/data/gho/data/indicators/i...,2017,2017,2017,AFR,Africa,NGA,Nigeria,SEX,SEX_FMLE,Female,40.7,40.7 [30.7-51.0],30.7,51.0
2,NUTRITION_ANAEMIA_REPRODUCTIVEAGE_PREV,Prevalence of anaemia in women of reproductive...,https://www.who.int/data/gho/data/indicators/i...,2015,2015,2015,AFR,Africa,NGA,Nigeria,SEX,SEX_FMLE,Female,41.8,41.8 [30.9-53.4],30.9,53.4
3,MDG_0000000003,Adolescent birth rate (per 1000 women),https://www.who.int/data/gho/data/indicators/i...,2003,2003,2003,AFR,Africa,NGA,Nigeria,SEX,SEX_FMLE,Female,152.0,152.0,,
4,NUTRITION_ANAEMIA_REPRODUCTIVEAGE_PREV,Prevalence of anaemia in women of reproductive...,https://www.who.int/data/gho/data/indicators/i...,2004,2004,2004,AFR,Africa,NGA,Nigeria,SEX,SEX_FMLE,Female,53.8,53.8 [40.4-66.5],40.4,66.5


In [44]:
# ---Check distinct countries by code and name ---
unique_countries = df[['country_code', 'country_name']].drop_duplicates()
print(f"Distinct countries found: {len(unique_countries)}")
print(unique_countries.head())

Distinct countries found: 1
  country_code country_name
0          NGA      Nigeria


In [45]:
# ---Check for missing values in all columns ---
missing_values = df.isnull().sum()
print("\nMissing values per column:")
print(missing_values[missing_values > 0])


Missing values per column:
sex_type       31
sex_code       31
sex_name       31
lower_bound    69
upper_bound    69
dtype: int64


In [46]:
# ---Drop unwanted columns ---
columns_to_drop = ['indicator_url', 'region_code', 'region_name', 'country_code', 'country_name']
df_cleaned = df.drop(columns=columns_to_drop, errors='ignore')

print("\nColumns after dropping:")
print(df_cleaned.columns)


Columns after dropping:
Index(['indicator_code', 'indicator_name', 'year', 'start_year', 'end_year',
       'sex_type', 'sex_code', 'sex_name', 'numeric', 'value', 'lower_bound',
       'upper_bound'],
      dtype='object')


In [47]:
# Filter rows where any of the three columns are NOT null
dim_subset = df_cleaned[
    df_cleaned[['sex_code', 'sex_name', 'sex_type']].notnull().any(axis=1)
]

print(f"✅ Rows with dimension-related data: {len(dim_subset)}")
display(dim_subset[['sex_code', 'sex_name', 'sex_type']].head(10))


✅ Rows with dimension-related data: 182


Unnamed: 0,sex_code,sex_name,sex_type
0,SEX_FMLE,Female,SEX
1,SEX_FMLE,Female,SEX
2,SEX_FMLE,Female,SEX
3,SEX_FMLE,Female,SEX
4,SEX_FMLE,Female,SEX
6,SEX_FMLE,Female,SEX
7,SEX_FMLE,Female,SEX
8,SEX_FMLE,Female,SEX
9,SEX_FMLE,Female,SEX
10,SEX_FMLE,Female,SEX


In [48]:
# Convert to numeric (handles strings safely)
df['numeric'] = pd.to_numeric(df['numeric'], errors='coerce')
df['value'] = pd.to_numeric(df['value'], errors='coerce')
df['lower_bound'] = pd.to_numeric(df['lower_bound'], errors='coerce')
df['upper_bound'] = pd.to_numeric(df['upper_bound'], errors='coerce')

In [49]:
df_cleaned['lower_bound'].fillna(df['lower_bound'].mean(), inplace=True)
df_cleaned['upper_bound'].fillna(df['upper_bound'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['lower_bound'].fillna(df['lower_bound'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['upper_bound'].fillna(df['upper_bound'].mean(), inplace=True)


In [50]:
display(df_cleaned.head())

Unnamed: 0,indicator_code,indicator_name,year,start_year,end_year,sex_type,sex_code,sex_name,numeric,value,lower_bound,upper_bound
0,NUTRITION_ANAEMIA_REPRODUCTIVEAGE_NUM,Number of women of reproductive age (aged 15-4...,2004,2004,2004,SEX,SEX_FMLE,Female,2086.2,2086.2 [1565.9-2579.7],1565.9,2579.7
1,NUTRITION_ANAEMIA_REPRODUCTIVEAGE_PREV,Prevalence of anaemia in women of reproductive...,2017,2017,2017,SEX,SEX_FMLE,Female,40.7,40.7 [30.7-51.0],30.7,51.0
2,NUTRITION_ANAEMIA_REPRODUCTIVEAGE_PREV,Prevalence of anaemia in women of reproductive...,2015,2015,2015,SEX,SEX_FMLE,Female,41.8,41.8 [30.9-53.4],30.9,53.4
3,MDG_0000000003,Adolescent birth rate (per 1000 women),2003,2003,2003,SEX,SEX_FMLE,Female,152.0,152.0,4176.167361,7583.347917
4,NUTRITION_ANAEMIA_REPRODUCTIVEAGE_PREV,Prevalence of anaemia in women of reproductive...,2004,2004,2004,SEX,SEX_FMLE,Female,53.8,53.8 [40.4-66.5],40.4,66.5


In [51]:
# Replace long-form values with shorter codes
df_cleaned['sex_code'] = df_cleaned['sex_code'].replace({
    'SEX_BTSX': 'BTSX',
    'SEX_MLE': 'MLE',
    'SEX_FMLE': 'FMLE'
})

# Confirm it worked
print(df_cleaned['sex_code'].unique())

['FMLE' nan]


In [39]:
# define mapping
sex_mapping = {
    'BTSX': 'Both sexes',
    'MLE': 'Male',
    'FMLE': 'Female'
}
codes = list(sex_mapping.keys())

# check if all values are NaN
if df_cleaned['sex_code'].isna().all():
    # Case 2: All missing — fill everything with BTSX
    df_cleaned['sex_code'] = 'FMLE'
    df_cleaned['sex_name'] = 'Female'
    df_cleaned['sex_type'] = 'Sex'
    print("⚙️ All values missing — filled with SEX_BTSX (Both sexes).")

else:
    # Case 1: Some missing — fill randomly
    np.random.seed(42)  # reproducible random assignment
    mask_missing = df_cleaned['sex_code'].isna()
    df_cleaned.loc[mask_missing, 'sex_code'] = np.random.choice(codes, size=mask_missing.sum())
    
    # Map names and type
    df_cleaned['sex_name'] = df_cleaned['sex_code'].map(sex_mapping)
    df_cleaned['sex_type'] = 'Sex'
    print("✅ Randomly filled missing values with consistent sex codes and names.")

# Verify
display(df_cleaned[['sex_code', 'sex_type', 'sex_name']].drop_duplicates())


✅ Randomly filled missing values with consistent sex codes and names.


Unnamed: 0,sex_code,sex_type,sex_name
0,FMLE,Sex,Female
22,BTSX,Sex,Both sexes
44,MLE,Sex,Male


In [52]:
display(df_cleaned.head())

Unnamed: 0,indicator_code,indicator_name,year,start_year,end_year,sex_type,sex_code,sex_name,numeric,value,lower_bound,upper_bound
0,NUTRITION_ANAEMIA_REPRODUCTIVEAGE_NUM,Number of women of reproductive age (aged 15-4...,2004,2004,2004,SEX,FMLE,Female,2086.2,2086.2 [1565.9-2579.7],1565.9,2579.7
1,NUTRITION_ANAEMIA_REPRODUCTIVEAGE_PREV,Prevalence of anaemia in women of reproductive...,2017,2017,2017,SEX,FMLE,Female,40.7,40.7 [30.7-51.0],30.7,51.0
2,NUTRITION_ANAEMIA_REPRODUCTIVEAGE_PREV,Prevalence of anaemia in women of reproductive...,2015,2015,2015,SEX,FMLE,Female,41.8,41.8 [30.9-53.4],30.9,53.4
3,MDG_0000000003,Adolescent birth rate (per 1000 women),2003,2003,2003,SEX,FMLE,Female,152.0,152.0,4176.167361,7583.347917
4,NUTRITION_ANAEMIA_REPRODUCTIVEAGE_PREV,Prevalence of anaemia in women of reproductive...,2004,2004,2004,SEX,FMLE,Female,53.8,53.8 [40.4-66.5],40.4,66.5


In [53]:
mask = df_cleaned[['sex_code', 'sex_type', 'sex_name']].isna().all(axis=1)
df_cleaned.loc[mask, ['sex_code', 'sex_type', 'sex_name']] = ['FMLE', 'Sex', 'Female']


In [54]:
# Replace long-form values with shorter codes
df_cleaned['sex_code'] = df_cleaned['sex_code'].replace({
    'SEX_BTSX': 'BTSX',
    'SEX_MLE': 'MLE',
    'SEX_FMLE': 'FMLE'
})

# Confirm it worked
print(df_cleaned['sex_code'].unique())

['FMLE']


In [55]:
# Define output file path
output_path = NOTEBOOK_DIR.parent / "processed" / "maternal_and_reproductive_health_indicators_nga.csv"

# Save to CSV
df_cleaned.to_csv(output_path, index=False)

print(f"Filtered data saved to: {output_path}")
print(f"Number of rows left: {len(df_cleaned)}")

Filtered data saved to: C:\Users\USER\Desktop\WORK\RenewedCare\data\processed\maternal_and_reproductive_health_indicators_nga.csv
Number of rows left: 213
