### <span style="color:#2E6F40;">**Importing the Dependencies**</span>

In [2]:
import requests
import pandas as pd
import pycountry

### <span style="color:#2E6F40;">**Step 1: 📥 Dataset Overview & Collection**</span>
We are going to collect data from **4 public WHO API endpoints**:

1. **Obesity data for adults**  
   ➤ URL: [`https://ghoapi.azureedge.net/api/NCD_BMI_30C`](https://ghoapi.azureedge.net/api/NCD_BMI_30C)

2. **Obesity data for children**  
   ➤ URL: [`https://ghoapi.azureedge.net/api/NCD_BMI_PLUS2C`](https://ghoapi.azureedge.net/api/NCD_BMI_PLUS2C)

3. **Underweight data for adults**  
   ➤ URL: [`https://ghoapi.azureedge.net/api/NCD_BMI_18C`](https://ghoapi.azureedge.net/api/NCD_BMI_18C)

4. **Underweight data for children**  
   ➤ URL: [`https://ghoapi.azureedge.net/api/NCD_BMI_MINUS2C`](https://ghoapi.azureedge.net/api/NCD_BMI_MINUS2C)

### <span style="color:#2E6F40;">**Define function for converting API data into dataframe**</span>

In [5]:
def convert_dataframe(url):
    result = requests.get(url)
    data = result.json()
    df = pd.DataFrame(data['value'])
    return df

### <span style="color:#2E6F40;">**1. Load all 4 datasets into dataframe**</span>

#### <span style="color:#4272FF;">**1. Obesity data for adults**</span>

In [8]:
url = "https://ghoapi.azureedge.net/api/NCD_BMI_30C"
obesity_adults = convert_dataframe(url)
obesity_adults.head(2)

Unnamed: 0,Id,IndicatorCode,SpatialDimType,SpatialDim,TimeDimType,ParentLocationCode,ParentLocation,Dim1Type,TimeDim,Dim1,...,DataSourceDim,Value,NumericValue,Low,High,Comments,Date,TimeDimensionValue,TimeDimensionBegin,TimeDimensionEnd
0,5540695,NCD_BMI_30C,COUNTRY,MDA,YEAR,EUR,Europe,SEX,2018,SEX_BTSX,...,,25.9 [23.9-28.0],25.94346,23.92528,28.04794,,2024-02-29T16:06:41.017+01:00,2018,2018-01-01T00:00:00+01:00,2018-12-31T00:00:00+01:00
1,5541156,NCD_BMI_30C,COUNTRY,IRL,YEAR,EUR,Europe,SEX,1995,SEX_MLE,...,,16.3 [13.6-19.2],16.26016,13.62936,19.19998,,2024-02-29T16:06:41.017+01:00,1995,1995-01-01T00:00:00+01:00,1995-12-31T00:00:00+01:00


#### <span style="color:#4272FF;">**2. Obesity data for children**</span>

In [10]:
url = "https://ghoapi.azureedge.net/api/NCD_BMI_PLUS2C"
obesity_children = convert_dataframe(url)
obesity_children.head(2)

Unnamed: 0,Id,IndicatorCode,SpatialDimType,SpatialDim,TimeDimType,ParentLocationCode,ParentLocation,Dim1Type,TimeDim,Dim1,...,DataSourceDim,Value,NumericValue,Low,High,Comments,Date,TimeDimensionValue,TimeDimensionBegin,TimeDimensionEnd
0,5540919,NCD_BMI_PLUS2C,COUNTRY,CUB,YEAR,AMR,Americas,SEX,2021,SEX_MLE,...,,12.5 [8.5-17.2],12.52724,8.46905,17.19727,,2024-02-29T16:06:41.017+01:00,2021,2021-01-01T00:00:00+01:00,2021-12-31T00:00:00+01:00
1,5540986,NCD_BMI_PLUS2C,COUNTRY,PNG,YEAR,WPR,Western Pacific,SEX,2000,SEX_MLE,...,,8.0 [1.7-19.8],8.01665,1.68665,19.76374,,2024-02-29T16:06:41.017+01:00,2000,2000-01-01T00:00:00+01:00,2000-12-31T00:00:00+01:00


#### <span style="color:#4272FF;">**3. Underweight data for adults**</span>

In [12]:
url = "https://ghoapi.azureedge.net/api/NCD_BMI_18C"
underweight_adults = convert_dataframe(url)
underweight_adults.head(2)

Unnamed: 0,Id,IndicatorCode,SpatialDimType,SpatialDim,ParentLocationCode,TimeDimType,ParentLocation,Dim1Type,TimeDim,Dim1,...,DataSourceDim,Value,NumericValue,Low,High,Comments,Date,TimeDimensionValue,TimeDimensionBegin,TimeDimensionEnd
0,5540762,NCD_BMI_18C,COUNTRY,PER,AMR,YEAR,Americas,SEX,2011,SEX_FMLE,...,,1.6 [1.4-1.8],1.63089,1.43569,1.8362,,2024-02-29T16:06:41.017+01:00,2011,2011-01-01T00:00:00+01:00,2011-12-31T00:00:00+01:00
1,5541121,NCD_BMI_18C,COUNTRY,UGA,AFR,YEAR,Africa,SEX,2020,SEX_BTSX,...,,11.3 [9.7-12.9],11.27629,9.70523,12.91453,,2024-02-29T16:06:41.017+01:00,2020,2020-01-01T00:00:00+01:00,2020-12-31T00:00:00+01:00


#### <span style="color:#4272FF;">**4. Underweight data for children**</span>

In [14]:
url = "https://ghoapi.azureedge.net/api/NCD_BMI_MINUS2C"
underweight_children = convert_dataframe(url)
underweight_children.head(2)

Unnamed: 0,Id,IndicatorCode,SpatialDimType,SpatialDim,ParentLocationCode,TimeDimType,ParentLocation,Dim1Type,Dim1,TimeDim,...,DataSourceDim,Value,NumericValue,Low,High,Comments,Date,TimeDimensionValue,TimeDimensionBegin,TimeDimensionEnd
0,5540609,NCD_BMI_MINUS2C,COUNTRY,IRL,EUR,YEAR,Europe,SEX,SEX_FMLE,1998,...,,0.8 [0.4-1.4],0.83126,0.41813,1.43478,,2024-02-29T16:06:41.017+01:00,1998,1998-01-01T00:00:00+01:00,1998-12-31T00:00:00+01:00
1,5540614,NCD_BMI_MINUS2C,COUNTRY,RUS,EUR,YEAR,Europe,SEX,SEX_FMLE,2010,...,,3.4 [2.6-4.3],3.40803,2.61038,4.28252,,2024-02-29T16:06:41.017+01:00,2010,2010-01-01T00:00:00+01:00,2010-12-31T00:00:00+01:00


### <span style="color:#2E6F40;">**🔄 Preprocessing Steps**</span>

#### <span style="color:#4272FF;">**2. Add a new column <u>*age_group*</u> to distinguish adults and children**</span>

In [17]:
obesity_adults['age_group'] = 'adult'
obesity_children['age_group'] = 'children'
underweight_adults['age_group'] = 'adult'
underweight_children['age_group'] = 'children'

In [18]:
# verify one df
print(obesity_adults.columns)
obesity_adults.head(2)

Index(['Id', 'IndicatorCode', 'SpatialDimType', 'SpatialDim', 'TimeDimType',
       'ParentLocationCode', 'ParentLocation', 'Dim1Type', 'TimeDim', 'Dim1',
       'Dim2Type', 'Dim2', 'Dim3Type', 'Dim3', 'DataSourceDimType',
       'DataSourceDim', 'Value', 'NumericValue', 'Low', 'High', 'Comments',
       'Date', 'TimeDimensionValue', 'TimeDimensionBegin', 'TimeDimensionEnd',
       'age_group'],
      dtype='object')


Unnamed: 0,Id,IndicatorCode,SpatialDimType,SpatialDim,TimeDimType,ParentLocationCode,ParentLocation,Dim1Type,TimeDim,Dim1,...,Value,NumericValue,Low,High,Comments,Date,TimeDimensionValue,TimeDimensionBegin,TimeDimensionEnd,age_group
0,5540695,NCD_BMI_30C,COUNTRY,MDA,YEAR,EUR,Europe,SEX,2018,SEX_BTSX,...,25.9 [23.9-28.0],25.94346,23.92528,28.04794,,2024-02-29T16:06:41.017+01:00,2018,2018-01-01T00:00:00+01:00,2018-12-31T00:00:00+01:00,adult
1,5541156,NCD_BMI_30C,COUNTRY,IRL,YEAR,EUR,Europe,SEX,1995,SEX_MLE,...,16.3 [13.6-19.2],16.26016,13.62936,19.19998,,2024-02-29T16:06:41.017+01:00,1995,1995-01-01T00:00:00+01:00,1995-12-31T00:00:00+01:00,adult


#### <span style="color:#4272FF;">**3. Combine the two obesity datasets into one dataframe called <u>*df_obesity*</u>**</span>

In [20]:
print(obesity_adults.shape)
print(obesity_children.shape)

(20790, 26)
(62370, 26)


In [21]:
# concat two dataframe
df_obesity = pd.concat([obesity_adults, obesity_children], ignore_index=True, sort=False)

In [22]:
df_obesity.shape

(83160, 26)

In [23]:
df_obesity['age_group'].value_counts()

age_group
children    62370
adult       20790
Name: count, dtype: int64

#### <span style="color:#4272FF;">**4. Combine the two malnutrition datasets into one dataframe called <u>*df_malnutrition*</u>**</span>

In [25]:
print(underweight_adults.shape)
print(underweight_children.shape)

(20790, 26)
(62370, 26)


In [26]:
# concat two dataframe
df_malnutrition = pd.concat([underweight_adults, underweight_children], ignore_index=True, sort=False)

In [27]:
df_malnutrition.shape

(83160, 26)

In [28]:
df_malnutrition['age_group'].value_counts()

age_group
children    62370
adult       20790
Name: count, dtype: int64

#### <span style="color:#4272FF;">**5. Filter each dataset to include only records from the years <u>*2012 to 2022*</u>**</span>

In [30]:
df_obesity = df_obesity[(df_obesity['TimeDim'] >= 2012) & (df_obesity['TimeDim'] <= 2022)]

In [31]:
print(df_obesity.shape)
df_obesity['age_group'].value_counts()

(27720, 26)


age_group
children    20790
adult        6930
Name: count, dtype: int64

In [32]:
df_malnutrition = df_malnutrition[(df_malnutrition['TimeDim'] >= 2012) & (df_malnutrition['TimeDim'] <= 2022)]

In [33]:
print(df_malnutrition.shape)
df_malnutrition['age_group'].value_counts()

(27720, 26)


age_group
children    20790
adult        6930
Name: count, dtype: int64

### <span style="color:#2E6F40;">**Step 2: 🧹 Data Cleaning & Feature Engineering**</span>

#### <span style="color:#4272FF;">**Keep necessary columns**</span>

In [36]:
df_obesity.columns

Index(['Id', 'IndicatorCode', 'SpatialDimType', 'SpatialDim', 'TimeDimType',
       'ParentLocationCode', 'ParentLocation', 'Dim1Type', 'TimeDim', 'Dim1',
       'Dim2Type', 'Dim2', 'Dim3Type', 'Dim3', 'DataSourceDimType',
       'DataSourceDim', 'Value', 'NumericValue', 'Low', 'High', 'Comments',
       'Date', 'TimeDimensionValue', 'TimeDimensionBegin', 'TimeDimensionEnd',
       'age_group'],
      dtype='object')

In [37]:
df_obesity = df_obesity[['ParentLocation', 'Dim1', 'TimeDim', 'Low', 'High', 'NumericValue', 'SpatialDim', 'age_group']]
df_obesity.sample(2)

Unnamed: 0,ParentLocation,Dim1,TimeDim,Low,High,NumericValue,SpatialDim,age_group
67245,Eastern Mediterranean,SEX_FMLE,2015,6.16674,11.07013,8.4271,PSE,children
32816,Eastern Mediterranean,SEX_FMLE,2021,8.87601,21.69887,14.90747,OMN,children


In [38]:
df_malnutrition = df_malnutrition[['ParentLocation', 'Dim1', 'TimeDim', 'Low', 'High', 'NumericValue', 'SpatialDim', 'age_group']]
df_obesity.sample(2)

Unnamed: 0,ParentLocation,Dim1,TimeDim,Low,High,NumericValue,SpatialDim,age_group
38934,Western Pacific,SEX_MLE,2013,7.73364,24.10121,15.17648,SGP,children
46523,Europe,SEX_MLE,2014,7.75662,12.29987,9.84622,BGR,children


#### <span style="color:#4272FF;">**Rename Columns**</span>

In [40]:
new_columns = {'TimeDim': 'Year', 'Dim1': 'Gender', 'NumericValue': 'Mean_Estimate',
               'Low': 'LowerBound', 'High': 'UpperBound', 'ParentLocation': 'Region',
              'SpatialDim': 'Country'}
df_obesity = df_obesity.rename(columns=new_columns)
df_obesity.sample(2)

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group
5057,Africa,SEX_BTSX,2021,4.20547,6.07868,5.08075,NER,adult
2597,Europe,SEX_BTSX,2018,30.12629,33.72573,31.86981,TUR,adult


In [41]:
df_malnutrition = df_malnutrition.rename(columns=new_columns)
df_malnutrition.sample(2)

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group
19153,,SEX_BTSX,2015,11.55091,12.59849,12.0706,AFR,adult
56529,Americas,SEX_FMLE,2017,0.16926,2.857,0.95025,BOL,children


#### <span style="color:#4272FF;">**Convert Country Codes to Full Names using <u>*pycountry*</u>**</span>

In [43]:
# Define a function to convert the 3-letter codes to full names
def country_conversion(code):
    special_cases = {
                    'GLOBAL': 'Global',
                    'WB_LMI': 'Low & Middle Income',
                    'WB_HI': 'High Income',
                    'WB_LI': 'Low Income',
                    'EMR': 'Eastern Mediterranean Region',
                    'EUR': 'Europe',
                    'AFR': 'Africa',
                    'SEAR': 'South-East Asia Region',
                    'WPR': 'Western Pacific Region',
                    'AMR': 'Americas Region',
                    'WB_UMI': 'Upper Middle Income'}
    if code in special_cases:
        return special_cases[code]
    else:
        country_details = pycountry.countries.get(alpha_3 = code)
        country_name = country_details.name
        return country_name

In [44]:
df_obesity['Country'] = df_obesity['Country'].apply(country_conversion)

In [45]:
df_obesity.sample(2)

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group
12242,Europe,SEX_FMLE,2020,21.41272,32.91575,26.84728,Slovakia,adult
71838,Americas,SEX_MLE,2017,7.60953,33.7839,19.25124,Dominica,children


In [46]:
df_malnutrition['Country'] = df_malnutrition['Country'].apply(country_conversion)

In [47]:
df_malnutrition.sample(2)

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group
5556,Americas,SEX_BTSX,2015,2.70096,6.58473,4.4079,Grenada,adult
19757,Americas,SEX_BTSX,2019,2.49027,5.70946,3.90661,Saint Vincent and the Grenadines,adult


#### <span style="color:#4272FF;">**New Columns Creation**</span>

- **CI_Width column creation**

In [50]:
df_obesity['CI_Width'] = df_obesity['UpperBound'] - df_obesity['LowerBound']
df_malnutrition['CI_Width'] = df_malnutrition['UpperBound'] - df_malnutrition['LowerBound']

- **obesity_level column creation - (for the obesity table only)**

In [52]:
# Define function to categorize the obesity_level
def obesity_level(value):
    if value >= 30:
        return 'High'
    elif value < 25:
        return 'Low'
    else:
        return 'Moderate'

In [53]:
df_obesity['Obesity_level'] = df_obesity['Mean_Estimate'].apply(obesity_level)
df_obesity.head()

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group,CI_Width,Obesity_level
0,Europe,SEX_BTSX,2018,23.92528,28.04794,25.94346,"Moldova, Republic of",adult,4.12266,Moderate
5,Eastern Mediterranean,SEX_MLE,2020,15.79378,19.24573,17.48764,"Iran, Islamic Republic of",adult,3.45195,Low
12,Eastern Mediterranean,SEX_MLE,2020,23.08959,33.81534,28.33553,Iraq,adult,10.72575,Moderate
18,Europe,SEX_FMLE,2014,24.92477,31.21326,27.9487,Greenland,adult,6.28849,Moderate
20,Europe,SEX_FMLE,2019,28.16068,35.57493,31.89322,Armenia,adult,7.41425,High


- **Malnutrition_Level column creation - (for the malnutrition table only)**

In [55]:
# Define function to categorize the Malnutrition_Level
def malnutrition_level(value):
    if value >= 20:
        return 'High'
    elif value < 10:
        return 'Low'
    else:
        return 'Moderate'

In [56]:
df_malnutrition['Malnutrition_Level'] = df_malnutrition['Mean_Estimate'].apply(malnutrition_level)
df_malnutrition.head()

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group,CI_Width,Malnutrition_Level
1,Africa,SEX_BTSX,2020,9.70523,12.91453,11.27629,Uganda,adult,3.2093,Moderate
3,Africa,SEX_FMLE,2015,8.24333,12.07013,10.0801,Mali,adult,3.8268,Moderate
4,Africa,SEX_FMLE,2021,3.97204,8.07702,5.79511,Gabon,adult,4.10498,Low
8,Western Pacific,SEX_BTSX,2017,2.42823,3.84215,3.09065,Mongolia,adult,1.41392,Low
9,Africa,SEX_FMLE,2016,6.65567,10.25312,8.37172,Sierra Leone,adult,3.59745,Low


#### <span style="color:#4272FF;">**Feature Engineering For Gender Column**</span>

In [58]:
df_obesity.Gender.value_counts()

Gender
SEX_BTSX    9240
SEX_MLE     9240
SEX_FMLE    9240
Name: count, dtype: int64

In [59]:
df_obesity['Gender'] = df_obesity['Gender'].apply(lambda x: 'Male' if x == 'SEX_MLE' else
                                                  'Female' if x == 'SEX_FMLE' else 'Both')

In [60]:
df_obesity.Gender.value_counts()

Gender
Both      9240
Male      9240
Female    9240
Name: count, dtype: int64

In [61]:
df_obesity.head(3)

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group,CI_Width,Obesity_level
0,Europe,Both,2018,23.92528,28.04794,25.94346,"Moldova, Republic of",adult,4.12266,Moderate
5,Eastern Mediterranean,Male,2020,15.79378,19.24573,17.48764,"Iran, Islamic Republic of",adult,3.45195,Low
12,Eastern Mediterranean,Male,2020,23.08959,33.81534,28.33553,Iraq,adult,10.72575,Moderate


In [62]:
df_malnutrition['Gender'] = df_malnutrition['Gender'].apply(lambda x: 'Male' if x == 'SEX_MLE' else
                                                            'Female' if x == 'SEX_FMLE' else 'Both')

In [63]:
df_malnutrition.Gender.value_counts()

Gender
Both      9240
Female    9240
Male      9240
Name: count, dtype: int64

In [64]:
df_malnutrition.head(3)

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group,CI_Width,Malnutrition_Level
1,Africa,Both,2020,9.70523,12.91453,11.27629,Uganda,adult,3.2093,Moderate
3,Africa,Female,2015,8.24333,12.07013,10.0801,Mali,adult,3.8268,Moderate
4,Africa,Female,2021,3.97204,8.07702,5.79511,Gabon,adult,4.10498,Low
