### <span style="color:#2E6F40;">**Importing the Dependencies**</span>

In [2]:
import requests
import pandas as pd

### <span style="color:#2E6F40;">**📥 Data Collection**</span>
We are going to collect data from **4 public WHO API endpoints**:

1. **Obesity data for adults**  
   ➤ URL: [`https://ghoapi.azureedge.net/api/NCD_BMI_30C`](https://ghoapi.azureedge.net/api/NCD_BMI_30C)

2. **Obesity data for children**  
   ➤ URL: [`https://ghoapi.azureedge.net/api/NCD_BMI_PLUS2C`](https://ghoapi.azureedge.net/api/NCD_BMI_PLUS2C)

3. **Underweight data for adults**  
   ➤ URL: [`https://ghoapi.azureedge.net/api/NCD_BMI_18C`](https://ghoapi.azureedge.net/api/NCD_BMI_18C)

4. **Underweight data for children**  
   ➤ URL: [`https://ghoapi.azureedge.net/api/NCD_BMI_MINUS2C`](https://ghoapi.azureedge.net/api/NCD_BMI_MINUS2C)

### <span style="color:#2E6F40;">**Define a function for converting API data into a DataFrame**</span>

In [5]:
def convert_dataframe(url, timeout=30):
    """Download one WHO API indicator and return its records as a DataFrame.

    Parameters
    ----------
    url : str
        Endpoint to query. The JSON response must hold the records under
        the top-level ``"value"`` key.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Defaults to 30 so a stalled connection cannot hang the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per element of the response's ``"value"`` list.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    KeyError
        If the JSON payload has no ``"value"`` key.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error body.
    response.raise_for_status()
    payload = response.json()
    return pd.DataFrame(payload['value'])

### <span style="color:#2E6F40;">**1. Load all 4 datasets into DataFrames**</span>

#### <span style="color:#4272FF;">**1. Obesity data for adults**</span>

In [8]:
# Adult obesity indicator (NCD_BMI_30C) from the WHO API.
url = "https://ghoapi.azureedge.net/api/NCD_BMI_30C"
obesity_adults = convert_dataframe(url=url)

# Preview the first two records.
obesity_adults.head(n=2)

Unnamed: 0,Id,IndicatorCode,SpatialDimType,SpatialDim,TimeDimType,ParentLocationCode,ParentLocation,Dim1Type,TimeDim,Dim1,...,DataSourceDim,Value,NumericValue,Low,High,Comments,Date,TimeDimensionValue,TimeDimensionBegin,TimeDimensionEnd
0,5540695,NCD_BMI_30C,COUNTRY,MDA,YEAR,EUR,Europe,SEX,2018,SEX_BTSX,...,,25.9 [23.9-28.0],25.94346,23.92528,28.04794,,2024-02-29T16:06:41.017+01:00,2018,2018-01-01T00:00:00+01:00,2018-12-31T00:00:00+01:00
1,5541156,NCD_BMI_30C,COUNTRY,IRL,YEAR,EUR,Europe,SEX,1995,SEX_MLE,...,,16.3 [13.6-19.2],16.26016,13.62936,19.19998,,2024-02-29T16:06:41.017+01:00,1995,1995-01-01T00:00:00+01:00,1995-12-31T00:00:00+01:00


#### <span style="color:#4272FF;">**2. Obesity data for children**</span>

In [10]:
# Child obesity indicator (NCD_BMI_PLUS2C) from the WHO API.
url = "https://ghoapi.azureedge.net/api/NCD_BMI_PLUS2C"
obesity_children = convert_dataframe(url=url)

# Preview the first two records.
obesity_children.head(n=2)

Unnamed: 0,Id,IndicatorCode,SpatialDimType,SpatialDim,TimeDimType,ParentLocationCode,ParentLocation,Dim1Type,TimeDim,Dim1,...,DataSourceDim,Value,NumericValue,Low,High,Comments,Date,TimeDimensionValue,TimeDimensionBegin,TimeDimensionEnd
0,5540919,NCD_BMI_PLUS2C,COUNTRY,CUB,YEAR,AMR,Americas,SEX,2021,SEX_MLE,...,,12.5 [8.5-17.2],12.52724,8.46905,17.19727,,2024-02-29T16:06:41.017+01:00,2021,2021-01-01T00:00:00+01:00,2021-12-31T00:00:00+01:00
1,5540986,NCD_BMI_PLUS2C,COUNTRY,PNG,YEAR,WPR,Western Pacific,SEX,2000,SEX_MLE,...,,8.0 [1.7-19.8],8.01665,1.68665,19.76374,,2024-02-29T16:06:41.017+01:00,2000,2000-01-01T00:00:00+01:00,2000-12-31T00:00:00+01:00


#### <span style="color:#4272FF;">**3. Underweight data for adults**</span>

In [12]:
# Adult underweight indicator (NCD_BMI_18C) from the WHO API.
url = "https://ghoapi.azureedge.net/api/NCD_BMI_18C"
underweight_adults = convert_dataframe(url=url)

# Preview the first two records.
underweight_adults.head(n=2)

Unnamed: 0,Id,IndicatorCode,SpatialDimType,SpatialDim,ParentLocationCode,TimeDimType,ParentLocation,Dim1Type,TimeDim,Dim1,...,DataSourceDim,Value,NumericValue,Low,High,Comments,Date,TimeDimensionValue,TimeDimensionBegin,TimeDimensionEnd
0,5540762,NCD_BMI_18C,COUNTRY,PER,AMR,YEAR,Americas,SEX,2011,SEX_FMLE,...,,1.6 [1.4-1.8],1.63089,1.43569,1.8362,,2024-02-29T16:06:41.017+01:00,2011,2011-01-01T00:00:00+01:00,2011-12-31T00:00:00+01:00
1,5541121,NCD_BMI_18C,COUNTRY,UGA,AFR,YEAR,Africa,SEX,2020,SEX_BTSX,...,,11.3 [9.7-12.9],11.27629,9.70523,12.91453,,2024-02-29T16:06:41.017+01:00,2020,2020-01-01T00:00:00+01:00,2020-12-31T00:00:00+01:00


#### <span style="color:#4272FF;">**4. Underweight data for children**</span>

In [14]:
# Child underweight indicator (NCD_BMI_MINUS2C) from the WHO API.
url = "https://ghoapi.azureedge.net/api/NCD_BMI_MINUS2C"
underweight_children = convert_dataframe(url=url)

# Preview the first two records.
underweight_children.head(n=2)

Unnamed: 0,Id,IndicatorCode,SpatialDimType,SpatialDim,ParentLocationCode,TimeDimType,ParentLocation,Dim1Type,Dim1,TimeDim,...,DataSourceDim,Value,NumericValue,Low,High,Comments,Date,TimeDimensionValue,TimeDimensionBegin,TimeDimensionEnd
0,5540609,NCD_BMI_MINUS2C,COUNTRY,IRL,EUR,YEAR,Europe,SEX,SEX_FMLE,1998,...,,0.8 [0.4-1.4],0.83126,0.41813,1.43478,,2024-02-29T16:06:41.017+01:00,1998,1998-01-01T00:00:00+01:00,1998-12-31T00:00:00+01:00
1,5540614,NCD_BMI_MINUS2C,COUNTRY,RUS,EUR,YEAR,Europe,SEX,SEX_FMLE,2010,...,,3.4 [2.6-4.3],3.40803,2.61038,4.28252,,2024-02-29T16:06:41.017+01:00,2010,2010-01-01T00:00:00+01:00,2010-12-31T00:00:00+01:00


### <span style="color:#2E6F40;">**🔄 Preprocessing Steps**</span>

#### <span style="color:#4272FF;">**2. Add a new column <u>*age_group*</u> to distinguish adults and children**</span>

In [17]:
# Tag every raw frame with the population it describes, so the rows remain
# distinguishable once the adult and child tables are stacked together.
for frame in (obesity_adults, underweight_adults):
    frame['age_group'] = 'adult'
for frame in (obesity_children, underweight_children):
    frame['age_group'] = 'children'

In [18]:
# Spot-check one frame: the column index should now end with 'age_group',
# and the two preview rows should carry the 'adult' label.
print(obesity_adults.columns)
obesity_adults.head(2)

Index(['Id', 'IndicatorCode', 'SpatialDimType', 'SpatialDim', 'TimeDimType',
       'ParentLocationCode', 'ParentLocation', 'Dim1Type', 'TimeDim', 'Dim1',
       'Dim2Type', 'Dim2', 'Dim3Type', 'Dim3', 'DataSourceDimType',
       'DataSourceDim', 'Value', 'NumericValue', 'Low', 'High', 'Comments',
       'Date', 'TimeDimensionValue', 'TimeDimensionBegin', 'TimeDimensionEnd',
       'age_group'],
      dtype='object')


Unnamed: 0,Id,IndicatorCode,SpatialDimType,SpatialDim,TimeDimType,ParentLocationCode,ParentLocation,Dim1Type,TimeDim,Dim1,...,Value,NumericValue,Low,High,Comments,Date,TimeDimensionValue,TimeDimensionBegin,TimeDimensionEnd,age_group
0,5540695,NCD_BMI_30C,COUNTRY,MDA,YEAR,EUR,Europe,SEX,2018,SEX_BTSX,...,25.9 [23.9-28.0],25.94346,23.92528,28.04794,,2024-02-29T16:06:41.017+01:00,2018,2018-01-01T00:00:00+01:00,2018-12-31T00:00:00+01:00,adult
1,5541156,NCD_BMI_30C,COUNTRY,IRL,YEAR,EUR,Europe,SEX,1995,SEX_MLE,...,16.3 [13.6-19.2],16.26016,13.62936,19.19998,,2024-02-29T16:06:41.017+01:00,1995,1995-01-01T00:00:00+01:00,1995-12-31T00:00:00+01:00,adult


#### <span style="color:#4272FF;">**3. Combine the two obesity datasets into one dataframe called <u>*df_obesity*</u>**</span>

In [20]:
# (rows, columns) of each obesity table before they are combined, one per line.
print(obesity_adults.shape, obesity_children.shape, sep='\n')

(20790, 26)
(62370, 26)


In [21]:
# Stack the adult and child obesity tables row-wise; ignore_index=True gives
# the combined frame a fresh 0..n-1 index instead of repeating the originals.
df_obesity = pd.concat(
    [obesity_adults, obesity_children],
    sort=False,
    ignore_index=True,
)

In [22]:
# (rows, columns) of the combined obesity frame.
df_obesity.shape

(83160, 26)

In [23]:
# Number of rows per age group in the combined obesity frame.
df_obesity['age_group'].value_counts()

age_group
children    62370
adult       20790
Name: count, dtype: int64

#### <span style="color:#4272FF;">**4. Combine the two malnutrition datasets into one dataframe called <u>*df_malnutrition*</u>**</span>

In [25]:
# (rows, columns) of each underweight table before they are combined, one per line.
print(underweight_adults.shape, underweight_children.shape, sep='\n')

(20790, 26)
(62370, 26)


In [26]:
# Stack the adult and child underweight tables row-wise; ignore_index=True
# gives the combined frame a fresh 0..n-1 index instead of repeating the originals.
df_malnutrition = pd.concat(
    [underweight_adults, underweight_children],
    sort=False,
    ignore_index=True,
)

In [27]:
# (rows, columns) of the combined malnutrition frame.
df_malnutrition.shape

(83160, 26)

In [28]:
# Number of rows per age group in the combined malnutrition frame.
df_malnutrition['age_group'].value_counts()

age_group
children    62370
adult       20790
Name: count, dtype: int64

#### <span style="color:#4272FF;">**5. Filter each dataset to include only records from the years <u>*2012 to 2022*</u>**</span>

In [30]:
# Keep only the 2012-2022 records (Series.between includes both endpoints).
# The explicit .copy() detaches the result from the concatenated frame, so
# adding or editing columns on it later cannot raise SettingWithCopyWarning.
df_obesity = df_obesity[df_obesity['TimeDim'].between(2012, 2022)].copy()

In [31]:
# Size of the year-filtered obesity frame, then its row count per age group.
print(df_obesity.shape)
df_obesity['age_group'].value_counts()

(27720, 26)


age_group
children    20790
adult        6930
Name: count, dtype: int64

In [32]:
# Keep only the 2012-2022 records (Series.between includes both endpoints).
# The explicit .copy() detaches the result from the concatenated frame, so
# adding or editing columns on it later cannot raise SettingWithCopyWarning.
df_malnutrition = df_malnutrition[df_malnutrition['TimeDim'].between(2012, 2022)].copy()

In [33]:
# Size of the year-filtered malnutrition frame, then its row count per age group.
print(df_malnutrition.shape)
df_malnutrition['age_group'].value_counts()

(27720, 26)


age_group
children    20790
adult        6930
Name: count, dtype: int64