### <span style="color:#2E6F40;">**Importing the Dependencies**</span>

In [2]:
import requests
import pandas as pd
import pycountry

### <span style="color:#2E6F40;">**Step 1: 📥 Dataset Overview & Collection**</span>
We are going to collect data from **4 public WHO API endpoints**:

1. **Obesity data for adults**  
   ➤ URL: [`https://ghoapi.azureedge.net/api/NCD_BMI_30C`](https://ghoapi.azureedge.net/api/NCD_BMI_30C)

2. **Obesity data for children**  
   ➤ URL: [`https://ghoapi.azureedge.net/api/NCD_BMI_PLUS2C`](https://ghoapi.azureedge.net/api/NCD_BMI_PLUS2C)

3. **Underweight data for adults**  
   ➤ URL: [`https://ghoapi.azureedge.net/api/NCD_BMI_18C`](https://ghoapi.azureedge.net/api/NCD_BMI_18C)

4. **Underweight data for children**  
   ➤ URL: [`https://ghoapi.azureedge.net/api/NCD_BMI_MINUS2C`](https://ghoapi.azureedge.net/api/NCD_BMI_MINUS2C)

### <span style="color:#2E6F40;">**Define function for converting API data into dataframe**</span>

In [5]:
def convert_dataframe(url):
    result = requests.get(url)
    data = result.json()
    df = pd.DataFrame(data['value'])
    return df

### <span style="color:#2E6F40;">**1. Load all 4 datasets into dataframe**</span>

#### <span style="color:#4272FF;">**1. Obesity data for adults**</span>

In [8]:
url = "https://ghoapi.azureedge.net/api/NCD_BMI_30C"
obesity_adults = convert_dataframe(url)
obesity_adults.head(2)

Unnamed: 0,Id,IndicatorCode,SpatialDimType,SpatialDim,TimeDimType,ParentLocationCode,ParentLocation,Dim1Type,TimeDim,Dim1,...,DataSourceDim,Value,NumericValue,Low,High,Comments,Date,TimeDimensionValue,TimeDimensionBegin,TimeDimensionEnd
0,5540695,NCD_BMI_30C,COUNTRY,MDA,YEAR,EUR,Europe,SEX,2018,SEX_BTSX,...,,25.9 [23.9-28.0],25.94346,23.92528,28.04794,,2024-02-29T16:06:41.017+01:00,2018,2018-01-01T00:00:00+01:00,2018-12-31T00:00:00+01:00
1,5541156,NCD_BMI_30C,COUNTRY,IRL,YEAR,EUR,Europe,SEX,1995,SEX_MLE,...,,16.3 [13.6-19.2],16.26016,13.62936,19.19998,,2024-02-29T16:06:41.017+01:00,1995,1995-01-01T00:00:00+01:00,1995-12-31T00:00:00+01:00


#### <span style="color:#4272FF;">**2. Obesity data for children**</span>

In [10]:
url = "https://ghoapi.azureedge.net/api/NCD_BMI_PLUS2C"
obesity_children = convert_dataframe(url)
obesity_children.head(2)

Unnamed: 0,Id,IndicatorCode,SpatialDimType,SpatialDim,TimeDimType,ParentLocationCode,ParentLocation,Dim1Type,TimeDim,Dim1,...,DataSourceDim,Value,NumericValue,Low,High,Comments,Date,TimeDimensionValue,TimeDimensionBegin,TimeDimensionEnd
0,5540919,NCD_BMI_PLUS2C,COUNTRY,CUB,YEAR,AMR,Americas,SEX,2021,SEX_MLE,...,,12.5 [8.5-17.2],12.52724,8.46905,17.19727,,2024-02-29T16:06:41.017+01:00,2021,2021-01-01T00:00:00+01:00,2021-12-31T00:00:00+01:00
1,5540986,NCD_BMI_PLUS2C,COUNTRY,PNG,YEAR,WPR,Western Pacific,SEX,2000,SEX_MLE,...,,8.0 [1.7-19.8],8.01665,1.68665,19.76374,,2024-02-29T16:06:41.017+01:00,2000,2000-01-01T00:00:00+01:00,2000-12-31T00:00:00+01:00


#### <span style="color:#4272FF;">**3. Underweight data for adults**</span>

In [12]:
url = "https://ghoapi.azureedge.net/api/NCD_BMI_18C"
underweight_adults = convert_dataframe(url)
underweight_adults.head(2)

Unnamed: 0,Id,IndicatorCode,SpatialDimType,SpatialDim,ParentLocationCode,TimeDimType,ParentLocation,Dim1Type,TimeDim,Dim1,...,DataSourceDim,Value,NumericValue,Low,High,Comments,Date,TimeDimensionValue,TimeDimensionBegin,TimeDimensionEnd
0,5540762,NCD_BMI_18C,COUNTRY,PER,AMR,YEAR,Americas,SEX,2011,SEX_FMLE,...,,1.6 [1.4-1.8],1.63089,1.43569,1.8362,,2024-02-29T16:06:41.017+01:00,2011,2011-01-01T00:00:00+01:00,2011-12-31T00:00:00+01:00
1,5541121,NCD_BMI_18C,COUNTRY,UGA,AFR,YEAR,Africa,SEX,2020,SEX_BTSX,...,,11.3 [9.7-12.9],11.27629,9.70523,12.91453,,2024-02-29T16:06:41.017+01:00,2020,2020-01-01T00:00:00+01:00,2020-12-31T00:00:00+01:00


#### <span style="color:#4272FF;">**4. Underweight data for children**</span>

In [14]:
url = "https://ghoapi.azureedge.net/api/NCD_BMI_MINUS2C"
underweight_children = convert_dataframe(url)
underweight_children.head(2)

Unnamed: 0,Id,IndicatorCode,SpatialDimType,SpatialDim,ParentLocationCode,TimeDimType,ParentLocation,Dim1Type,Dim1,TimeDim,...,DataSourceDim,Value,NumericValue,Low,High,Comments,Date,TimeDimensionValue,TimeDimensionBegin,TimeDimensionEnd
0,5540609,NCD_BMI_MINUS2C,COUNTRY,IRL,EUR,YEAR,Europe,SEX,SEX_FMLE,1998,...,,0.8 [0.4-1.4],0.83126,0.41813,1.43478,,2024-02-29T16:06:41.017+01:00,1998,1998-01-01T00:00:00+01:00,1998-12-31T00:00:00+01:00
1,5540614,NCD_BMI_MINUS2C,COUNTRY,RUS,EUR,YEAR,Europe,SEX,SEX_FMLE,2010,...,,3.4 [2.6-4.3],3.40803,2.61038,4.28252,,2024-02-29T16:06:41.017+01:00,2010,2010-01-01T00:00:00+01:00,2010-12-31T00:00:00+01:00


### <span style="color:#2E6F40;">**🔄 Preprocessing Steps**</span>

#### <span style="color:#4272FF;">**2. Add a new column <u>*age_group*</u> to distinguish adults and children**</span>

In [17]:
obesity_adults['age_group'] = 'adult'
obesity_children['age_group'] = 'children'
underweight_adults['age_group'] = 'adult'
underweight_children['age_group'] = 'children'

In [18]:
# verify one df
print(obesity_adults.columns)
obesity_adults.head(2)

Index(['Id', 'IndicatorCode', 'SpatialDimType', 'SpatialDim', 'TimeDimType',
       'ParentLocationCode', 'ParentLocation', 'Dim1Type', 'TimeDim', 'Dim1',
       'Dim2Type', 'Dim2', 'Dim3Type', 'Dim3', 'DataSourceDimType',
       'DataSourceDim', 'Value', 'NumericValue', 'Low', 'High', 'Comments',
       'Date', 'TimeDimensionValue', 'TimeDimensionBegin', 'TimeDimensionEnd',
       'age_group'],
      dtype='object')


Unnamed: 0,Id,IndicatorCode,SpatialDimType,SpatialDim,TimeDimType,ParentLocationCode,ParentLocation,Dim1Type,TimeDim,Dim1,...,Value,NumericValue,Low,High,Comments,Date,TimeDimensionValue,TimeDimensionBegin,TimeDimensionEnd,age_group
0,5540695,NCD_BMI_30C,COUNTRY,MDA,YEAR,EUR,Europe,SEX,2018,SEX_BTSX,...,25.9 [23.9-28.0],25.94346,23.92528,28.04794,,2024-02-29T16:06:41.017+01:00,2018,2018-01-01T00:00:00+01:00,2018-12-31T00:00:00+01:00,adult
1,5541156,NCD_BMI_30C,COUNTRY,IRL,YEAR,EUR,Europe,SEX,1995,SEX_MLE,...,16.3 [13.6-19.2],16.26016,13.62936,19.19998,,2024-02-29T16:06:41.017+01:00,1995,1995-01-01T00:00:00+01:00,1995-12-31T00:00:00+01:00,adult


#### <span style="color:#4272FF;">**3. Combine the two obesity datasets into one dataframe called <u>*df_obesity*</u>**</span>

In [20]:
print(obesity_adults.shape)
print(obesity_children.shape)

(20790, 26)
(62370, 26)


In [21]:
# concat two dataframe
df_obesity = pd.concat([obesity_adults, obesity_children], ignore_index=True, sort=False)

In [22]:
df_obesity.shape

(83160, 26)

In [23]:
df_obesity['age_group'].value_counts()

age_group
children    62370
adult       20790
Name: count, dtype: int64

#### <span style="color:#4272FF;">**4. Combine the two malnutrition datasets into one dataframe called <u>*df_malnutrition*</u>**</span>

In [25]:
print(underweight_adults.shape)
print(underweight_children.shape)

(20790, 26)
(62370, 26)


In [26]:
# concat two dataframe
df_malnutrition = pd.concat([underweight_adults, underweight_children], ignore_index=True, sort=False)

In [27]:
df_malnutrition.shape

(83160, 26)

In [28]:
df_malnutrition['age_group'].value_counts()

age_group
children    62370
adult       20790
Name: count, dtype: int64

#### <span style="color:#4272FF;">**5. Filter each dataset to include only records from the years <u>*2012 to 2022*</u>**</span>

In [30]:
df_obesity = df_obesity[(df_obesity['TimeDim'] >= 2012) & (df_obesity['TimeDim'] <= 2022)]

In [31]:
print(df_obesity.shape)
df_obesity['age_group'].value_counts()

(27720, 26)


age_group
children    20790
adult        6930
Name: count, dtype: int64

In [32]:
df_malnutrition = df_malnutrition[(df_malnutrition['TimeDim'] >= 2012) & (df_malnutrition['TimeDim'] <= 2022)]

In [33]:
print(df_malnutrition.shape)
df_malnutrition['age_group'].value_counts()

(27720, 26)


age_group
children    20790
adult        6930
Name: count, dtype: int64

### <span style="color:#2E6F40;">**Step 2: 🧹 Data Cleaning & Feature Engineering**</span>

#### <span style="color:#4272FF;">**Keep necessary columns**</span>

In [36]:
df_obesity.columns

Index(['Id', 'IndicatorCode', 'SpatialDimType', 'SpatialDim', 'TimeDimType',
       'ParentLocationCode', 'ParentLocation', 'Dim1Type', 'TimeDim', 'Dim1',
       'Dim2Type', 'Dim2', 'Dim3Type', 'Dim3', 'DataSourceDimType',
       'DataSourceDim', 'Value', 'NumericValue', 'Low', 'High', 'Comments',
       'Date', 'TimeDimensionValue', 'TimeDimensionBegin', 'TimeDimensionEnd',
       'age_group'],
      dtype='object')

In [37]:
df_obesity = df_obesity[['ParentLocation', 'Dim1', 'TimeDim', 'Low', 'High', 'NumericValue', 'SpatialDim', 'age_group']]
df_obesity.sample(2)

Unnamed: 0,ParentLocation,Dim1,TimeDim,Low,High,NumericValue,SpatialDim,age_group
31868,Americas,SEX_MLE,2019,8.09706,30.3573,17.94467,VCT,children
37026,Eastern Mediterranean,SEX_BTSX,2021,11.35617,28.95621,19.75668,LBN,children


In [38]:
df_malnutrition = df_malnutrition[['ParentLocation', 'Dim1', 'TimeDim', 'Low', 'High', 'NumericValue', 'SpatialDim', 'age_group']]
df_obesity.sample(2)

Unnamed: 0,ParentLocation,Dim1,TimeDim,Low,High,NumericValue,SpatialDim,age_group
73802,Americas,SEX_FMLE,2021,4.85094,29.65692,15.27294,GUY,children
76394,Africa,SEX_BTSX,2018,1.66114,10.44156,4.91432,ZMB,children


#### <span style="color:#4272FF;">**Rename Columns**</span>

In [40]:
new_columns = {'TimeDim': 'Year', 'Dim1': 'Gender', 'NumericValue': 'Mean_Estimate',
               'Low': 'LowerBound', 'High': 'UpperBound', 'ParentLocation': 'Region',
              'SpatialDim': 'Country'}
df_obesity = df_obesity.rename(columns=new_columns)
df_obesity.sample(2)

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group
38566,Africa,SEX_FMLE,2014,1.67206,4.04519,2.69333,CIV,children
75399,Africa,SEX_MLE,2014,0.38897,12.84693,4.02581,GNB,children


In [41]:
df_malnutrition = df_malnutrition.rename(columns=new_columns)
df_malnutrition.sample(2)

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group
80709,South-East Asia,SEX_FMLE,2020,5.72284,12.06242,8.61816,NPL,children
16494,Americas,SEX_MLE,2020,1.88726,3.99578,2.83758,COL,adult


#### <span style="color:#4272FF;">**Convert Country Codes to Full Names using <u>*pycountry*</u>**</span>

In [43]:
# Define a function to convert the 3-letter codes to full names
def country_conversion(code):
    special_cases = {
                    'GLOBAL': 'Global',
                    'WB_LMI': 'Low & Middle Income',
                    'WB_HI': 'High Income',
                    'WB_LI': 'Low Income',
                    'EMR': 'Eastern Mediterranean Region',
                    'EUR': 'Europe',
                    'AFR': 'Africa',
                    'SEAR': 'South-East Asia Region',
                    'WPR': 'Western Pacific Region',
                    'AMR': 'Americas Region',
                    'WB_UMI': 'Upper Middle Income'}
    if code in special_cases:
        return special_cases[code]
    else:
        country_details = pycountry.countries.get(alpha_3 = code)
        country_name = country_details.name
        return country_name

In [44]:
df_obesity['Country'] = df_obesity['Country'].apply(country_conversion)

In [45]:
df_obesity.sample(2)

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group
9776,Eastern Mediterranean,SEX_MLE,2021,35.23538,47.65034,41.36971,Kuwait,adult
37198,Africa,SEX_BTSX,2017,2.14679,5.04509,3.39606,Liberia,children


In [46]:
df_malnutrition['Country'] = df_malnutrition['Country'].apply(country_conversion)

In [47]:
df_malnutrition.sample(2)

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group
40329,Europe,SEX_MLE,2015,0.55346,7.17548,2.67692,Andorra,children
56389,Americas,SEX_FMLE,2019,2.7303,8.91292,5.36263,Dominican Republic,children


#### <span style="color:#4272FF;">**New Columns Creation**</span>

- **CI_Width column creation**

In [50]:
df_obesity['CI_Width'] = df_obesity['UpperBound'] - df_obesity['LowerBound']
df_malnutrition['CI_Width'] = df_malnutrition['UpperBound'] - df_malnutrition['LowerBound']

- **obesity_level column creation - (for the obesity table only)**

In [52]:
# Define function to categorize the obesity_level
def obesity_level(value):
    if value >= 30:
        return 'High'
    elif value < 25:
        return 'Low'
    else:
        return 'Moderate'

In [53]:
df_obesity['Obesity_level'] = df_obesity['Mean_Estimate'].apply(obesity_level)
df_obesity.head()

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group,CI_Width,Obesity_level
0,Europe,SEX_BTSX,2018,23.92528,28.04794,25.94346,"Moldova, Republic of",adult,4.12266,Moderate
5,Eastern Mediterranean,SEX_MLE,2020,15.79378,19.24573,17.48764,"Iran, Islamic Republic of",adult,3.45195,Low
12,Eastern Mediterranean,SEX_MLE,2020,23.08959,33.81534,28.33553,Iraq,adult,10.72575,Moderate
18,Europe,SEX_FMLE,2014,24.92477,31.21326,27.9487,Greenland,adult,6.28849,Moderate
20,Europe,SEX_FMLE,2019,28.16068,35.57493,31.89322,Armenia,adult,7.41425,High


- **Malnutrition_Level column creation - (for the malnutrition table only)**

In [55]:
# Define function to categorize the Malnutrition_Level
def malnutrition_level(value):
    if value >= 20:
        return 'High'
    elif value < 10:
        return 'Low'
    else:
        return 'Moderate'

In [56]:
df_malnutrition['Malnutrition_Level'] = df_malnutrition['Mean_Estimate'].apply(malnutrition_level)
df_malnutrition.head()

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group,CI_Width,Malnutrition_Level
1,Africa,SEX_BTSX,2020,9.70523,12.91453,11.27629,Uganda,adult,3.2093,Moderate
3,Africa,SEX_FMLE,2015,8.24333,12.07013,10.0801,Mali,adult,3.8268,Moderate
4,Africa,SEX_FMLE,2021,3.97204,8.07702,5.79511,Gabon,adult,4.10498,Low
8,Western Pacific,SEX_BTSX,2017,2.42823,3.84215,3.09065,Mongolia,adult,1.41392,Low
9,Africa,SEX_FMLE,2016,6.65567,10.25312,8.37172,Sierra Leone,adult,3.59745,Low


#### <span style="color:#4272FF;">**Feature Engineering For Gender Column**</span>

In [58]:
df_obesity.Gender.value_counts()

Gender
SEX_BTSX    9240
SEX_MLE     9240
SEX_FMLE    9240
Name: count, dtype: int64

In [59]:
df_obesity['Gender'] = df_obesity['Gender'].apply(lambda x: 'Male' if x == 'SEX_MLE' else
                                                  'Female' if x == 'SEX_FMLE' else 'Both')

In [60]:
df_obesity.Gender.value_counts()

Gender
Both      9240
Male      9240
Female    9240
Name: count, dtype: int64

In [61]:
df_obesity.head(3)

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group,CI_Width,Obesity_level
0,Europe,Both,2018,23.92528,28.04794,25.94346,"Moldova, Republic of",adult,4.12266,Moderate
5,Eastern Mediterranean,Male,2020,15.79378,19.24573,17.48764,"Iran, Islamic Republic of",adult,3.45195,Low
12,Eastern Mediterranean,Male,2020,23.08959,33.81534,28.33553,Iraq,adult,10.72575,Moderate


In [62]:
df_malnutrition['Gender'] = df_malnutrition['Gender'].apply(lambda x: 'Male' if x == 'SEX_MLE' else
                                                            'Female' if x == 'SEX_FMLE' else 'Both')

In [63]:
df_malnutrition.Gender.value_counts()

Gender
Both      9240
Female    9240
Male      9240
Name: count, dtype: int64

In [64]:
df_malnutrition.head(3)

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group,CI_Width,Malnutrition_Level
1,Africa,Both,2020,9.70523,12.91453,11.27629,Uganda,adult,3.2093,Moderate
3,Africa,Female,2015,8.24333,12.07013,10.0801,Mali,adult,3.8268,Moderate
4,Africa,Female,2021,3.97204,8.07702,5.79511,Gabon,adult,4.10498,Low


### <span style="color:#2E6F40;">**Step:3 🧮 Exploratory Data Analysis (EDA)**</span>

#### <span style="color:#4272FF;">**Check the shape of the DataFrame**</span>

In [112]:
print(df_obesity.shape)
print(df_malnutrition.shape)

(27720, 10)
(27720, 10)


#### <span style="color:#4272FF;">**Preview the data**</span>
1. **Head**
2. **Tail**
3. **Sample**

- #### df_obesity

In [128]:
df_obesity.head() # First 5 or (n) rows

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group,CI_Width,Obesity_level
0,Europe,Both,2018,23.92528,28.04794,25.94346,"Moldova, Republic of",adult,4.12266,Moderate
5,Eastern Mediterranean,Male,2020,15.79378,19.24573,17.48764,"Iran, Islamic Republic of",adult,3.45195,Low
12,Eastern Mediterranean,Male,2020,23.08959,33.81534,28.33553,Iraq,adult,10.72575,Moderate
18,Europe,Female,2014,24.92477,31.21326,27.9487,Greenland,adult,6.28849,Moderate
20,Europe,Female,2019,28.16068,35.57493,31.89322,Armenia,adult,7.41425,High


In [130]:
df_obesity.tail() # Last 5 or (n) rows

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group,CI_Width,Obesity_level
83137,Europe,Both,2020,0.96418,2.19743,1.47593,Tajikistan,children,1.23325,Low
83140,Eastern Mediterranean,Female,2012,0.30204,5.87241,1.96619,Somalia,children,5.57037,Low
83148,Western Pacific,Male,2021,3.3259,34.87782,16.63489,Vanuatu,children,31.55192,Low
83151,Africa,Male,2014,0.23648,6.34118,2.03219,Madagascar,children,6.1047,Low
83154,Europe,Male,2017,10.08785,12.76015,11.39063,Finland,children,2.6723,Low


In [134]:
df_obesity.sample(3) # Random (n) rows

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group,CI_Width,Obesity_level
76402,Western Pacific,Female,2012,4.57878,8.62847,6.40609,Fiji,children,4.04969,Low
38967,Africa,Male,2012,0.28401,10.51142,3.10012,Madagascar,children,10.22741,Low
15455,Africa,Male,2017,11.09321,13.94337,12.47608,South Africa,adult,2.85016,Low


- #### df_malnutrition

In [138]:
df_malnutrition.head() # First 5 or (n) rows

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group,CI_Width,Malnutrition_Level
1,Africa,Both,2020,9.70523,12.91453,11.27629,Uganda,adult,3.2093,Moderate
3,Africa,Female,2015,8.24333,12.07013,10.0801,Mali,adult,3.8268,Moderate
4,Africa,Female,2021,3.97204,8.07702,5.79511,Gabon,adult,4.10498,Low
8,Western Pacific,Both,2017,2.42823,3.84215,3.09065,Mongolia,adult,1.41392,Low
9,Africa,Female,2016,6.65567,10.25312,8.37172,Sierra Leone,adult,3.59745,Low


In [140]:
df_malnutrition.tail() # Last 5 or (n) rows

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group,CI_Width,Malnutrition_Level
83147,Americas,Female,2013,2.7955,5.71434,4.14069,Haiti,children,2.91884,Low
83149,Europe,Both,2018,5.42817,10.2184,7.57499,Tajikistan,children,4.79023,Low
83150,Western Pacific,Both,2016,0.62172,3.50706,1.70365,Papua New Guinea,children,2.88534,Low
83156,Western Pacific,Female,2020,0.06415,2.81018,0.73407,Samoa,children,2.74603,Low
83159,Europe,Male,2018,0.46517,7.30748,2.54279,Iceland,children,6.84231,Low


In [142]:
df_malnutrition.sample(3) # Random (n) rows

Unnamed: 0,Region,Gender,Year,LowerBound,UpperBound,Mean_Estimate,Country,age_group,CI_Width,Malnutrition_Level
6505,Americas,Both,2019,1.81775,3.32481,2.50573,Panama,adult,1.50706,Low
26365,Europe,Both,2021,1.144,2.06851,1.56495,Estonia,children,0.92451,Low
64922,Africa,Both,2013,14.58149,21.25852,17.87691,Senegal,children,6.67703,Moderate


#### <span style="color:#4272FF;">**Checking the basic information about the dataframe**</span>

In [145]:
df_obesity.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27720 entries, 0 to 83154
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Region         26268 non-null  object 
 1   Gender         27720 non-null  object 
 2   Year           27720 non-null  int64  
 3   LowerBound     27720 non-null  float64
 4   UpperBound     27720 non-null  float64
 5   Mean_Estimate  27720 non-null  float64
 6   Country        27720 non-null  object 
 7   age_group      27720 non-null  object 
 8   CI_Width       27720 non-null  float64
 9   Obesity_level  27720 non-null  object 
dtypes: float64(4), int64(1), object(5)
memory usage: 2.3+ MB


In [147]:
df_malnutrition.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27720 entries, 1 to 83159
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Region              26268 non-null  object 
 1   Gender              27720 non-null  object 
 2   Year                27720 non-null  int64  
 3   LowerBound          27720 non-null  float64
 4   UpperBound          27720 non-null  float64
 5   Mean_Estimate       27720 non-null  float64
 6   Country             27720 non-null  object 
 7   age_group           27720 non-null  object 
 8   CI_Width            27720 non-null  float64
 9   Malnutrition_Level  27720 non-null  object 
dtypes: float64(4), int64(1), object(5)
memory usage: 2.3+ MB


#### <span style="color:#4272FF;">**Basic statistics measures for the numeric columns**</span>

In [150]:
df_obesity.describe()

Unnamed: 0,Year,LowerBound,UpperBound,Mean_Estimate,CI_Width
count,27720.0,27720.0,27720.0,27720.0,27720.0
mean,2017.0,9.393155,17.068489,12.761041,7.675334
std,3.162335,9.893854,12.395013,10.858925,6.616899
min,2012.0,0.07754,0.54785,0.26567,0.2167
25%,2014.0,2.35443,7.875503,4.75396,2.926715
50%,2017.0,6.506535,13.70438,9.94454,5.480535
75%,2020.0,12.697755,23.98605,17.482418,10.33888
max,2022.0,74.36537,87.86851,80.60925,40.55406


In [152]:
df_malnutrition.describe()

Unnamed: 0,Year,LowerBound,UpperBound,Mean_Estimate,CI_Width
count,27720.0,27720.0,27720.0,27720.0,27720.0
mean,2017.0,3.397094,8.147889,5.354451,4.750795
std,3.162335,3.908061,6.443692,4.81122,4.257684
min,2012.0,0.0237,0.35483,0.17531,0.16668
25%,2014.0,1.053662,3.143305,2.039283,1.603222
50%,2017.0,1.907975,6.095555,3.59779,3.351095
75%,2020.0,4.186443,11.526302,7.335245,6.530875
max,2022.0,31.3654,43.68029,35.89411,26.70174


#### <span style="color:#4272FF;">**Check the Missing Values**</span>

In [155]:
df_obesity.isnull().sum()

Region           1452
Gender              0
Year                0
LowerBound          0
UpperBound          0
Mean_Estimate       0
Country             0
age_group           0
CI_Width            0
Obesity_level       0
dtype: int64

In [157]:
df_malnutrition.isnull().sum()

Region                1452
Gender                   0
Year                     0
LowerBound               0
UpperBound               0
Mean_Estimate            0
Country                  0
age_group                0
CI_Width                 0
Malnutrition_Level       0
dtype: int64

#### <span style="color:#4272FF;">**See unique values in categorical columns**</span>

In [162]:
df_obesity['Region'].value_counts()

Region
Europe                   6864
Africa                   6204
Americas                 4884
Western Pacific          3960
Eastern Mediterranean    2904
South-East Asia          1452
Name: count, dtype: int64

In [170]:
df_malnutrition['Region'].value_counts()

Region
Europe                   6864
Africa                   6204
Americas                 4884
Western Pacific          3960
Eastern Mediterranean    2904
South-East Asia          1452
Name: count, dtype: int64

#### <span style="color:#4272FF;">**Checking the duplicates**</span>

In [174]:
df_obesity.duplicated().sum()

0

In [176]:
df_malnutrition.duplicated().sum()

0

In [178]:
# concat to check the duplicate values
duplicate_check = pd.concat([df_obesity, df_malnutrition], ignore_index=True, sort=False)

In [180]:
duplicate_check.duplicated().sum()

0

#### <span style="color:#4272FF;">**Quick visualization**</span>

In [214]:
import pandas_profiling
df_obesity.profile_report()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|██████████| 10/10 [00:00<00:00, 183.23it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [216]:
df_malnutrition.profile_report()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|██████████| 10/10 [00:00<00:00, 174.42it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

