In [1]:
import pandas as pd
import numpy as np
import os

# Locations of the raw input and of the cleaned output (relative paths).
file_in = "../data/raw/health/share-of-adults-defined-as-obese.csv"
file_out = "../data/interim/obesity_prevalence_clean.csv"

# Read the raw CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=file_in)
df.head(5)

Unnamed: 0,Entity,Code,Year,"Prevalence of obesity among adults, BMI >= 30 (crude estimate) (%) - Sex: both sexes - Age group: 18+ years of age"
0,Afghanistan,AFG,1990,1.6808
1,Afghanistan,AFG,1991,1.81298
2,Afghanistan,AFG,1992,1.95725
3,Afghanistan,AFG,1993,2.11433
4,Afghanistan,AFG,1994,2.28502


In [2]:
# Replace the verbose source headers with short names. The assignment is
# positional, so it relies on the frame having exactly these four columns
# in this order.
df.columns = ['country', 'iso_code', 'year', 'obesity_rate']

# Enforce dtypes: integer years; numeric rates, where any value that cannot
# be parsed as a number becomes NaN (errors='coerce').
df = df.astype({'year': int})
df['obesity_rate'] = pd.to_numeric(df['obesity_rate'], errors='coerce')

# Show column dtypes / non-null counts, then preview the renamed frame.
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6798 entries, 0 to 6797
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   country       6798 non-null   object 
 1   iso_code      6600 non-null   object 
 2   year          6798 non-null   int64  
 3   obesity_rate  6798 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 212.6+ KB


Unnamed: 0,country,iso_code,year,obesity_rate
0,Afghanistan,AFG,1990,1.6808
1,Afghanistan,AFG,1991,1.81298
2,Afghanistan,AFG,1992,1.95725
3,Afghanistan,AFG,1993,2.11433
4,Afghanistan,AFG,1994,2.28502


In [3]:
# Share of missing values per column, expressed in percent.
missing_pct = (df.isna().mean() * 100).round(2)
print("Missing values (%):")
print(missing_pct)

# Descriptive statistics for the obesity rate.
# (The emoji was previously stored as mis-decoded bytes and printed as mojibake.)
print("\n📊Summary statistics:")
print(df['obesity_rate'].describe())

# The rate is a percentage, so anything outside [0, 100] is invalid.
# NaN rates are NOT flagged here; they are covered by the missing-value check.
below_zero = df['obesity_rate'] < 0
above_hundred = df['obesity_rate'] > 100
outliers = df[below_zero | above_hundred]
print(f"\nUnusual entries found: {outliers.shape[0]}")
outliers.head()


Missing values (%):
country         0.00
iso_code        2.91
year            0.00
obesity_rate    0.00
dtype: float64

📊Summary statistics:
count    6798.000000
mean       17.241858
std        13.324434
min         0.195100
25%         6.883873
50%        15.395280
75%        23.532728
max        75.559590
Name: obesity_rate, dtype: float64

Unusual entries found: 0


Unnamed: 0,country,iso_code,year,obesity_rate


In [4]:
# Keep only the rows that carry an ISO code and renumber the index.
# NOTE(review): the rows without a code are presumably regional/income
# aggregates (per the original comment) — confirm against the raw data.
has_iso_code = df['iso_code'].notna()
df = df[has_iso_code].reset_index(drop=True)

# Sanity check: no missing codes should remain; report the new shape.
remaining_missing = df['iso_code'].isna().sum()
print("Remaining missing iso_code entries:", remaining_missing)
print("Dataset shape after cleanup:", df.shape)


Remaining missing iso_code entries: 0
Dataset shape after cleanup: (6600, 4)


In [5]:
# Round the obesity rate to 2 decimal places. This discards precision in the
# saved file; the raw CSV keeps the full values.
df['obesity_rate'] = df['obesity_rate'].round(2)

# Order rows by country code, then chronologically, with a fresh 0..n-1 index.
df = df.sort_values(['iso_code', 'year']).reset_index(drop=True)

# Save the cleaned file to the location configured in `file_out` (defined in
# the first cell) instead of repeating the path literal, so the directory
# creation, the write, and the message below always agree with the config.
os.makedirs(os.path.dirname(file_out), exist_ok=True)
df.to_csv(file_out, index=False)

print(f"Cleaned file saved as '{file_out}'")


Cleaned file saved as '../data/interim/obesity_prevalence_clean.csv'
