# Exploratory Data Analysis - Price of Healthy Diet

## 1. Load Libraries and Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from pathlib import Path

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Read the dataset; the path is resolved relative to the current working directory.
csv_path = "../data/raw/price_of_healthy_diet_clean.csv"
data = pd.read_csv(csv_path)

## 2. Helper Functions

#### 2.1 Image Export Helper

In [None]:
# ROOT is the parent of the current working directory; exported images are
# collected in ROOT/images, which is created on first run if it is missing.
ROOT = Path.cwd().parent
IMG_DIR = ROOT.joinpath("images")
IMG_DIR.mkdir(exist_ok=True)

def save_png(name, fig=None, dpi=300):
    """Write a figure to ``IMG_DIR/<name>.png`` and return the file path.

    Args:
        name: File name stem; ``.png`` is appended.
        fig: Figure to save. When omitted (or falsy), the current
            matplotlib figure from ``plt.gcf()`` is used instead.
        dpi: Resolution forwarded to ``savefig``.

    Returns:
        The path the image was written to.
    """
    target = fig or plt.gcf()
    destination = IMG_DIR / f"{name}.png"
    target.savefig(destination, dpi=dpi, bbox_inches="tight")
    return destination


## 3. EDA

#### 3.1 General Overview

In [None]:
# Preview the first rows of the loaded DataFrame.
data.head()

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
data.info()

In [None]:
# Per-column overview: dtype plus absolute and relative missing-value counts,
# ordered so the columns with the most missing values come first.
missing_mask = data.isnull()

null_summary = pd.DataFrame(
    {
        "Data Type": data.dtypes,
        "Missing Count": missing_mask.sum(),
        "Missing %": missing_mask.mean() * 100,
    }
).sort_values("Missing Count", ascending=False)

null_summary


In [None]:
# Split the column labels by dtype: 64-bit ints/floats vs. object/category.
numeric_cols = data.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = data.select_dtypes(include=["object", "category"]).columns

# Each heading is followed by its column index on the next line.
print("Numeric Columns:", numeric_cols, sep="\n")
print("\nCategorical Columns:", categorical_cols, sep="\n")


In [None]:
# Show descriptive statistics: numeric columns first, then categorical ones.
for column_group in (numeric_cols, categorical_cols):
    display(data[column_group].describe())


#### 3.2 Numerical Feature Analysis

In [None]:


# Plot a histogram (with KDE overlay) for every numeric column on a grid of
# subplots, save the figure as images/numerical_distributions.png and show it.
num_cols = len(numeric_cols)
cols = 2
# Ceiling division: the previous `n // cols + n % cols` only equals the
# required row count when cols == 2 and over-allocates rows otherwise.
rows = -(-num_cols // cols)

# squeeze=False always yields a 2-D array of Axes, so flatten() also works
# when the grid collapses to a single row, column or cell.
fig, axes = plt.subplots(rows, cols, figsize=(12, 5 * rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, numeric_cols):
    sns.histplot(data[col], kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")

# Remove the unused trailing subplots (no reliance on a leaked loop index).
for unused_ax in axes[num_cols:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
save_png("numerical_distributions", fig=fig)
plt.show()


#### 3.3 Categorical Feature Analysis

In [None]:
# Country gets its own figure: there are many categories, so the height is
# scaled with the number of countries and the tick labels are kept small.
counts = data["country"].value_counts()

# 0.28 inch per country, but never shorter than 6 inches.
fig_h = max(6, 0.28 * len(counts))
fig, ax = plt.subplots(figsize=(12, fig_h))

counts.plot.barh(ax=ax)
ax.set_title("Count of country")
ax.tick_params(axis="y", labelsize=8)

fig.tight_layout()
# Widen the left margin so long country names are not clipped.
fig.subplots_adjust(left=0.40)
save_png("count_of_country", fig=fig)
plt.show()

In [None]:
count_cols = ["region", "cost_category"] # excluded data_quality since there's only one category

# One horizontal bar chart of value counts per column, each saved as
# images/count_of_<column>.png.
for col in count_cols:
    counts = data[col].value_counts()

    # 0.35 inch per category, but never shorter than 5 inches.
    fig_h = max(5, 0.35 * len(counts))
    fig, ax = plt.subplots(figsize=(12, fig_h))

    counts.plot.barh(ax=ax)
    ax.set_title(f"Count of {col}")
    ax.tick_params(axis="y", labelsize=9)

    fig.tight_layout()
    # Extra left margin for the category labels.
    fig.subplots_adjust(left=0.35)

    save_png(f"count_of_{col}", fig=fig)
    plt.show()


In [None]:
countries = ["Mexico", "Canada", "Guatemala", "Brazil", "Argentina", "Chile", "Uganda"]

# Keep only the rows belonging to the hand-picked countries.
selected_rows = data[data["country"].isin(countries)]

# Line chart: one line per country, year on the x axis, cost on the y axis.
chart = (
    alt.Chart(selected_rows)
    .mark_line()
    .encode(
        x="year:O",
        y="cost_healthy_diet_ppp_usd:Q",
        color="country:N",
    )
    .properties(
        title="Cost of a Healthy Diet Over Time (Selected Countries)",
        width=600,
        height=400,
    )
)

line_chart_path = IMG_DIR / "cost_healthy_diet_over_time_selected_countries.png"
chart.save(str(line_chart_path))
chart

In [None]:
# Box plots of cost per year, with one facet column (and colour) per region.
chart2 = (
    alt.Chart(data)
    .mark_boxplot()
    .encode(
        x="year:O",
        y="cost_healthy_diet_ppp_usd:Q",
        color="region:N",
        column="region:N",
    )
    .properties(width=150, height=400)
)

faceted_boxplot_path = IMG_DIR / "cost_healthy_diet_boxplot_by_year_faceted_by_region.png"
chart2.save(str(faceted_boxplot_path))
chart2

In [None]:
# Density estimate of the cost, computed separately per region and drawn as
# horizontal area marks, one facet row per region.
region_density = alt.Chart(data).transform_density(
    "cost_healthy_diet_ppp_usd",
    as_=["cost", "density"],
    groupby=["region"],
)

chart3 = (
    region_density
    .mark_area(orient="horizontal")
    .encode(
        x="density:Q",
        y="cost:Q",
        color="region:N",
        row="region:N",
    )
    .properties(width=400, height=100)
)

density_path = IMG_DIR / "cost_healthy_diet_density_by_region.png"
chart3.save(str(density_path))
chart3

In [None]:
# Single-panel box plot comparing the cost distribution across regions.
chart4 = (
    alt.Chart(data)
    .mark_boxplot()
    .encode(
        x="region:N",
        y="cost_healthy_diet_ppp_usd:Q",
        color="region:N",
    )
    .properties(
        title="Cost of a Healthy Diet by Region",
        width=600,
        height=400,
    )
)

region_boxplot_path = IMG_DIR / "cost_healthy_diet_boxplot_by_region.png"
chart4.save(str(region_boxplot_path))
chart4