In [18]:
import re
import numpy as np
import pandas as pd

# ===== 1) Load raw =====
# Read the nutrition table from "nutrition.csv" (path is relative to the
# kernel's working directory) and keep it under df_raw as the unmodified input.
df_raw = pd.read_csv("nutrition.csv")

# ===== 2) Helpers =====
def extract_number(x):
    """Return the first number found in ``x`` as a float.

    ``x`` is converted to text, trimmed, lower-cased and stripped of commas
    before searching, so ``"1,234.5 kcal"`` yields ``1234.5``. An optional
    sign, a decimal point and an ``e`` exponent are understood.

    Returns ``np.nan`` when ``x`` is ``None``, a float NaN, or contains no
    number at all.
    """
    is_missing = x is None or (isinstance(x, float) and np.isnan(x))
    if is_missing:
        return np.nan

    text = str(x).strip().lower().replace(",", "")
    found = re.search(r'[-+]?\d*\.?\d+(?:e[-+]?\d+)?', text)
    if found is None:
        return np.nan
    return float(found.group(0))

def to_grams(x):
    """Convert a quantity such as ``"3.5 mg"`` into a float number of grams.

    The first number in the text is read (commas removed, case ignored) and
    scaled according to the unit written directly after it:

    * ``mcg``, ``µg``, ``μg``, ``ug`` -> divided by 1,000,000
    * ``mg``                          -> divided by 1,000
    * ``kg``                          -> multiplied by 1,000
    * ``g``                           -> returned unchanged

    When the token after the number is not one of those units, the whole
    string is searched for ``mcg`` / ``µg`` / ``mg`` as a fallback; if none is
    present the number is returned as-is, i.e. treated as already in grams.

    Parameters
    ----------
    x : Any
        Raw cell value; it is passed through ``str()`` before parsing.

    Returns
    -------
    float
        The value in grams, or ``np.nan`` when ``x`` is ``None``, a float NaN,
        or contains no number.
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return np.nan

    # Fold Greek mu (U+03BC) into the micro sign (U+00B5) so both spellings
    # of "µg" are recognised; the two characters look identical in most fonts.
    s = str(x).strip().lower().replace(",", "").replace("\u03bc", "\u00b5")

    # Group 1: the number. Group 2: the unit token directly after it (may be empty).
    m = re.search(r'([-+]?\d*\.?\d+(?:e[-+]?\d+)?)\s*([a-zµ]*)', s)
    if m is None:
        return np.nan
    num = float(m.group(1))
    unit = m.group(2)

    # Prefer the unit attached to the number: a bare substring test would let
    # a stray "mg" elsewhere in the text rescale a value given in grams.
    if unit in ("mcg", "µg", "ug"):
        return num / 1_000_000.0
    if unit == "mg":
        return num / 1_000.0
    if unit == "kg":
        return num * 1_000.0
    if unit == "g":
        return num

    # Unrecognised or missing unit token: fall back to a substring search.
    # "mcg" must be tested before "mg" because it contains "mg"... no, it does
    # not, but "µg"/"mcg" are checked first to keep the most specific match.
    if "mcg" in s or "µg" in s:
        return num / 1_000_000.0
    if "mg" in s:
        return num / 1_000.0
    return num

# ===== 3) Clean names =====
# Work on a copy so df_raw stays untouched and this cell can be re-run safely.
df = df_raw.copy()
# Trimmed text version of "name"; an empty string for every row if the column is absent.
df["name_fixed"] = df["name"].astype(str).str.strip() if "name" in df.columns else ""

# ===== 4) Core numeric per-100g features =====
# Calories: prefer the exact "calories" column, otherwise the first column whose
# name mentions "calorie". Entries that are not plain numbers become NaN.
if "calories" in df.columns:
    cal_col = "calories"
else:
    cal_col = next((c for c in df.columns if "calorie" in c.lower()), None)
df["calories_per100g"] = pd.to_numeric(df[cal_col], errors="coerce") if cal_col else np.nan

# Protein / Fat / Sugar source columns: first column containing "protein",
# the column named exactly "fat" (case-insensitive), first column containing "sugars".
prot_col = next((c for c in df.columns if "protein" in c.lower()), None)
fat_col = next((c for c in df.columns if c.lower() == "fat"), None)
sugar_col = next((c for c in df.columns if "sugars" in c.lower()), None)

# Parse each source column to grams; a missing source column yields an all-NaN feature.
df["protein_per100g"] = df[prot_col].map(to_grams) if prot_col else np.nan
df["fat_per100g"] = df[fat_col].map(to_grams) if fat_col else np.nan
df["sugar_per100g"] = df[sugar_col].map(to_grams) if sugar_col else np.nan

# ===== 5) Clean negatives and light impute medians (keeps true zeros) =====
feature_cols = ["calories_per100g", "protein_per100g", "fat_per100g", "sugar_per100g"]

# Negative amounts are treated as missing. Calories are included here so that
# every column that gets median-imputed below is cleaned by the same rule.
# Series.mask leaves the column untouched (dtype included) when nothing is negative.
for c in feature_cols:
    df[c] = df[c].mask(df[c] < 0)

# Fill remaining gaps with the column median; an all-NaN column is left as is.
for c in feature_cols:
    if df[c].notna().any():
        df[c] = df[c].fillna(df[c].median())

# Preview
print(df[["name_fixed", "calories_per100g", "protein_per100g", "fat_per100g", "sugar_per100g"]].head(8))

# df is assigned at the top level of the cell, so it is already a global for
# later cells; no explicit globals() update is required.


          name_fixed  calories_per100g  protein_per100g  fat_per100g  \
0         Cornstarch               381             0.26         0.05   
1       Nuts, pecans               691             9.17        71.97   
2      Eggplant, raw                25             0.98         0.18   
3     Teff, uncooked               367            13.30         2.38   
4    Sherbet, orange               144             1.10         2.00   
5   Cauliflower, raw                25             1.92         0.28   
6   Taro leaves, raw                42             4.98         0.74   
7  Lamb, raw, ground               282            16.56        23.41   

   sugar_per100g  
0           0.00  
1           3.97  
2           3.53  
3           1.84  
4          24.32  
5           1.91  
6           3.01  
7           0.00  


In [19]:
# Regex fragments for food names that sound wholesome: fruit, berries,
# grains/cereals, oats and a few named whole grains.
healthy_like_patterns = [
    # fruit and berries
    r"\bfruit",
    r"\bfruits",
    r"\bberry",
    r"\bberries",
    # grains, cereals and oats
    r"\bgrain",
    r"\bwhole\s*grain",
    r"\bcereal",
    r"\boat",
    r"\boats\b",
    r"\boatmeal",
    # specific whole grains
    r"\bwhole\s*wheat",
    r"\bbrown\s*rice",
    r"\bquinoa",
    r"\bbarley",
]

# Lower-cased names are stored on df so the patterns can stay lower-case.
df["_cat_norm"] = df["name_fixed"].astype(str).str.lower()

# True for every row whose name matches at least one fragment.
healthy_mask = df["_cat_norm"].str.contains(
    "|".join(healthy_like_patterns),
    regex=True,
    na=False,
)

# Matching foods with their sugar / protein / fat values.
q1 = df.loc[
    healthy_mask,
    ["name_fixed", "sugar_per100g", "protein_per100g", "fat_per100g"],
]

# Show the 15 matches with the highest sugar value.
q1.sort_values(by="sugar_per100g", ascending=False).head(15)


Unnamed: 0,name_fixed,sugar_per100g,protein_per100g,fat_per100g
535,Candied fruit,80.68,0.34,0.07
6570,"Beverages, powder, orange flavor, GATORADE, QU...",80.5,0.0,1.23
6700,"Candies, SKITTLES Wild Berry Bite Size Candies...",75.92,0.19,4.25
5557,"Babyfood, Real Fruit Bars, GERBER GRADUATE FRU...",68.65,0.82,2.24
3150,"Candies, with high vitamin C, fruit snacks",68.18,0.08,0.0
79,"Syrup, fruit flavored",65.0,0.0,0.02
6607,"Candies, Tropical fruits, STARBURST Fruit Chew...",58.18,0.41,8.31
6590,"Candies, Original fruits, STARBURST Fruit Chew...",58.12,0.41,8.21
6573,"Candies, Fruit and Creme, STARBURST Fruit Chew...",57.98,0.41,8.36
2005,"Snacks, pieces, fruit leather",57.58,1.0,2.68


In [20]:
# Cut-offs applied to protein_per100g and fat_per100g respectively.
HIGH_PROTEIN_THRESHOLD = 20.0
HIGH_FAT_THRESHOLD = 10.0

# Rows at or above the protein cut-off.
high_protein = df.loc[df["protein_per100g"].ge(HIGH_PROTEIN_THRESHOLD)].copy()
hp_n = len(high_protein)

# Of those, how many are also at or above the fat cut-off.
hp_highfat_n = int(high_protein["fat_per100g"].ge(HIGH_FAT_THRESHOLD).sum())

# Share of high-protein rows that are also high-fat; NaN when there are none.
if hp_n > 0:
    hp_prop = hp_highfat_n / hp_n
else:
    hp_prop = np.nan

print(f"จำนวน High-Protein: {hp_n}")
print(f"ในนี้ ไขมันสูง: {hp_highfat_n}")
print(f"สัดส่วนที่ไขมันสูงด้วย: {hp_prop:.1%}")


จำนวน High-Protein: 2179
ในนี้ ไขมันสูง: 846
สัดส่วนที่ไขมันสูงด้วย: 38.8%


In [22]:
# Pearson correlation between the fat and sugar columns across all rows of df.
corr = df["fat_per100g"].corr(
    df["sugar_per100g"],
    method="pearson",
)
print("Pearson correlation (fat vs sugar):", corr)


Pearson correlation (fat vs sugar): -8.20654025606272e-05
