In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Read the raw nutrition records from the CSV in the working directory.
df = pd.read_csv("nutrition_dataset.csv")

# Step 1: Count the null entries in every column.
missing_per_column = df.isnull().sum()
print("Missing Values in Each Column:", missing_per_column, sep="\n")

# Step 2: Show the dtype pandas inferred for every column.
column_dtypes = df.dtypes
print("\nData Types of Columns:", column_dtypes, sep="\n")

# Step 3: Convert numeric columns to appropriate types
# Report every raw value in Age, Height, Weight that cannot be parsed as a number.
# The check runs the same parser the conversion step uses
# (pd.to_numeric with errors="coerce"), so the values listed here are exactly
# the ones that will become NaN. Negatives and exponent notation count as
# numeric; malformed strings such as "1.2.3" do not. Values that are already
# missing are not reported here (they are counted in Step 1).
print("\nChecking for Non-Numeric Values in Numeric Columns:")
for col in ["Age", "Height", "Weight"]:
    coerced = pd.to_numeric(df[col], errors="coerce")
    # Present in the raw column but NaN after coercion => not parseable.
    unparseable = df[col].notna() & coerced.isna()
    non_numeric = df[unparseable]
    if not non_numeric.empty:
        print(f"Non-numeric values in {col}:")
        print(non_numeric[col].unique())

# Coerce the three measurement columns to numbers; entries that cannot be
# parsed are turned into NaN instead of raising.
numeric_cols = ["Age", "Height", "Weight"]
for name in numeric_cols:
    df[name] = pd.to_numeric(df[name], errors="coerce")

# Report how many NaN values each column holds after the coercion.
nan_counts = df[numeric_cols].isnull().sum()
print("\nMissing Values After Numeric Conversion:")
print(nan_counts)

# Discard every row that lacks a usable Age, Height, or Weight.
df = df.dropna(subset=numeric_cols)
print("\nNumber of Rows After Dropping NaN in Numeric Columns:", len(df))

Missing Values in Each Column:
Age                     0
Gender                  0
Height                  0
Weight                  0
Activity Level          0
Fitness Goal            0
Dietary Preference      0
Daily Calorie Target    0
Protein                 0
Carbohydrates           0
Fat                     0
Breakfast Suggestion    0
Lunch Suggestion        0
Dinner Suggestion       0
Snack Suggestion        0
dtype: int64

Data Types of Columns:
Age                     object
Gender                  object
Height                  object
Weight                  object
Activity Level          object
Fitness Goal            object
Dietary Preference      object
Daily Calorie Target    object
Protein                 object
Carbohydrates           object
Fat                     object
Breakfast Suggestion    object
Lunch Suggestion        object
Dinner Suggestion       object
Snack Suggestion        object
dtype: object

Checking for Non-Numeric Values in Numeric Columns:
Non-numeri

In [2]:
# Step 4: Check for illogical values in numeric columns
numeric_summary = df[["Age", "Height", "Weight"]].describe()
print("\nSummary of Numeric Columns:")
print(numeric_summary)

# Keep only rows whose values lie inside these inclusive ranges:
# Age 0-100, Height 100-250, Weight 30-200. Rows failing any one test are removed.
age_ok = df["Age"].between(0, 100)
height_ok = df["Height"].between(100, 250)
weight_ok = df["Weight"].between(30, 200)
df = df[age_ok & height_ok & weight_ok]
print("\nNumber of Rows After Removing Illogical Numeric Values:", len(df))

# Step 5: Count fully duplicated rows, then drop them.
duplicate_count = df.duplicated().sum()
print("\nNumber of Duplicated Rows:", duplicate_count)
df = df.drop_duplicates()
remaining_rows = len(df)
print("Number of Rows After Removing Duplicates:", remaining_rows)

# Step 6: List the distinct labels of each categorical feature so that
# inconsistent spellings or overlapping categories can be spotted by eye.
print("\nUnique Values in Categorical Features:")
for feature in ["Gender", "Activity Level", "Fitness Goal", "Dietary Preference"]:
    print(f"{feature}:", df[feature].unique())


Summary of Numeric Columns:
              Age      Height      Weight
count  498.000000  498.000000  498.000000
mean    41.210843  172.636546   76.287149
std     15.151207    9.686973   14.692416
min     19.000000  155.000000   48.000000
25%     28.000000  165.000000   65.000000
50%     38.000000  170.000000   75.000000
75%     55.000000  180.000000   90.000000
max     72.000000  190.000000  110.000000

Number of Rows After Removing Illogical Numeric Values: 498

Number of Duplicated Rows: 0
Number of Rows After Removing Duplicates: 498

Unique Values in Categorical Features:
Gender: ['Male' 'Female']
Activity Level: ['Moderately Active' 'Lightly Active' 'Sedentary' 'Very Active']
Fitness Goal: ['Weight Loss' 'Maintenance' 'Muscle Gain' 'Weight Maintenance']
Dietary Preference: ['Omnivore' 'Vegetarian' 'Vegan']


In [3]:
# Step 7: Inspect the target column (Breakfast Suggestion):
# frequency of each suggestion, followed by the full list of distinct values.
suggestions = df["Breakfast Suggestion"]
print("\nTarget (Breakfast Suggestion) Value Counts:", suggestions.value_counts(), sep="\n")
print("\nAll Unique Breakfast Suggestions:", suggestions.unique(), sep="\n")


Target (Breakfast Suggestion) Value Counts:
Breakfast Suggestion
Tofu scramble with vegetables                              43
Oatmeal with fruit and nuts                                38
Greek yogurt with fruit and granola                        32
Greek yogurt with berries and granola                      30
Oatmeal with berries and nuts                              25
                                                           ..
Oatmeal with protein powder and banana                      1
Tofu and vegetable scramble with avocado toast              1
Scrambled eggs with whole wheat toast and smoked salmon     1
Scrambled eggs with spinach and whole wheat toast           1
Eggs with whole grain toast                                 1
Name: count, Length: 117, dtype: int64

All Unique Breakfast Suggestions:
['Oatmeal with berries and nuts' 'Tofu scramble with veggies'
 'Tofu and veggie breakfast burrito' 'Greek yogurt with granola and fruit'
 'Scrambled eggs with whole wheat toast a

In [4]:
# Normalize the target text so lookups are case- and padding-insensitive:
# lowercase first, then trim leading/trailing whitespace.
lowered_suggestions = df["Breakfast Suggestion"].str.lower()
df["Breakfast Suggestion"] = lowered_suggestions.str.strip()

# Group similar breakfasts into a handful of coarse categories.
# Suggestions are listed once per category (lowercase, exactly as they appear
# after normalization) and the flat suggestion -> category lookup is derived
# from these groups, so a suggestion cannot silently be keyed twice.
breakfast_groups = {
    "oatmeal-based": [
        "oatmeal with berries and nuts",
        "oatmeal with fruit and nuts",
        "oatmeal with protein powder and fruit",
        "oatmeal with protein powder and berries",
        "oatmeal with protein powder and banana",
        "oatmeal with berries and plant-based milk",
        "oatmeal with berries and flax seeds",
        "oatmeal with plant-based milk and fruit",
        "oatmeal with protein powder",
        "overnight oats with fruit and chia seeds",
        "overnight oats with chia seeds and fruit",
        "overnight oats with berries and nuts",
        "overnight oats with berries and chia seeds",
        "vegan overnight oats with berries",
        "vegan overnight oats with chia seeds and berries",
        "1 cup oatmeal with berries and nuts",
        "overnight oats with fruit and nuts",
    ],
    "tofu-based": [
        "tofu scramble with veggies",
        "tofu and veggie breakfast burrito",
        "tofu and chickpea scramble",
        "tofu and vegetable stir-fry",
        "tofu scramble with vegetables",
        "tofu scramble with vegetables and avocado",
        "tofu scramble with spinach and avocado",
        "tofu scramble with whole-wheat toast",
        "tofu and vegetable breakfast burrito",
        "tofu scramble with spinach and avocado toast",
        "tofu scramble with vegetables and whole-wheat toast",
        "tofu scramble with whole wheat toast and avocado",
        "tofu and vegetable stir-fry with brown rice",
        "tofu and vegetable scramble with whole wheat toast",
        "tofu omelet with spinach",
        "tofu breakfast burrito with whole-wheat tortilla",
        "tofu scramble with spinach and tomato",
        "tofu scramble with vegetables and whole wheat toast",
        "tofu scramble with avocado and whole-wheat toast",
        "tofu scramble with whole wheat toast",
        "vegan breakfast burrito with tofu and vegetables",
        "tofu scramble with avocado and whole wheat toast",
        "tofu breakfast burrito with avocado",
        "tofu scramble with avocado toast",
        "tofu and vegetable scramble with wholegrain toast",
        "tofu and vegetable scramble with avocado toast",
        "tofu scramble with veggies and avocado toast",
        "tofu scramble with vegan toast and avocado",
        "tofu breakfast burrito",
        "tofu and vegetable scramble",
        "tofu scramble with spinach and mushrooms",
        "tofu scramble with veggies and whole-wheat toast",
        "scrambled tofu with whole-wheat toast and avocado",
        "tofu scramble with whole-wheat toast and vegetables",
        "tofu scramble with whole wheat toast and fruit",
        "tofu and vegetable scramble with whole-wheat toast",
        "tofu scramble with vegetables and avocado toast",
        "tofu scramble with whole grain toast and avocado",
        "scrambled tofu with spinach and tomato",
        "tofu scramble with whole-wheat toast and avocado",
    ],
    "egg-based": [
        "scrambled eggs with whole wheat toast and avocado",
        "scrambled eggs with whole-wheat toast",
        "scrambled eggs with avocado and whole-wheat toast",
        "scrambled eggs with whole-wheat toast and fruit",
        "scrambled eggs with bacon and whole-wheat toast",
        "scrambled eggs with vegetables and whole-wheat toast",
        "scrambled eggs with whole wheat toast",
        "scrambled eggs with spinach and whole wheat toast",
        "scrambled eggs with whole wheat toast and smoked salmon",
        "scrambled eggs with whole grain toast",
        "eggs with wholegrain toast",
        "eggs with whole-wheat toast",
        "eggs with whole-wheat toast and avocado",
        "eggs with whole-wheat toast and bacon",
        "eggs with wholegrain toast and avocado",
        "eggs with whole wheat toast and avocado",
        "eggs with whole wheat toast",
        "eggs with whole wheat toast and fruit",
        "eggs with whole grain toast",
        "whole-wheat toast with egg and avocado",
        "3 eggs with whole-wheat toast and avocado",
        "scrambled eggs with whole wheat toast and fruit",
        "scrambled eggs with whole-wheat toast and avocado",
        "eggs with whole-wheat toast and fruit",
        "scrambled eggs with whole wheat toast and spinach",
    ],
    "yogurt-based": [
        "greek yogurt with granola and fruit",
        "greek yogurt with granola",
        "greek yogurt with berries and granola",
        "greek yogurt with fruit and granola",
        "greek yogurt with berries and nuts",
        "greek yogurt with berries and almonds",
        "greek yogurt with protein powder and fruit",
        "yogurt with fruit and granola",
        "yogurt parfait with granola and fruit",
        "yogurt with berries and granola",
        "fruit and yogurt parfait",
        "greek yogurt with granola and berries",
        "yogurt with granola and fruit",
    ],
    "pancake-based": [
        "protein pancakes with fruit and nuts",
        "pancakes with fruit and syrup",
        "protein pancakes with fruit",
        "pancakes with fruit and nuts",
        "protein pancakes with fruit and syrup",
    ],
    # Catch-all for quinoa, plain toast, smoothie and non-tofu burrito breakfasts.
    # NOTE(review): the tofu/quinoa stir-fry is kept here, as in the original
    # mapping, although the other tofu stir-fries are "tofu-based" -- confirm.
    "other": [
        "quinoa porridge with fruit and nuts",
        "tofu and vegetable stir-fry with quinoa",
        "quinoa porridge with berries and nuts",
        "quinoa porridge with berries",
        "quinoa breakfast bowl with berries and nuts",
        "wholegrain toast with avocado",
        "protein smoothie with fruit and spinach",
        "smoothie with fruit and protein powder",
        "breakfast burrito with eggs and vegetables",
        "breakfast burrito with beans and vegetables",
        "breakfast burrito with beans and veggies",
        "fruit salad with yogurt",
    ],
}

# Flat lookup used with Series.map: normalized suggestion text -> category label.
breakfast_mapping_fixed = {
    suggestion: category
    for category, members in breakfast_groups.items()
    for suggestion in members
}

# A shorter mapping than the total number of listed suggestions means some
# suggestion was listed more than once (possibly under two categories).
assert len(breakfast_mapping_fixed) == sum(
    len(members) for members in breakfast_groups.values()
), "a breakfast suggestion is listed more than once in breakfast_groups"
# Translate every suggestion into its category; suggestions that are absent
# from the mapping come back as NaN from map() and are labelled "other".
mapped_category = df["Breakfast Suggestion"].map(breakfast_mapping_fixed)
df["Breakfast Category"] = mapped_category.fillna("other")

print("\nFixed Breakfast Category Value Counts:")
print(df["Breakfast Category"].value_counts())

# List what ended up in the catch-all bucket so it can be reviewed.
in_other = df["Breakfast Category"] == "other"
print("\nBreakfast Suggestions in 'other' Category:")
print(df.loc[in_other, "Breakfast Suggestion"].unique())


Fixed Breakfast Category Value Counts:
Breakfast Category
tofu-based       176
oatmeal-based    121
yogurt-based      93
egg-based         85
other             14
pancake-based      9
Name: count, dtype: int64

Breakfast Suggestions in 'other' Category:
['wholegrain toast with avocado' 'quinoa porridge with fruit and nuts'
 'protein smoothie with fruit and spinach'
 'quinoa porridge with berries and nuts'
 'breakfast burrito with eggs and vegetables'
 'smoothie with fruit and protein powder'
 'breakfast burrito with beans and vegetables'
 'quinoa breakfast bowl with berries and nuts'
 'tofu and vegetable stir-fry with quinoa' 'quinoa porridge with berries'
 'fruit salad with yogurt' 'breakfast burrito with beans and veggies']


In [5]:
# Build the final mapping from the previous one: any suggestion labelled
# "quinoa-based" or "toast-based" is folded into "other"; every other label
# is carried over unchanged.
# NOTE(review): no entry of breakfast_mapping_fixed currently uses either of
# those two labels, so this presently yields a plain copy.
labels_merged_into_other = ("quinoa-based", "toast-based")
breakfast_mapping_final = {
    suggestion: ("other" if label in labels_merged_into_other else label)
    for suggestion, label in breakfast_mapping_fixed.items()
}

# Re-derive the category column from the final mapping; suggestions that are
# absent from the mapping are labelled "other".
final_category = df["Breakfast Suggestion"].map(breakfast_mapping_final)
df["Breakfast Category"] = final_category.fillna("other")
print("\nFinal Breakfast Category Value Counts:")
print(df["Breakfast Category"].value_counts())


# Show which suggestions make up the catch-all bucket.
in_other_final = df["Breakfast Category"] == "other"
print("\nBreakfast Suggestions in 'other' Category:")
print(df.loc[in_other_final, "Breakfast Suggestion"].unique())


Final Breakfast Category Value Counts:
Breakfast Category
tofu-based       176
oatmeal-based    121
yogurt-based      93
egg-based         85
other             14
pancake-based      9
Name: count, dtype: int64

Breakfast Suggestions in 'other' Category:
['wholegrain toast with avocado' 'quinoa porridge with fruit and nuts'
 'protein smoothie with fruit and spinach'
 'quinoa porridge with berries and nuts'
 'breakfast burrito with eggs and vegetables'
 'smoothie with fruit and protein powder'
 'breakfast burrito with beans and vegetables'
 'quinoa breakfast bowl with berries and nuts'
 'tofu and vegetable stir-fry with quinoa' 'quinoa porridge with berries'
 'fruit salad with yogurt' 'breakfast burrito with beans and veggies']


In [6]:
# Sanity re-check of the cleaned frame before encoding:
# numeric summary, distinct categorical labels, and remaining duplicate rows.
numeric_overview = df[["Age", "Height", "Weight"]].describe()
print("\nSummary of Numeric Features:")
print(numeric_overview)

print("\nUnique Values in Categorical Features:")
for feature in ("Gender", "Activity Level", "Fitness Goal", "Dietary Preference"):
    print(f"{feature}:", df[feature].unique())

remaining_duplicates = df.duplicated().sum()
print("\nNumber of Duplicated Rows:", remaining_duplicates)


Summary of Numeric Features:
              Age      Height      Weight
count  498.000000  498.000000  498.000000
mean    41.210843  172.636546   76.287149
std     15.151207    9.686973   14.692416
min     19.000000  155.000000   48.000000
25%     28.000000  165.000000   65.000000
50%     38.000000  170.000000   75.000000
75%     55.000000  180.000000   90.000000
max     72.000000  190.000000  110.000000

Unique Values in Categorical Features:
Gender: ['Male' 'Female']
Activity Level: ['Moderately Active' 'Lightly Active' 'Sedentary' 'Very Active']
Fitness Goal: ['Weight Loss' 'Maintenance' 'Muscle Gain' 'Weight Maintenance']
Dietary Preference: ['Omnivore' 'Vegetarian' 'Vegan']

Number of Duplicated Rows: 0


In [13]:
# Step 8: Encode categorical features
# Gender becomes integer codes; the fitted encoder is kept so the codes can be
# mapped back to the original labels later.
le_gender = LabelEncoder()
le_gender.fit(df["Gender"])
df["Gender"] = le_gender.transform(df["Gender"])

# The remaining categorical features are expanded into one indicator column
# per label, named "<prefix>_<label>".
# NOTE(review): "Fitness Goal" contains both 'Maintenance' and
# 'Weight Maintenance', which therefore get separate indicator columns --
# confirm whether they are meant to be the same goal.
one_hot_prefixes = {
    "Activity Level": "Activity",
    "Fitness Goal": "Goal",
    "Dietary Preference": "Diet",
}
df = pd.get_dummies(df, columns=list(one_hot_prefixes), prefix=one_hot_prefixes)

# The target category is label-encoded into its own column; the text column is kept.
le_breakfast = LabelEncoder()
le_breakfast.fit(df["Breakfast Category"])
df["Breakfast Category Encoded"] = le_breakfast.transform(df["Breakfast Category"])

In [15]:
# Preview the first five rows of the encoded frame.
df.head(n=5)

Unnamed: 0,Age,Gender,Height,Weight,Daily Calorie Target,Protein,Carbohydrates,Fat,Breakfast Suggestion,Lunch Suggestion,...,Activity_Sedentary,Activity_Very Active,Goal_Maintenance,Goal_Muscle Gain,Goal_Weight Loss,Goal_Weight Maintenance,Diet_Omnivore,Diet_Vegan,Diet_Vegetarian,Breakfast Category Encoded
0,25.0,1,180.0,80.0,2000,120,250,60,oatmeal with berries and nuts,Grilled chicken salad with mixed greens,...,False,False,False,False,True,False,True,False,False,1
1,32.0,0,165.0,65.0,1600,80,200,40,tofu scramble with veggies,Lentil soup with whole wheat bread,...,False,False,False,False,True,False,False,False,True,4
2,48.0,1,175.0,95.0,2200,100,300,65,tofu and veggie breakfast burrito,Black bean burger on a whole wheat bun,...,True,False,True,False,False,False,False,True,False,4
3,55.0,0,160.0,70.0,2500,140,350,80,greek yogurt with granola and fruit,Chicken and vegetable stir-fry,...,False,True,False,False,True,False,True,False,False,5
4,62.0,1,170.0,85.0,2000,80,250,55,scrambled eggs with whole wheat toast and avocado,Quinoa salad with chickpeas and vegetables,...,True,False,True,False,False,False,False,False,True,0


In [17]:
# Step 9: Write the cleaned, encoded frame to CSV without the index column.
cleaned_csv_name = "nutrition_dataset_cleaned.csv"
df.to_csv(cleaned_csv_name, index=False)
print(f"\nCleaned dataset saved as '{cleaned_csv_name}'")


Cleaned dataset saved as 'nutrition_dataset_cleaned.csv'
