In [7]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load recipe dataset
df = pd.read_csv("../meal_planner/model_recipe.csv")

# Check for required columns
required_columns = {"meal_type", "ingredient_names", "calories", "protein", "carbohydrates", "fat"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(f"Missing columns in dataset: {missing_columns}")

# One-hot encode meal type
encoder = OneHotEncoder(sparse_output=False)  
meal_types_encoded = encoder.fit_transform(df[['meal_type']])

# Process ingredients (Fix NaN values)
df['ingredient_names'] = df['ingredient_names'].fillna("").astype(str)
df['ingredient_names'] = df['ingredient_names'].apply(lambda x: x.split(','))

# Create ingredient bag-of-words
all_ingredients = set(ing for sublist in df['ingredient_names'] for ing in sublist)
ingredient_df = pd.DataFrame([{ing: 1 if ing in row else 0 for ing in all_ingredients} for row in df['ingredient_names']]).reset_index(drop=True)

# Normalize nutrition info
scaler = StandardScaler()
nutrition_features = scaler.fit_transform(df[['calories', 'protein', 'carbohydrates', 'fat']])

# Combine all features
recipe_features = pd.concat([
    pd.DataFrame(meal_types_encoded), 
    ingredient_df, 
    pd.DataFrame(nutrition_features)
], axis=1).reset_index(drop=True)

print("Preprocessed Data Shape:", recipe_features.shape)


Preprocessed Data Shape: (341, 1928)


In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load recipe dataset
def load_data(filepath):
    """Load the dataset and ensure it has the required columns."""
    df = pd.read_csv(filepath)
    
    # Clean column names (remove extra spaces)
    df.columns = df.columns.str.strip()
    
    # Ensure required columns exist
    required_columns = {"meal_type", "ingredient_names", "calories", "protein", "carbohydrates", "fat"}
    missing_columns = required_columns - set(df.columns)
    if missing_columns:
        raise ValueError(f"Missing columns in dataset: {missing_columns}")
    
    return df 

# Filter recipes based on user preferences
def filter_recipes(df, user_preferences):
    """Filter recipes based on meal type, calorie range, and ingredient preferences."""
    # Ensure 'ingredient_names' column exists
    if 'ingredient_names' not in df.columns:
        raise ValueError("The column 'ingredient_names' is missing from the dataset.")
    
    # Filter meal types
    df = df[df["meal_type"].isin(user_preferences["meal_types"])]
    
    # Filter calorie range
    df = df[(df["calories"] >= user_preferences["calorie_range"][0]) & 
            (df["calories"] <= user_preferences["calorie_range"][1])]
    
    # Convert ingredient_names column into lists safely
    df['ingredient_names'] = df['ingredient_names'].fillna("").astype(str).apply(lambda x: x.split(',') if x else [])
    
    # Filter based on preferred ingredients
    if user_preferences["preferred_ingredients"]:
        df = df[df['ingredient_names'].apply(lambda ing_list: bool(set(ing_list) & user_preferences["preferred_ingredients"]))]
    
    # Filter out disliked ingredients
    if user_preferences["disliked_ingredients"]:
        df = df[~df['ingredient_names'].apply(lambda ing_list: bool(set(ing_list) & user_preferences["disliked_ingredients"]))]
    
    # Reset index after filtering
    df = df.reset_index(drop=True)
    
    print(f"Filtered recipes count: {len(df)}")
    return df

# Preprocess data for clustering
def preprocess_data(df):
    """Preprocess the data for clustering by encoding and scaling features."""
    # One-hot encode meal type
    encoder = OneHotEncoder(sparse_output=False)  
    meal_types_encoded = encoder.fit_transform(df[['meal_type']])
    meal_types_encoded_df = pd.DataFrame(meal_types_encoded, columns=encoder.get_feature_names_out(['meal_type']))
    
    # Process ingredients (Convert list of ingredients to a bag-of-words format)
    all_ingredients = set(ing for sublist in df['ingredient_names'] for ing in sublist)
    ingredient_df = pd.DataFrame([{ing: 1 if ing in row else 0 for ing in all_ingredients} for row in df['ingredient_names']]).reset_index(drop=True)
    
    # Normalize nutrition info
    scaler = StandardScaler()
    nutrition_features = scaler.fit_transform(df[['calories', 'protein', 'carbohydrates', 'fat']])
    nutrition_features_df = pd.DataFrame(nutrition_features, columns=['calories', 'protein', 'carbohydrates', 'fat'])
    
    # Combine all features
    recipe_features = pd.concat([
        meal_types_encoded_df, 
        ingredient_df, 
        nutrition_features_df
    ], axis=1).reset_index(drop=True)
    
    print("Preprocessed Data Shape (after filtering):", recipe_features.shape)
    return recipe_features

# Main function
def main():
    # --- User Preferences ---
    user_preferences = {
        "meal_types": ["breakfast", "lunch"],  
        "preferred_ingredients": {"banana", "oats", "chicken"},  
        "disliked_ingredients": {"peanuts", "milk"},  
        "calorie_range": (200, 600)  
    }
    
    # Load data
    try:
        df = load_data("../meal_planner/model_recipe.csv")
    except Exception as e:
        print(f"Error loading data: {e}")
        return
    
    # Filter recipes
    try:
        filtered_df = filter_recipes(df, user_preferences)
    except Exception as e:
        print(f"Error filtering recipes: {e}")
        return
    
    # Preprocess data
    try:
        preprocessed_data = preprocess_data(filtered_df)
    except Exception as e:
        print(f"Error preprocessing data: {e}")
        return
    
    # Output preprocessed data
    print("Preprocessed Data Sample:")
    print(preprocessed_data.head())

# Run the program
if __name__ == "__main__":
    main()

Error filtering recipes: 'ingredient_names'


In [21]:
print(df.columns)

Index([], dtype='object')
