In [6]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load recipe dataset
def load_data(filepath):
    """Load the dataset and ensure it has the required columns."""
    df = pd.read_csv(filepath)
    
    # Clean column names (remove extra spaces)
    df.columns = df.columns.str.strip()
    
    # Ensure required columns exist
    required_columns = {"meal_type", "ingredient_names", "calories", "protein", "carbohydrates", "fat"}
    missing_columns = required_columns - set(df.columns)
    if missing_columns:
        raise ValueError(f"Missing columns in dataset: {missing_columns}")
    
    # Debug: Print column names and a sample of the data
    print("Columns in the dataset:", df.columns)
    print("Sample data:")
    print(df.head())
    
    return df

# Convert ingredient_names string to a proper list
def clean_ingredient_names(ingredient_names):
    """Convert the ingredient_names string into a proper Python list."""
    if pd.isna(ingredient_names):  # Handle NaN values
        return []
    
    try:
        # Ensure the input is a string
        ingredient_names = str(ingredient_names)
        
        # Remove square brackets and quotes, then split by comma
        ingredients = ingredient_names.strip("[]").replace("'", "").split(", ")
        return [ingredient.strip() for ingredient in ingredients]
    except Exception as e:
        print(f"Error cleaning ingredient_names: {e}")
        return []

# Filter recipes based on user preferences
def filter_recipes(df, user_preferences):
    """Filter recipes based on meal type, calorie range, and ingredient preferences."""
    # Ensure 'ingredient_names' column exists
    if 'ingredient_names' not in df.columns:
        raise ValueError("The column 'ingredient_names' is missing from the dataset.")
    
    # Debug: Print the column before processing
    print("ingredient_names column before processing:")
    print(df['ingredient_names'].head())
    
    # Convert ingredient_names column into proper lists
    df['ingredient_names'] = df['ingredient_names'].apply(clean_ingredient_names)
    
    # Debug: Print the column after processing
    print("ingredient_names column after processing:")
    print(df['ingredient_names'].head())
    
    # Filter meal types
    df = df[df["meal_type"].isin(user_preferences["meal_types"])]
    
    # Filter calorie range
    df = df[(df["calories"] >= user_preferences["calorie_range"][0]) & 
            (df["calories"] <= user_preferences["calorie_range"][1])]
    
    # Filter based on preferred ingredients
    if user_preferences["preferred_ingredients"]:
        df = df[df['ingredient_names'].apply(lambda ing_list: bool(set(ing_list) & user_preferences["preferred_ingredients"]))]
    
    # Filter out disliked ingredients
    if user_preferences["disliked_ingredients"]:
        df = df[~df['ingredient_names'].apply(lambda ing_list: bool(set(ing_list) & user_preferences["disliked_ingredients"]))]
    
    # Reset index after filtering
    df = df.reset_index(drop=True)
    
    print(f"Filtered recipes count: {len(df)}")
    return df

# Preprocess data for clustering
def preprocess_data(df):
    """Preprocess the data for clustering by encoding and scaling features."""
    # One-hot encode meal type
    encoder = OneHotEncoder(sparse_output=False)  
    meal_types_encoded = encoder.fit_transform(df[['meal_type']])
    meal_types_encoded_df = pd.DataFrame(meal_types_encoded, columns=encoder.get_feature_names_out(['meal_type']))
    
    # Process ingredients (Convert list of ingredients to a bag-of-words format)
    all_ingredients = set(ing for sublist in df['ingredient_names'] for ing in sublist)
    ingredient_df = pd.DataFrame([{ing: 1 if ing in row else 0 for ing in all_ingredients} for row in df['ingredient_names']]).reset_index(drop=True)
    
    # Normalize nutrition info
    scaler = StandardScaler()
    nutrition_features = scaler.fit_transform(df[['calories', 'protein', 'carbohydrates', 'fat']])
    nutrition_features_df = pd.DataFrame(nutrition_features, columns=['calories', 'protein', 'carbohydrates', 'fat'])
    
    # Combine all features
    recipe_features = pd.concat([
        meal_types_encoded_df, 
        ingredient_df, 
        nutrition_features_df
    ], axis=1).reset_index(drop=True)
    
    print("Preprocessed Data Shape (after filtering):", recipe_features.shape)
    return recipe_features

# Main function
def main():
    # --- User Preferences ---
    user_preferences = {
        "meal_types": ["breakfast", "lunch"],  
        "preferred_ingredients": {"banana", "oats", "chicken"},  
        "disliked_ingredients": {"peanuts", "milk"},  
        "calorie_range": (200, 600)  
    }
    
    # Load data
    try:
        df = load_data("../meal_planner/model_recipe.csv")
    except Exception as e:
        print(f"Error loading data: {e}")
        return
    
    # Filter recipes
    try:
        filtered_df = filter_recipes(df, user_preferences)
    except Exception as e:
        print(f"Error filtering recipes: {e}")
        return
    
    # Preprocess data
    try:
        preprocessed_data = preprocess_data(filtered_df)
    except Exception as e:
        print(f"Error preprocessing data: {e}")
        return
    
    # Output preprocessed data
    print("Preprocessed Data Sample:")
    print(preprocessed_data.head())

# Run the program
if __name__ == "__main__":
    main()

Columns in the dataset: Index(['calories', 'carbohydrates', 'cuisine', 'fat', 'protein', 'title',
       'protein_type', 'meal_type', 'ingredient_names', 'diet_type',
       'diet_prefrences'],
      dtype='object')
Sample data:
   calories  carbohydrates   cuisine  fat  protein  \
0      98.0            3.0  American  5.0     10.0   
1     146.0            4.0  American  6.0     19.0   
2     291.0           17.0  American  7.0     36.0   
3     357.0           33.0  American  8.0     38.0   
4     260.0           23.0  American  7.0     26.0   

                              title protein_type      meal_type  \
0       Blueberry Breakfast Sausage           []  ['breakfast']   
1  Greek Yogurt Buffalo Chicken Dip  ['chicken']      ['snack']   
2        Air Fryer Teriyaki Chicken  ['chicken']     ['dinner']   
3    Instant Pot Chicken Stroganoff  ['chicken']     ['dinner']   
4                       Cowboy Soup     ['beef']     ['dinner']   

                                    ingredi

In [7]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load recipe dataset
def load_data(filepath):
    """Load the dataset and ensure it has the required columns."""
    df = pd.read_csv(filepath)
    
    # Clean column names (remove extra spaces)
    df.columns = df.columns.str.strip()
    
    # Ensure required columns exist
    required_columns = {"meal_type", "ingredient_names", "calories", "protein", "carbohydrates", "fat"}
    missing_columns = required_columns - set(df.columns)
    if missing_columns:
        raise ValueError(f"Missing columns in dataset: {missing_columns}")
    
    # Debug: Print column names and a sample of the data
    print("Columns in the dataset:", df.columns)
    print("Sample data:")
    print(df.head())
    
    return df

# Convert ingredient_names string to a proper list
def clean_ingredient_names(ingredient_names):
    """Convert the ingredient_names string into a proper Python list."""
    if pd.isna(ingredient_names):  # Handle NaN values
        return []
    
    try:
        # Ensure the input is a string
        ingredient_names = str(ingredient_names)
        
        # Remove square brackets and quotes, then split by comma
        ingredients = ingredient_names.strip("[]").replace("'", "").split(", ")
        return [ingredient.strip() for ingredient in ingredients]
    except Exception as e:
        print(f"Error cleaning ingredient_names: {e}")
        return []

# Filter recipes based on user preferences
def filter_recipes(df, user_preferences):
    """Filter recipes based on meal type, calorie range, and ingredient preferences."""
    # Ensure 'ingredient_names' column exists
    if 'ingredient_names' not in df.columns:
        raise ValueError("The column 'ingredient_names' is missing from the dataset.")
    
    # Debug: Print the column before processing
    print("ingredient_names column before processing:")
    print(df['ingredient_names'].head())
    
    # Convert ingredient_names column into proper lists
    df['ingredient_names'] = df['ingredient_names'].apply(clean_ingredient_names)
    
    # Debug: Print the column after processing
    print("ingredient_names column after processing:")
    print(df['ingredient_names'].head())
    
    # Filter meal types
    df = df[df["meal_type"].apply(lambda x: x[0] if isinstance(x, list) else x).isin(user_preferences["meal_types"])]
    
    # Filter calorie range
    df = df[(df["calories"] >= user_preferences["calorie_range"][0]) & 
            (df["calories"] <= user_preferences["calorie_range"][1])]
    
    # Debug: Print DataFrame after meal type and calorie filtering
    print("DataFrame after meal type and calorie filtering:")
    print(df.head())
    
    # Filter based on preferred ingredients
    if user_preferences["preferred_ingredients"]:
        df = df[df['ingredient_names'].apply(lambda ing_list: bool(set(ing_list) & user_preferences["preferred_ingredients"]))]
    
    # Debug: Print DataFrame after preferred ingredients filtering
    print("DataFrame after preferred ingredients filtering:")
    print(df.head())
    
    # Filter out disliked ingredients
    if user_preferences["disliked_ingredients"]:
        df = df[~df['ingredient_names'].apply(lambda ing_list: bool(set(ing_list) & user_preferences["disliked_ingredients"]))]
    
    # Debug: Print DataFrame after disliked ingredients filtering
    print("DataFrame after disliked ingredients filtering:")
    print(df.head())
    
    # Reset index after filtering
    df = df.reset_index(drop=True)
    
    print(f"Filtered recipes count: {len(df)}")
    return df

# Preprocess data for clustering
def preprocess_data(df):
    """Preprocess the data for clustering by encoding and scaling features."""
    # One-hot encode meal type
    encoder = OneHotEncoder(sparse_output=False)  
    meal_types_encoded = encoder.fit_transform(df[['meal_type']])
    meal_types_encoded_df = pd.DataFrame(meal_types_encoded, columns=encoder.get_feature_names_out(['meal_type']))
    
    # Process ingredients (Convert list of ingredients to a bag-of-words format)
    all_ingredients = set(ing for sublist in df['ingredient_names'] for ing in sublist)
    ingredient_df = pd.DataFrame([{ing: 1 if ing in row else 0 for ing in all_ingredients} for row in df['ingredient_names']]).reset_index(drop=True)
    
    # Normalize nutrition info
    scaler = StandardScaler()
    nutrition_features = scaler.fit_transform(df[['calories', 'protein', 'carbohydrates', 'fat']])
    nutrition_features_df = pd.DataFrame(nutrition_features, columns=['calories', 'protein', 'carbohydrates', 'fat'])
    
    # Combine all features
    recipe_features = pd.concat([
        meal_types_encoded_df, 
        ingredient_df, 
        nutrition_features_df
    ], axis=1).reset_index(drop=True)
    
    print("Preprocessed Data Shape (after filtering):", recipe_features.shape)
    return recipe_features

# Main function
def main():
    # --- User Preferences ---
    user_preferences = {
        "meal_types": ["breakfast", "lunch"],  
        "preferred_ingredients": {"banana", "oats", "chicken"},  
        "disliked_ingredients": {"peanuts", "milk"},  
        "calorie_range": (200, 600)  
    }
    
    # Load data
    try:
        df = load_data("../meal_planner/model_recipe.csv")
    except Exception as e:
        print(f"Error loading data: {e}")
        return
    
    # Filter recipes
    try:
        filtered_df = filter_recipes(df, user_preferences)
    except Exception as e:
        print(f"Error filtering recipes: {e}")
        return
    
    # Preprocess data
    try:
        preprocessed_data = preprocess_data(filtered_df)
    except Exception as e:
        print(f"Error preprocessing data: {e}")
        return
    
    # Output preprocessed data
    print("Preprocessed Data Sample:")
    print(preprocessed_data.head())

# Run the program
if __name__ == "__main__":
    main()

Columns in the dataset: Index(['calories', 'carbohydrates', 'cuisine', 'fat', 'protein', 'title',
       'protein_type', 'meal_type', 'ingredient_names', 'diet_type',
       'diet_prefrences'],
      dtype='object')
Sample data:
   calories  carbohydrates   cuisine  fat  protein  \
0      98.0            3.0  American  5.0     10.0   
1     146.0            4.0  American  6.0     19.0   
2     291.0           17.0  American  7.0     36.0   
3     357.0           33.0  American  8.0     38.0   
4     260.0           23.0  American  7.0     26.0   

                              title protein_type      meal_type  \
0       Blueberry Breakfast Sausage           []  ['breakfast']   
1  Greek Yogurt Buffalo Chicken Dip  ['chicken']      ['snack']   
2        Air Fryer Teriyaki Chicken  ['chicken']     ['dinner']   
3    Instant Pot Chicken Stroganoff  ['chicken']     ['dinner']   
4                       Cowboy Soup     ['beef']     ['dinner']   

                                    ingredi

In [8]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def load_data(filepath):
    """Load and validate the dataset."""
    df = pd.read_csv(filepath)
    df.columns = df.columns.str.strip()
    
    required_columns = {"meal_type", "ingredient_names", "calories", "protein", "carbohydrates", "fat"}
    missing_columns = required_columns - set(df.columns)
    if missing_columns:
        raise ValueError(f"Missing columns: {missing_columns}")
    
    # Clean meal_type and ingredient_names
    df['meal_type'] = df['meal_type'].apply(clean_ingredient_names)
    df['ingredient_names'] = df['ingredient_names'].apply(clean_ingredient_names)
    
    return df

def clean_ingredient_names(value):
    """Convert string representations of lists to actual lists."""
    if pd.isna(value):
        return []
    try:
        value = str(value).strip("[]").replace("'", "")
        return [item.strip() for item in value.split(", ") if item.strip()]
    except:
        return []

def filter_recipes(df, user_prefs):
    """Filter recipes based on user preferences."""
    if df.empty:
        return pd.DataFrame()
    
    # Extract first meal_type value (e.g., "breakfast" from ["breakfast"])
    df['meal_type_clean'] = df['meal_type'].apply(lambda x: x[0] if x else None)
    
    # Filter meal types
    df = df[df['meal_type_clean'].isin(user_prefs["meal_types"])]
    
    # Filter calories
    df = df[(df['calories'] >= user_prefs["calorie_range"][0]) & 
            (df['calories'] <= user_prefs["calorie_range"][1])]
    
    # Filter ingredients only if DataFrame is not empty
    if not df.empty:
        if user_prefs["preferred_ingredients"]:
            df = df[df['ingredient_names'].apply(
                lambda x: bool(set(x) & user_prefs["preferred_ingredients"])
            )]
        if user_prefs["disliked_ingredients"]:
            df = df[~df['ingredient_names'].apply(
                lambda x: bool(set(x) & user_prefs["disliked_ingredients"])
            )]
    
    return df.reset_index(drop=True)

def main():
    user_prefs = {
        "meal_types": ["breakfast", "lunch"],
        "preferred_ingredients": {"banana", "oats", "chicken"},
        "disliked_ingredients": {"peanuts", "milk"},
        "calorie_range": (200, 600)
    }
    
    try:
        df = load_data("../meal_planner/model_recipe.csv")
        filtered_df = filter_recipes(df, user_prefs)
        
        if filtered_df.empty:
            print("No recipes match your filters.")
            return
            
        # Preprocessing and further steps here
        
    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    main()

No recipes match your filters.


In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def load_data(filepath):
    """Load and preprocess the dataset."""
    df = pd.read_csv(filepath)
    df.columns = df.columns.str.strip()
    
    # Clean list-like columns
    list_columns = ['meal_type', 'ingredient_names', 'diet_prefrences']
    for col in list_columns:
        df[col] = df[col].apply(lambda x: eval(x) if isinstance(x, str) else [])
    
    return df

def flexible_filter(df, user_prefs, calorie_step=100, max_expansions=3):
    """Flexible filtering with fallback mechanisms."""
    original_df = df.copy()
    expansions = 0
    
    while expansions <= max_expansions:
        # Temporary calorie range expansion
        expanded_min = max(0, user_prefs["calorie_range"][0] - (expansions * calorie_step))
        expanded_max = user_prefs["calorie_range"][1] + (expansions * calorie_step)
        
        # Base filters
        filtered = df[
            (df['calories'] >= expanded_min) &
            (df['calories'] <= expanded_max) &
            (~df['ingredient_names'].apply(lambda x: any(ing in user_prefs["disliked_ingredients"] for ing in x)))
            ]
        
        # Meal type flexibility
        if not filtered.empty:
            filtered['meal_type_match'] = filtered['meal_type'].apply(
                lambda x: any(mt in user_prefs["meal_types"] for mt in x)
            )
            filtered = filtered.sort_values('meal_type_match', ascending=False)
        
        # Preferred ingredients scoring
        if user_prefs["preferred_ingredients"]:
            filtered['pref_score'] = filtered['ingredient_names'].apply(
                lambda x: len(set(x) & user_prefs["preferred_ingredients"])
            )
            filtered = filtered.sort_values('pref_score', ascending=False)
        
        if not filtered.empty:
            return filtered.head(10)  # Return top 10 matches
        
        expansions += 1
    
    # Fallback: Return best matches from original data
    fallback = original_df[
        ~original_df['ingredient_names'].apply(lambda x: any(ing in user_prefs["disliked_ingredients"] for ing in x))
    ]
    return fallback.sample(min(5, len(fallback)))  # Return random 5 safe options

def main():
    user_prefs = {
        "meal_types": ["breakfast", "lunch"],
        "preferred_ingredients": {"banana", "oats", "chicken"},
        "disliked_ingredients": {"peanuts", "milk"},
        "calorie_range": (200, 600)
    }
    
    try:
        df = load_data("../meal_planner/model_recipe.csv")
        results = flexible_filter(df, user_prefs)
        
        if not results.empty:
            print(f"Found {len(results)} matching recipes:")
            print(results[['title', 'calories', 'meal_type', 'ingredient_names']])
        else:
            print("No perfect matches found. Try adjusting your preferences.")
            
    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    main()

Found 10 matching recipes:
                               title  calories               meal_type  \
145              Ground Turkey Bowls     418.0         [dinner, lunch]   
140  Chicken Teriyaki Pineapple Bowl     329.0         [dinner, lunch]   
170   Philly Cheesesteak Sloppy Joes     385.0         [dinner, lunch]   
163         Grilled Teriyaki Chicken     209.0         [dinner, lunch]   
161                 Tuna Pasta Salad     218.0  [dinner, lunch, snack]   
159             Instant Pot Carnitas     296.0         [dinner, lunch]   
156   Brioche French Toast Casserole     275.0             [breakfast]   
152      Buffalo Ranch Chicken Salad     237.0                 [lunch]   
149         High Protein Pasta Salad     245.0  [dinner, lunch, snack]   
147               Acai Protein Bowls     280.0             [breakfast]   

                                      ingredient_names  
145  [lean ground turkey, chili powder, smoked papr...  
140  [boneless skinless chicken breast, garl

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['meal_type_match'] = filtered['meal_type'].apply(


In [12]:
def cluster_recipes(filtered_df):
    """Cluster recipes into meal categories without duplicates"""
    clusters = {
        'breakfast': [],
        'lunch': [],
        'dinner': [],
        'snack': []
    }

    # Create a copy to avoid SettingWithCopyWarning
    df = filtered_df.copy()
    
    # Explode meal_type lists into individual rows
    df['meal_type'] = df['meal_type'].apply(eval)  # Convert string lists to actual lists
    exploded_df = df.explode('meal_type')

    # Filter and group by valid meal types
    valid_meals = clusters.keys()
    grouped = exploded_df[exploded_df['meal_type'].isin(valid_meals)] \
        .groupby('meal_type', observed=True)

    # Build clusters with original recipe format
    for meal_type, group in grouped:
        clusters[meal_type] = group.drop_duplicates('title') \
                                  .sort_values('calories') \
                                  .to_dict('records')

    return clusters

# Usage
clustered_recipes = cluster_recipes(filterd_df)

# Pretty print results
for meal_type, recipes in clustered_recipes.items():
    print(f"\n=== {meal_type.upper()} ===")
    for recipe in recipes:
        print(f"\n{recipe['title']} ({recipe['calories']} cal)")
        print(f"Meal Types: {', '.join(eval(recipe['meal_type']))}")
        print(f"Ingredients: {', '.join(eval(recipe['ingredient_names']))}")

NameError: name 'filterd_df' is not defined

In [None]:
# PROMISSING
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def load_data(filepath):
    """Load and preprocess the dataset."""
    df = pd.read_csv(filepath)
    df.columns = df.columns.str.strip()
    
    # Clean list-like columns
    list_columns = ['meal_type', 'ingredient_names', 'diet_prefrences']
    for col in list_columns:
        df[col] = df[col].apply(lambda x: eval(x) if isinstance(x, str) else [])
    
    return df

def flexible_filter(df, user_prefs, calorie_step=100, max_expansions=3):
    """Flexible filtering with fallback mechanisms."""
    original_df = df.copy()
    expansions = 0
     
    while expansions <= max_expansions:
        # Temporary calorie range expansion
        expanded_min = max(0, user_prefs["calorie_range"][0] - (expansions * calorie_step))
        expanded_max = user_prefs["calorie_range"][1] + (expansions * calorie_step)
        
        # Base filters
        filtered = df[
            (df['calories'] >= expanded_min) &
            (df['calories'] <= expanded_max) &
            (~df['ingredient_names'].apply(lambda x: any(ing in user_prefs["disliked_ingredients"] for ing in x)))
        ]
        
        # Meal type flexibility
        if not filtered.empty:
            filtered['meal_type_match'] = filtered['meal_type'].apply(
                lambda x: any(mt in user_prefs["meal_types"] for mt in x)
            )
            filtered = filtered.sort_values('meal_type_match', ascending=False)
        
        # Preferred ingredients scoring
        if user_prefs["preferred_ingredients"]:
            filtered['pref_score'] = filtered['ingredient_names'].apply(
                lambda x: len(set(x) & user_prefs["preferred_ingredients"])
            )
            filtered = filtered.sort_values('pref_score', ascending=False)
        
        if not filtered.empty:
            return filtered.head(10)
        
        expansions += 1
    
    # Fallback
    fallback = original_df[
        ~original_df['ingredient_names'].apply(lambda x: any(ing in user_prefs["disliked_ingredients"] for ing in x))
    ]
    return fallback.sample(min(5, len(fallback)))

def cluster_recipes(filtered_df):
    """Cluster recipes into meal categories without duplicates"""
    clusters = {
        'breakfast': [],
        'lunch': [],
        'dinner': [],
        'snack': []
    }

    # Create a copy to avoid SettingWithCopyWarning
    df = filtered_df.copy()
    
    # Explode meal_type lists into individual rows
    exploded_df = df.explode('meal_type')

    # Filter and group by valid meal types
    valid_meals = clusters.keys()
    grouped = exploded_df[exploded_df['meal_type'].isin(valid_meals)] \
        .groupby('meal_type', observed=True)

    # Build clusters with original recipe format
    for meal_type, group in grouped:
        clusters[meal_type] = group.drop_duplicates('title') \
                                  .sort_values('calories') \
                                  .to_dict('records')

    return clusters

def main():
    user_prefs = {
        "meal_types": ["breakfast", "lunch"],
        "preferred_ingredients": {"banana", "oats", "chicken", "egg"},
        "disliked_ingredients": {"peanuts", "milk"},
        "calorie_range": (200, 600)
    }
    
    try:
        df = load_data("../meal_planner/model_recipe.csv")
        results = flexible_filter(df, user_prefs)
        
        if not results.empty:
            print(f"Found {len(results)} matching recipes:")
            print(results[['title', 'calories', 'meal_type', 'ingredient_names']])
            
            # Cluster the results
            clustered_recipes = cluster_recipes(results)
            
            # Print clustered results
            print("\n=== CLUSTERED RECIPES ===")
            for meal_type, recipes in clustered_recipes.items():
                print(f"\n=== {meal_type.upper()} ===")
                for recipe in recipes:
                    print(f"\n{recipe['title']} ({recipe['calories']} cal)")
                    print(f"Meal Types: {', '.join(recipe['meal_type'])}")
                    print(f"Ingredients: {', '.join(recipe['ingredient_names'])}")
        else:
            print("No perfect matches found. Try adjusting your preferences.")
            
    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    main()

Found 10 matching recipes:
                                 title  calories        meal_type  \
138  Air Fryer Buffalo Chicken Tenders     221.0  [dinner, lunch]   
236               Chocolate Baked Oats     302.0      [breakfast]   
210                Cajun Meatball Stew     286.0         [dinner]   
295                S’mores Cookie Bars     218.0       [desserts]   
151             Instant Pot Fried Rice     200.0  [dinner, snack]   
251             Peppermint Cheesecakes     202.0       [desserts]   
333             Applesauce Coffee Cake     212.0       [desserts]   
44                Thai Chicken Noodles     321.0         [dinner]   
152        Buffalo Ranch Chicken Salad     237.0          [lunch]   
156     Brioche French Toast Casserole     275.0      [breakfast]   

                                      ingredient_names  
138  [chicken breast tenders , Italian breadcrumbs ...  
236  [old fashioned rolled oats, cocoa powder, choc...  
210  [smoked paprika, salt, garlic powder,

In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer

def load_data(filepath):
    """Load and preprocess the dataset."""
    df = pd.read_csv(filepath)
    df.columns = df.columns.str.strip()

    # Convert list-like columns from string to list
    list_columns = ['meal_type', 'ingredient_names', 'diet_prefrences']
    for col in list_columns:
        df[col] = df[col].apply(lambda x: eval(x) if isinstance(x, str) else [])

    return df

def handle_missing_values(df):
    """Fill or drop missing values before clustering."""
    
    # Fill missing nutritional values with the column mean
    imputer = SimpleImputer(strategy='mean')
    df[['calories', 'protein', 'carbohydrates', 'fat']] = imputer.fit_transform(df[['calories', 'protein', 'carbohydrates', 'fat']])
    
    return df

def cluster_recipes_kmeans(df, n_clusters=4):
    """Cluster recipes using K-Means based on nutritional values."""
    
    # Handle missing values
    df = handle_missing_values(df)

    # Selecting features for clustering
    features = df[['calories', 'protein', 'carbohydrates', 'fat']].copy()

    # Standardize features for better clustering
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    # Apply K-Means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df['cluster'] = kmeans.fit_predict(features_scaled)

    # Organize clustered recipes
    clustered_recipes = {}
    for cluster in range(n_clusters):
        clustered_recipes[f"Cluster {cluster+1}"] = df[df['cluster'] == cluster].to_dict('records')

    return clustered_recipes

def main():
    user_prefs = {
        "meal_types": ["breakfast", "lunch"],
        "preferred_ingredients": {"banana", "oats", "chicken"},
        "disliked_ingredients": {"peanuts", "milk"},
        "calorie_range": (200, 600)
    }

    try:
        df = load_data("../meal_planner/model_recipe.csv")
        filtered_df = df.dropna(subset=['calories', 'protein', 'carbohydrates', 'fat'])  # Drop rows with missing values

        if not filtered_df.empty:
            print(f"Found {len(filtered_df)} matching recipes.")
            
            # Apply K-Means Clustering
            clustered_recipes = cluster_recipes_kmeans(filtered_df, n_clusters=4)

            # Display clusters
            print("\n=== CLUSTERED RECIPES ===")
            for cluster, recipes in clustered_recipes.items():
                print(f"\n=== {cluster} ===")
                for recipe in recipes[:5]:  # Show top 5 per cluster
                    print(f"\n{recipe['title']} ({recipe['calories']} cal)")
                    print(f"Ingredients: {', '.join(recipe['ingredient_names'])}")
                    print(f"Cluster: {recipe['cluster']}")
        else:
            print("No recipes found matching your criteria.")

    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    main()


Found 316 matching recipes.

=== CLUSTERED RECIPES ===

=== Cluster 1 ===

Blueberry Breakfast Sausage (98.0 cal)
Ingredients: pork sausage, fresh blueberries, dried thyme, olive oil
Cluster: 0

Greek Yogurt Buffalo Chicken Dip (146.0 cal)
Ingredients: boneless skinless chicken breast , chicken broth , cooked chicken, 1/3 less fat cream cheese, shredded cheddar cheese, , non-fat plain Greek yogurt , Frank's Red Hot Sauce, lite maple syrup , garlic powder, onion powder, green onions
Cluster: 0

Lemon Blondies (114.0 cal)
Ingredients: coconut oil, eggs, vanilla extract , lemon extract , salt , baking powder
Cluster: 0

White Bean Buffalo Dip (120.0 cal)
Ingredients: Great Northern White Beans , 1/3 Less Fat Cream Cheese, Franks Red Hot Sauce , Nonfat Fage Greek Yogurt , garlic, Part Skim Mozzarella Cheese, Cheddar Cheese, green onions 
Cluster: 0

Air Fryer Mini Tacos (148.0 cal)
Ingredients: chicken breast, Yellow Corn Tortillas, Mexican Cheese Blend, salt, onion powder, garlic powder, 

In [20]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def cluster_recipes_kmeans(filtered_df, n_clusters=4):
    """Cluster recipes using K-Means based on nutritional and ingredient features."""
    # Create preprocessing pipeline
    preprocessor = make_column_transformer(
        (TfidfVectorizer(analyzer=lambda x: x, max_features=100), 'ingredient_names'),
        (StandardScaler(), ['calories', 'protein', 'carbohydrates', 'fat']),
        remainder='drop'
    )
    
    # Create clustering pipeline
    cluster_pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('cluster', KMeans(n_clusters=n_clusters, random_state=42))
    ])
    
    # Fit and predict clusters
    filtered_df['cluster'] = cluster_pipe.fit_predict(filtered_df)
    
    # Analyze cluster characteristics
    cluster_profiles = filtered_df.groupby('cluster').agg({
        'calories': 'mean',
        'protein': 'mean',
        'carbohydrates': 'mean', 
        'fat': 'mean',
        'ingredient_names': lambda x: x.explode().value_counts().index[:3]
    }).rename(columns={'ingredient_names': 'common_ingredients'})
    
    return filtered_df, cluster_profiles

def describe_clusters(cluster_profiles):
    """Generate human-readable cluster descriptions."""
    descriptions = {}
    for idx, row in cluster_profiles.iterrows():
        main_nutrient = max(['protein', 'carbohydrates', 'fat'], key=lambda x: row[x])
        desc = (f"Cluster {idx}: {row['calories']:.0f} avg calories | "
                f"High in {main_nutrient} | "
                f"Common ingredients: {', '.join(row['common_ingredients'])}")
        descriptions[idx] = desc
    return descriptions

def main():
    user_prefs = {
        "meal_types": ["breakfast", "lunch"],
        "preferred_ingredients": {"banana", "oats", "chicken"},
        "disliked_ingredients": {"peanuts", "milk"},
        "calorie_range": (200, 600)
    }
    
    try:
        df = load_data("../meal_planner/model_recipe.csv")
        results = flexible_filter(df, user_prefs)
        
        if not results.empty:
            print(f"Found {len(results)} matching recipes:")
            print(results[['title', 'calories', 'meal_type', 'ingredient_names']])
            
            # Cluster using K-Means
            clustered_df, cluster_profiles = cluster_recipes_kmeans(results)
            cluster_descs = describe_clusters(cluster_profiles)
            
            print("\n=== DISCOVERED MEAL CLUSTERS ===")
            for cluster_id, desc in cluster_descs.items():
                print(f"\n{desc}")
                print("Example Recipes:")
                cluster_recipes = clustered_df[clustered_df['cluster'] == cluster_id]
                print(cluster_recipes[['title', 'calories']].to_string(index=False))
                
        else:
            print("No perfect matches found. Try adjusting your preferences.")
            
    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    main()

Found 20 matching recipes:
                                     title  calories        meal_type  \
2               Air Fryer Teriyaki Chicken     291.0         [dinner]   
3           Instant Pot Chicken Stroganoff     357.0         [dinner]   
4                              Cowboy Soup     260.0         [dinner]   
5           Pumpkin French Toast Casserole     237.0      [breakfast]   
11                       Big Mac Casserole     319.0         [dinner]   
13  Creamy Cajun Shrimp Pasta with Sausage     444.0         [dinner]   
14         Cracked Out Tater Tot Casserole     407.0  [dinner, lunch]   
16                      Cajun Salmon Pasta     428.0         [dinner]   
21                 Loaded Potato Casserole     303.0         [dinner]   
22                       Easy Spanish Rice     206.0          [snack]   
24                       Copycat KFC Bowls     374.0  [dinner, lunch]   
25                         Fried Spaghetti     376.0  [dinner, lunch]   
26                      