In [1]:
import json

import pandas as pd


def filter_ingredients(ingredients_string, desired, expected):
    # Convert string representation of list to an actual list
    try:
        ingredients_list = ast.literal_eval(ingredients_string)  # Use ast.literal_eval for safety
    except (ValueError, SyntaxError) as e:
        print(f"Error parsing ingredients: {ingredients_string}\nError: {e}")
        return False  # Return False if the format is incorrect

    # Ensure the recipe contains at least 5 ingredients
    if len(ingredients_list) < 5:
        return False

    # Convert each ingredient in the recipe list to lowercase for case-insensitive matching
    ingredients_list = [ingredient.lower().strip() for ingredient in ingredients_list]

    # Combine desired and expected ingredients
    combined_ingredients = expected | desired

    # Check if the ingredients in the recipe are a subset of the combined expected and desired ingredients
    is_subset = set(ingredients_list).issubset(combined_ingredients)

    # Ensure there is at least one ingredient from the desired set
    has_desired = any(ingredient in desired for ingredient in ingredients_list)

    # Return True if the recipe ingredients are a subset and contain at least one desired ingredient
    return is_subset and has_desired


# File paths
# file_path = '/mnt/home2/recipe_dataset/full_dataset.csv'
file_path = 'D:/Programovani/SU2/RecipeNLG_dataset.csv'
exp_ing_file = "expected_ingredients.json"
desired_ing_file = "desired_ingredients.json"

# Load data
try:
    df = pd.read_csv(file_path)
    print(f"Total recipes loaded: {len(df)}")
except FileNotFoundError:
    print(f"Error: The file {file_path} was not found.")
    df = pd.DataFrame()




Total recipes loaded: 2231142


In [2]:
import ast

# Load expected and desired ingredients
try:
    with open(exp_ing_file, "r") as file:
        expected_ing = json.load(file)
    expected_ing = set(map(str.lower, expected_ing))

    with open(desired_ing_file, "r") as file:
        desired_ing = json.load(file)
    desired_ing = set(map(str.lower, desired_ing))
except FileNotFoundError as e:
    print(f"Error loading ingredient files: {e}")
    expected_ing = set()
    desired_ing = set()

# Proceed if data and ingredients are loaded successfully
if not df.empty and expected_ing and desired_ing:
    # Apply the function and filter rows
    try:
        filtered_mask = df['NER'].apply(filter_ingredients, args=(desired_ing, expected_ing))
        filtered_recipes = df[filtered_mask].reset_index(drop=True)

        # Print the filtered recipes
        columns = ["directions", "ingredients", "NER"]
        filtered_recipes_subset = filtered_recipes[columns] if not filtered_recipes.empty else pd.DataFrame()

        print(filtered_recipes_subset.head())
        print(f"Number of filtered recipes: {len(filtered_recipes_subset)}")
    except KeyError as e:
        print(f"Key error: {e}. Please check if the 'ingredients' column is correctly named.")
else:
    print("Either the data or ingredient lists could not be loaded successfully.")



                                          directions  \
0  ["Mix 3 tablespoons fat with granulated sugar....   
1  ["Combine water, butter, sugar and salt in sau...   
2  ["Sift flour with baking powder, salt and suga...   
3  ["Chop or cut up in small pieces; let stand at...   
4  ["Combine sugar, eggs, evaporated milk and van...   

                                         ingredients  \
0  ["3 Tbsp. softened butter or margarine", "1/2 ...   
1  ["1 c. water", "1/2 c. butter", "1 tsp. sugar"...   
2  ["1 c. sifted all-purpose flour", "2 tsp. baki...   
3  ["1 head cabbage", "1 onion", "1 bell pepper",...   
4  ["6 eggs", "1 1/2 c. sugar", "1 can evaporated...   

                                                 NER  
0  ["butter", "sugar", "egg", "flour", "baking po...  
1      ["water", "butter", "sugar", "flour", "eggs"]  
2  ["flour", "baking powder", "salt", "sugar", "e...  
3  ["cabbage", "onion", "bell pepper", "tomato", ...  
4          ["eggs", "sugar", "milk", "milk", "milk"

In [5]:
# File path to save the filtered recipes
output_file_path = "filtered_recipes_petr.csv"

# Save the filtered DataFrame to a CSV file
filtered_recipes_subset.to_csv(output_file_path, index=False)

In [3]:
desired_ing1 =  {"chilli pepper", "eggplant", "lettuce", "ginger", "chicken", "onion", "garlic", "pasta", "eggs", "tomato", "potato", "rice", "egg"}

filtered_mask1 = filtered_recipes_subset['NER'].apply(filter_ingredients, args=(desired_ing1, expected_ing))
filtered_recipes1 = filtered_recipes_subset[filtered_mask1].reset_index(drop=True)
print(filtered_recipes1.head())
print(len(filtered_recipes1))

                                          directions  \
0  ["Mix 3 tablespoons fat with granulated sugar....   
1  ["Combine water, butter, sugar and salt in sau...   
2  ["Sift flour with baking powder, salt and suga...   
3  ["Combine sugar, eggs, evaporated milk and van...   
4  ["Preheat oven to 425\u00b0.", "Grease a 10-in...   

                                         ingredients  \
0  ["3 Tbsp. softened butter or margarine", "1/2 ...   
1  ["1 c. water", "1/2 c. butter", "1 tsp. sugar"...   
2  ["1 c. sifted all-purpose flour", "2 tsp. baki...   
3  ["6 eggs", "1 1/2 c. sugar", "1 can evaporated...   
4  ["2 eggs", "1/2 c. flour", "1/2 c. milk", "2 T...   

                                                 NER  
0  ["butter", "sugar", "egg", "flour", "baking po...  
1      ["water", "butter", "sugar", "flour", "eggs"]  
2  ["flour", "baking powder", "salt", "sugar", "e...  
3          ["eggs", "sugar", "milk", "milk", "milk"]  
4  ["eggs", "flour", "milk", "butter", "sugar", "..

Kód kde chceme, aby vyfiltrované recepty obsahovaly nějakou specifickou ingredienci. Stává se totiž, že se recepty dají poskládat jen z expected ingredinets.

In [9]:
 # Example of the specially chosen ingredient
special_ingredient = "chicken"

def contains_special_ingredient(ingredients_series, special_ingredient):
    try:
        # Parse the string representation of the list
        ingredients_list = ast.literal_eval(ingredients_series)
    except (ValueError, SyntaxError):
        return False  # Skip rows with malformed data

    # Ensure we work with lowercase and clean ingredient strings
    ingredients_list = [ingredient.lower().strip() for ingredient in ingredients_list]

    # Check if the special ingredient is present in any form
    return any(special_ingredient in ingredient for ingredient in ingredients_list)

# Apply the function and filter the DataFrame
special_ingredient_filtered_df = filtered_recipes1[filtered_recipes1['NER'].apply(contains_special_ingredient, special_ingredient=special_ingredient)]

# Reset the index for the new filtered DataFrame
special_ingredient_filtered_df = special_ingredient_filtered_df.reset_index(drop=True)
print(special_ingredient_filtered_df.head())

                                          directions  \
0  ["Cover chicken with", "water.", "Salt and pep...   
1  ["Saute onion in oil. Add rice; stir constantl...   
2  ["Combine soy sauce, honey, oil, garlic and Fi...   
3  ["Boil chicken with skin on until done when it...   
4  ["Combine butter and flour. Let cook until but...   

                                         ingredients  \
0  ["2 1/2 lb. chicken, cut up", "1 tsp. salt", "...   
1  ["2 Tbsp. oil", "1 1/2 c. rice (not instant)",...   
2  ["2/3 c. soy sauce", "1/2 c. honey", "2 Tbsp. ...   
3  ["chicken", "rice", "onion", "minced garlic", ...   
4  ["2 c. milk", "4 Tbsp. flour", "4 Tbsp. butter...   

                                                 NER  
0  ["chicken", "salt", "pepper", "flour", "milk",...  
1  ["oil", "rice", "onion", "tomato", "garlic", "...  
2  ["soy sauce", "honey", "vegetable oil", "garli...  
3     ["chicken", "rice", "onion", "garlic", "salt"]  
4  ["milk", "flour", "butter", "salt", "pepper", ..

Tady z vyfiltrovaných receptů vezmeme jen ty recepty, které obsahují aspoň 1/3 desired ingreds, ze stejného důvodu jako nad - recepty jdou poskládat jen z expected ingreds. Zde celekem problém, protože recepty co vyšly jsou naprosto random - jen jeden z nich působí normálně.

In [4]:
def contains_half_or_more_desired_ingredients(ingredients_series, desired_ing1):

    try:
        # Convert the string representation of the list to an actual list
        ingredients_list = ast.literal_eval(ingredients_series)
    except (ValueError, SyntaxError):
        return False  # Skip malformed rows

    # Convert to lowercase and clean up whitespace for consistent comparison
    ingredients_list = [ingredient.lower().strip() for ingredient in ingredients_list]

    # Count how many ingredients from the recipe match the desired ingredients
    matching_count = sum(1 for ingredient in ingredients_list if any(desired in ingredient for desired in desired_ing1))
    
    # Check if the count of matching ingredients is at least half the size of the desired ingredients
    return matching_count >= len(desired_ing1) *1/3

filtered_recipes2 = filtered_recipes1[
    filtered_recipes1['NER'].apply(contains_half_or_more_desired_ingredients, args=(desired_ing1,))
]

# Reset index for clean output
filtered_recipes2 = filtered_recipes2.reset_index(drop=True)

# Output the result
print(f"Number of recipes containing at least half of the desired ingredients: {len(filtered_recipes2)}")
print(filtered_recipes2.head())

Number of recipes containing at least half of the desired ingredients: 4
                                          directions  \
0  ["Saute onion in oil. Add rice; stir constantl...   
1  ["Once you decide on a pan, measure its total ...   
2  ["In microwave, cook chicken livers and fat un...   
3  ["Preheat oven to 400.", "Clean chicken and pu...   

                                         ingredients  \
0  ["2 Tbsp. oil", "1 1/2 c. rice (not instant)",...   
1  ["5 eggs", "1 1/2 c. milk", "1 1/2 c. flour", ...   
2  ["1 lb chicken liver", "3 tablespoons chicken ...   
3  ["8 chicken thighs", "2 cups chicken stock or ...   

                                                 NER  
0  ["oil", "rice", "onion", "tomato", "garlic", "...  
1  ["eggs", "milk", "flour", "butter", "eggs", "m...  
2  ["chicken", "chicken", "onion", "eggs", "salt"...  
3  ["chicken", "chicken", "salt", "pepper", "garl...  
