In [2]:
import pandas as pd
import numpy as np
import json
import os

In [6]:
file_path = r'\Data\allrecipes-complete-recipes-list-by-dmitriy-zub.json'
directory = os.getcwd()
# go into data folder and find the json file
json_file_path = directory+file_path
print(json_file_path)
recipes = pd.read_json(json_file_path, orient='records')

C:\Users\Stephanie\Documents\School\8_Fall_2023\5_Natural_Language_Processing\assignments\NLP_Recipe_Guide\Data\allrecipes-complete-recipes-list-by-dmitriy-zub.json


In [7]:
recipes.head(2)

Unnamed: 0,state,basic_info,prep_data,ingridients,nutritions
0,Texas,"{'title': 'Slow Cooker Texas Pulled Pork', 'ca...","{'prep_time:': '15 mins', 'cook_time:': '5 hrs...","[1 teaspoon vegetable oil, 1 (4 pound) pork sh...","{'calories': '528', 'fat': '23g', 'carbs': '46..."
1,Texas,"{'title': 'Brazilian Grilled Pineapple', 'cate...","{'prep_time:': '10 mins', 'cook_time:': '10 mi...","[1 cup brown sugar, 2 teaspoons ground cinnamo...","{'calories': '255', 'fat': '0g', 'carbs': '66g..."


In [8]:
def extract_info(row):
    try:
        # Extract 'prep_time', 'cook_time', 'total_time', 'servings', and 'yield' dynamically
        prep_time_key = next((key for key in row['prep_data'] if 'prep_time' in key), None)
        cook_time_key = next((key for key in row['prep_data'] if 'cook_time' in key), None)
        total_time_key = next((key for key in row['prep_data'] if 'total_time' in key), None)
        servings_key = next((key for key in row['prep_data'] if 'servings' in key), None)

        # Extract nutrition information if available        
        calories = row['nutritions'].get('calories')
        fat = row['nutritions'].get('fat')
        carbs = row['nutritions'].get('carbs')
        protein = row['nutritions'].get('protein')

        return pd.Series({
            'state': row['state'],
            'title': row['basic_info']['title'],
            'ingredients': row['ingridients'],
            'category': row['basic_info']['category'],
            'rating': row['basic_info']['rating'], 
            'reviews': row['basic_info']['reviews'], 
            'recipe creator': row['basic_info']['recipe_by'],
            'prep_time': row['prep_data'][prep_time_key] if prep_time_key else None,
            'cook_time': row['prep_data'][cook_time_key] if cook_time_key else None,
            'total_time': row['prep_data'][total_time_key] if total_time_key else None,
            'servings': row['prep_data'][servings_key] if servings_key else None,
            'calories': calories,
            'fat': fat,
            'carbs': carbs,
            'protein': protein
        })
    except Exception as e:
        print("Error:", e)
        return pd.Series({})

# Apply the function to each row of the DataFrame
extracted_data = recipes.apply(extract_info, axis=1)

# Display the extracted data
extracted_data.head()

Unnamed: 0,state,title,ingredients,category,rating,reviews,recipe creator,prep_time,cook_time,total_time,servings,calories,fat,carbs,protein
0,Texas,Slow Cooker Texas Pulled Pork,"[1 teaspoon vegetable oil, 1 (4 pound) pork sh...",Main Dishes,\n4.5,"\n2,214 Reviews",cmccreight,15 mins,5 hrs,5 hrs 15 mins,8,528,23g,46g,32g
1,Texas,Brazilian Grilled Pineapple,"[1 cup brown sugar, 2 teaspoons ground cinnamo...",Side Dish,\n4.9,\n68 Reviews,SoccerNut,10 mins,10 mins,20 mins,6,255,0g,66g,1g
2,Texas,Cowboy Caviar,"[1 (15.5 ounce) can black beans, drained, 1 (1...",Appetizers and Snacks,\n4.7,\n193 Reviews,Cooknik,15 mins,,35 mins,8,233,9g,32g,8g
3,Texas,Soul Smothered Chicken,"[½ cup butter, 1 whole chicken, cut into piece...",Meat and Poultry,\n4.7,\n375 Reviews,Veronica Rockett,15 mins,1 hrs,1 hrs 15 mins,8,372,23g,22g,19g
4,Texas,Slow Cooker Texas Smoked Beef Brisket,"[3 tablespoons smoked paprika, 2 tablespoons g...",Main Dishes,\n4.3,\n81 Reviews,Sandy Clark Gerhardt,10 mins,6 hrs,6 hrs 50 mins,4,342,16g,29g,22g


In [9]:
# Removing \n from the rating and reviews column data
extracted_data['rating'] = extracted_data['rating'].apply(lambda x: x.replace('\n', '') if x else None)
extracted_data['reviews'] = extracted_data['reviews'].apply(lambda x: x.replace('\n', '') if x else None)

In [10]:
# changing review count to be the number rather than string (from 12 Recipes to 12)
extracted_data['reviews'] = extracted_data['reviews'].str.replace(' Reviews', '')
extracted_data = extracted_data.rename(columns={'reviews': 'review_count'})
extracted_data.head(2)

Unnamed: 0,state,title,ingredients,category,rating,review_count,recipe creator,prep_time,cook_time,total_time,servings,calories,fat,carbs,protein
0,Texas,Slow Cooker Texas Pulled Pork,"[1 teaspoon vegetable oil, 1 (4 pound) pork sh...",Main Dishes,4.5,2214,cmccreight,15 mins,5 hrs,5 hrs 15 mins,8,528,23g,46g,32g
1,Texas,Brazilian Grilled Pineapple,"[1 cup brown sugar, 2 teaspoons ground cinnamo...",Side Dish,4.9,68,SoccerNut,10 mins,10 mins,20 mins,6,255,0g,66g,1g


In [11]:
# changing variables to numeric
extracted_data['rating'] = pd.to_numeric(extracted_data['rating'], errors='coerce')
extracted_data['review_count'] = pd.to_numeric(extracted_data['review_count'], errors='coerce')
extracted_data['servings'] = pd.to_numeric(extracted_data['servings'], errors='coerce')
extracted_data['calories'] = pd.to_numeric(extracted_data['calories'], errors='coerce')
extracted_data.head(2)

Unnamed: 0,state,title,ingredients,category,rating,review_count,recipe creator,prep_time,cook_time,total_time,servings,calories,fat,carbs,protein
0,Texas,Slow Cooker Texas Pulled Pork,"[1 teaspoon vegetable oil, 1 (4 pound) pork sh...",Main Dishes,4.5,,cmccreight,15 mins,5 hrs,5 hrs 15 mins,8.0,528.0,23g,46g,32g
1,Texas,Brazilian Grilled Pineapple,"[1 cup brown sugar, 2 teaspoons ground cinnamo...",Side Dish,4.9,68.0,SoccerNut,10 mins,10 mins,20 mins,6.0,255.0,0g,66g,1g


In [12]:
# removing all recipes with null values
no_null = extracted_data.dropna()
no_null_df = no_null.reset_index(drop=True)
no_null_df.head(2)

Unnamed: 0,state,title,ingredients,category,rating,review_count,recipe creator,prep_time,cook_time,total_time,servings,calories,fat,carbs,protein
0,Texas,Brazilian Grilled Pineapple,"[1 cup brown sugar, 2 teaspoons ground cinnamo...",Side Dish,4.9,68.0,SoccerNut,10 mins,10 mins,20 mins,6.0,255.0,0g,66g,1g
1,Texas,Soul Smothered Chicken,"[½ cup butter, 1 whole chicken, cut into piece...",Meat and Poultry,4.7,375.0,Veronica Rockett,15 mins,1 hrs,1 hrs 15 mins,8.0,372.0,23g,22g,19g


In [13]:
no_null_df['ingredients'].iloc[0]

['1 cup brown sugar',
 '2 teaspoons ground cinnamon',
 '1 pineapple - peeled, cored, and cut into 6 wedges']

In [14]:
def clean_ingredient(ingredient):
    # This function removes numbers and extra spaces from the ingredient
    return ' '.join([part for part in ingredient.split() if not part.isdigit()])

def find_recipes_with_ingredients(ingredients_to_search, dataframe):
    # Clean the ingredients in the dataframe
    no_null_df['cleaned_ingredients'] = no_null_df['ingredients'].apply(lambda x: [clean_ingredient(ingredient) for ingredient in x])

    matching_recipes = no_null_df[no_null_df['cleaned_ingredients'].apply(
        lambda x: all(ingredient.lower() in ' '.join(x).lower() for ingredient in ingredients_to_search)
    )]

    return matching_recipes


In [19]:
ingredients_to_search = ['carrot']

In [20]:
matching_recipes = find_recipes_with_ingredients(ingredients_to_search, no_null_df)

In [21]:
matching_recipes

Unnamed: 0,state,title,ingredients,category,rating,review_count,recipe creator,prep_time,cook_time,total_time,servings,calories,fat,carbs,protein,cleaned_ingredients
1,Texas,Soul Smothered Chicken,"[½ cup butter, 1 whole chicken, cut into piece...",Meat and Poultry,4.7,375.0,Veronica Rockett,15 mins,1 hrs,1 hrs 15 mins,8.0,372.0,23g,22g,19g,"[½ cup butter, whole chicken, cut into pieces,..."
77,Colorado,Colorado Mexican Rice,"[2 cups water, 1 cup uncooked white rice, 4 me...",Side Dish,3.9,15.0,ivette,20 mins,20 mins,40 mins,6.0,359.0,18g,40g,9g,"[cups water, cup uncooked white rice, medium t..."
82,Colorado,Rocky Mountain Stew,"[2 tablespoons vegetable oil, 2 pounds sirloin...","Soups, Stews and Chili Recipes",4.6,37.0,Debbie F.,15 mins,3 hrs,3 hrs 15 mins,10.0,281.0,11g,26g,17g,"[tablespoons vegetable oil, pounds sirloin tip..."
97,Arizona,Arizona Roadhouse Chili,"[1 pound dried pinto beans, 1 pound 90%-lean g...","Soups, Stews and Chili Recipes",3.8,3.0,Karen Barris Calabro,20 mins,4 hrs 10 mins,12 hrs 30 mins,8.0,393.0,8g,53g,27g,"[pound dried pinto beans, pound 90%-lean groun..."
101,North Carolina,Hatteras Style Clam Chowder,"[1 tablespoon vegetable oil, 1 large onion, ch...","Soups, Stews and Chili Recipes",4.4,35.0,O. Romaine,20 mins,30 mins,50 mins,8.0,265.0,4g,24g,32g,"[tablespoon vegetable oil, large onion, choppe..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1286,Nevada,Ox Roast,"[½ cup chopped onion, ½ cup chopped celery, wi...",Main Dishes,4.7,9.0,Jon,20 mins,3 hrs,3 hrs 20 mins,30.0,229.0,10g,1g,31g,"[½ cup chopped onion, ½ cup chopped celery, wi..."
1288,Nevada,Harvest Beef Stew,"[4 tablespoons bacon drippings, ¼ cup flour, S...","Soups, Stews and Chili Recipes",4.2,10.0,CULINARYJEN,30 mins,1 hrs 30 mins,2 hrs,6.0,918.0,53g,44g,56g,"[tablespoons bacon drippings, ¼ cup flour, Sal..."
1316,Hawaii,Hawaiian Macaroni Salad,"[6 cups elbow macaroni, 2 cups mayonnaise, or ...",Salad,3.5,2.0,Jessica Daulton,10 mins,15 mins,2 hrs 25 mins,12.0,465.0,30g,41g,8g,"[cups elbow macaroni, cups mayonnaise, or more..."
1319,Hawaii,Hawaiian Bruddah Potato Mac (Macaroni) Salad,"[5 eggs, 7 large potatoes, peeled and cubed, 1...",Salad,4.7,39.0,Tuihalalalala,30 mins,20 mins,8 hrs 50 mins,20.0,387.0,28g,30g,6g,"[eggs, large potatoes, peeled and cubed, cup e..."


In [None]:
def find_recipes_with_ingredients(ingredients_to_search, dataframe):
    # Clean the ingredients in the dataframe
    dataframe['cleaned_ingredients'] = dataframe['ingredients'].apply(lambda x: [clean_ingredient(ingredient) for ingredient in x])

    matching_recipes = dataframe[dataframe['cleaned_ingredients'].apply(
        lambda x: all(ingredient.lower() in ' '.join(x).lower() for ingredient in ingredients_to_search)
    )]

    return matching_recipes

In [51]:
def find_recipes_with_ingredients(ingredients_to_search, dataframe):
    matching_recipes = dataframe[dataframe['ingredients'].apply(
        lambda x: all(ingredient.lower() in ' '.join(x).lower() for ingredient in ingredients_to_search))]
    return matching_recipes


In [52]:
ingredients_to_search = ['brown sugar', 'ground cinnamon']
matching_recipes = find_recipes_with_ingredients(ingredients_to_search, no_null_df)

In [43]:
if matching_recipes:
    print(f"Recipes containing {', '.join(input_ingredients)}: {', '.join(matching_recipes)}")
else:
    print(f"No recipes found with {', '.join(input_ingredients)}")

No recipes found with brown sugar, ground cinnamon
