In [1]:
# Read in filtered_recipes_4.parquet
import pandas as pd
recipes = pd.read_parquet('filtered_recipes_4.parquet')

In [2]:
from tqdm.notebook import tqdm
import numpy as np
# Register tqdm with pandas so the progress_apply calls in later cells are available.
tqdm.pandas()

In [3]:
# Row and column counts of the loaded frame.
recipes.shape

(56811, 29)

In [4]:
# List the column names available in the loaded frame.
recipes.columns

Index(['RecipeId', 'Name', 'AuthorId', 'AuthorName', 'CookTime', 'PrepTime',
       'TotalTime', 'DatePublished', 'Description', 'Images', 'RecipeCategory',
       'Keywords', 'RecipeIngredientQuantities', 'RecipeIngredientParts',
       'AggregatedRating', 'ReviewCount', 'Calories', 'FatContent',
       'SaturatedFatContent', 'CholesterolContent', 'SodiumContent',
       'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent',
       'RecipeServings', 'RecipeYield', 'RecipeInstructions', 'AverageRating'],
      dtype='object')

In [5]:
# Preview the first rows of the loaded frame.
recipes.head()

Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions,AverageRating
2,40.0,Best Lemonade,1566,Stephen Little,PT5M,PT30M,PT35M,1999-09-05 19:52:00+00:00,This is from one of my first Good House Keepi...,[https://img.sndimg.com/food/image/upload/w_55...,...,0.0,1.8,81.5,0.4,77.2,0.3,4.0,,"[Into a 1 quart Jar with tight fitting lid, pu...",4.333333
6,44.0,Warm Chicken A La King,1596,Joan Edington,PT3M,PT35M,PT38M,1999-09-17 04:47:00+00:00,I copied this one out of a friend's book so ma...,[https://img.sndimg.com/food/image/upload/w_55...,...,405.8,557.2,29.1,3.1,5.0,45.3,2.0,,"[Melt 1 1/2 ozs butter, add the flour and cook...",4.545455
11,49.0,Chicken Breasts Lombardi,174711,Queen Dragon Mom,PT30M,PT45M,PT1H15M,1999-08-14 19:58:00+00:00,Make and share this Chicken Breasts Lombardi r...,[https://img.sndimg.com/food/image/upload/w_55...,...,203.0,848.9,13.7,0.6,2.0,57.9,6.0,,[Cook mushrooms in 2 tbsp butter in a large s...,4.35
16,54.0,Carrot Cake,1535,Marg CaymanDesigns,PT50M,PT45M,PT1H35M,1999-09-13 15:20:00+00:00,This is one of the few recipes my husband ever...,[https://img.sndimg.com/food/image/upload/w_55...,...,69.8,534.8,67.0,1.6,47.9,5.0,12.0,1 bundt,"[Beat together the eggs, oil, and white sugar....",4.235294
18,56.0,Buttermilk Pie,1581,thefensk,PT1H,PT20M,PT1H20M,1999-08-30 10:02:00+00:00,This recipe was originally noted by my wife on...,[https://img.sndimg.com/food/image/upload/w_55...,...,101.5,269.8,52.2,0.7,39.9,5.0,8.0,,"[Preheat oven to 400°F., Beat the butter and s...",4.238095


In [6]:
# Fill missing quantity entries with 0 (a non-array placeholder).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated in pandas 2.x and leaves `recipes` unchanged under Copy-on-Write.
recipes['RecipeIngredientQuantities'] = recipes['RecipeIngredientQuantities'].fillna(0)

In [7]:
# Pair each recipe's RecipeIngredientQuantities with its RecipeIngredientParts as a two-element list [quantities, parts].
# The "<quantity> <ingredient name>" strings themselves are built from this pair in a later cell (combine_ingredients).
recipes['Ingredients'] = recipes.progress_apply(lambda x: [x['RecipeIngredientQuantities'], x['RecipeIngredientParts']], axis=1)

  0%|          | 0/56811 [00:00<?, ?it/s]

In [8]:
# Inspect the paired [quantities, parts] lists for the first few recipes.
recipes['Ingredients'].head()

2     [[1 1⁄2, 1, None, 1 1⁄2, None, 3⁄4], [sugar, l...
6     [[12, 2, 3, 450, 1, 2, 1⁄4, 1, None, None, 2, ...
11    [[2, 2, 12, 1⁄2, 1⁄3, 3⁄4, 1⁄2, 1⁄2, 1⁄2, 1⁄2,...
16    [[3, 4, 1 1⁄4, 2, 2, 2, 2, 1, 1, 1⁄2, 1, 1 1⁄4...
18    [[1⁄2, 1 1⁄2, 3, 3, 1, 1, 1, 1], [butter, marg...
Name: Ingredients, dtype: object

In [9]:
# Enable tqdm in pandas (repeats the tqdm.pandas() call made in an earlier cell)
tqdm.pandas()

# Function to combine quantities and ingredient names into a single string
def combine_ingredients(row):
    """Join paired quantities and ingredient names into one comma-separated string.

    Parameters
    ----------
    row : sequence of length 2
        ``[quantities, ingredients]``. Both entries are expected to be
        ``numpy.ndarray`` objects whose elements line up position by position.

    Returns
    -------
    str or float
        ``"<quantity> <ingredient>, <quantity> <ingredient>, ..."`` for every
        position where BOTH the quantity and the ingredient are present.
        ``np.nan`` when either entry is not an ndarray (for example the ``0``
        placeholder written for missing quantities).

    Notes
    -----
    ``zip`` stops at the shorter of the two arrays, so trailing unmatched
    entries are ignored.
    """
    quantities, ingredients = row
    if not (isinstance(quantities, np.ndarray) and isinstance(ingredients, np.ndarray)):
        return np.nan

    # Skip a pair if either side is missing. The previous condition
    # (`num is None and ing is not None`) let "2 None" and "None None"
    # entries leak into the output.
    return ', '.join(
        f"{quantity} {ingredient}"
        for quantity, ingredient in zip(quantities, ingredients)
        if quantity is not None and ingredient is not None
    )

# Build the 'Combined Ingredients' column by applying combine_ingredients to every
# [quantities, parts] pair, with a tqdm progress bar
recipes['Combined Ingredients'] = recipes['Ingredients'].progress_apply(combine_ingredients)

# Print the first few combined strings as a quick sanity check
print(recipes['Combined Ingredients'].head())

  0%|          | 0/56811 [00:00<?, ?it/s]

2     1 1⁄2 sugar, 1 lemons, rind of, 1 1⁄2 fresh water
6     12 chicken, 2 butter, 3 flour, 450 milk, 1 cel...
11    2 fresh mushrooms, 2 butter, 12 boneless skinl...
16    3 carrots, 4 eggs, 1 1⁄4 white sugar, 2 all-pu...
18    1⁄2 butter, 1 1⁄2 margarine, 3 sugar, 3 flour,...
Name: Combined Ingredients, dtype: object


In [10]:
# Display the combined ingredient strings (pandas abbreviates the long Series).
recipes['Combined Ingredients']

2         1 1⁄2 sugar, 1 lemons, rind of, 1 1⁄2 fresh water
6         12 chicken, 2 butter, 3 flour, 450 milk, 1 cel...
11        2 fresh mushrooms, 2 butter, 12 boneless skinl...
16        3 carrots, 4 eggs, 1 1⁄4 white sugar, 2 all-pu...
18        1⁄2 butter, 1 1⁄2 margarine, 3 sugar, 3 flour,...
                                ...                        
520270    1 1⁄2 boneless skinless chicken breasts, 2 oli...
520291    1 1⁄2 lean ground beef, 5 garlic cloves, 1  - ...
520386    1 1⁄2 yukon gold potatoes, 1 salt, 1⁄2 lime, 1...
520411               6 potatoes, 2 butter, 1 onion, 4 bacon
520475                                             1 orange
Name: Combined Ingredients, Length: 56811, dtype: object

In [11]:
# Sanity check: True only when every 'Combined Ingredients' entry is a str
# (combine_ingredients returns NaN, a float, for rows it cannot build).
recipes['Combined Ingredients'].apply(lambda value: isinstance(value, str)).all()

True

In [12]:
# Build one text blob per recipe from the Name, Description, Ingredients,
# Instructions, and ratings columns; meant to be used with .apply(axis=1).
def combine_all_text_full(row):
    """Return the full text blob for one recipe row, or NaN if a field is unusable.

    Output format:
    "Name: <name> Description: <description> Ingredients: <ingredients>
    Instructions: <instructions> Ratings: <ratings>" (all on one line).

    The row must provide 'Name', 'Description' and 'Combined Ingredients' as
    str, 'RecipeInstructions' as a numpy array of strings, and 'AverageRating'
    as a float. If any field has a different type, np.nan is returned.
    """
    title = row.Name
    blurb = row['Description']
    ingredient_text = row['Combined Ingredients']
    steps = row['RecipeInstructions']
    rating = row['AverageRating']

    # Every field must have the expected type, otherwise the row is rejected.
    all_fields_usable = (
        isinstance(title, str)
        and isinstance(blurb, str)
        and isinstance(ingredient_text, str)
        and isinstance(steps, np.ndarray)
        and isinstance(rating, float)
    )
    if not all_fields_usable:
        return np.nan

    # Each piece carries its own label; instruction steps are comma-joined
    # and the rating is rendered with str().
    pieces = [
        'Name: ' + title,
        ' Description: ' + blurb,
        ' Ingredients: ' + ingredient_text,
        ' Instructions: ' + ', '.join(steps),
        ' Ratings: ' + str(rating),
    ]
    return ''.join(pieces)
# Build the 'all_text_full' column row by row (axis=1), with a tqdm progress bar.
recipes['all_text_full'] = recipes.progress_apply(combine_all_text_full, axis=1)

  0%|          | 0/56811 [00:00<?, ?it/s]

In [13]:
# Build the short text for a recipe from the Name and Description columns only,
# in the format "Name: <name> Description: <description>"; meant to be used with .apply(axis=1).
def combine_all_text(row):
    """Return "Name: <name> Description: <description>" for one recipe row.

    Only the name and description appear in the output, but all five fields
    ('Name', 'Description', 'Combined Ingredients', 'RecipeInstructions',
    'AverageRating') are still type-checked. This keeps the NaN rows identical
    to those of combine_all_text_full, which applies the same check.

    Returns np.nan when 'Name', 'Description' or 'Combined Ingredients' is not
    a str, 'RecipeInstructions' is not a numpy array, or 'AverageRating' is
    not a float.
    """
    name = row['Name']
    description = row['Description']
    ingredients = row['Combined Ingredients']
    instructions = row['RecipeInstructions']
    ratings = row['AverageRating']

    fields_usable = (
        isinstance(name, str)
        and isinstance(description, str)
        and isinstance(ingredients, str)
        and isinstance(instructions, np.ndarray)
        and isinstance(ratings, float)
    )
    if not fields_usable:
        return np.nan

    # The instructions and rating are deliberately not joined or stringified
    # here: they are not part of the output, and doing so could raise on
    # fields this function never emits.
    return 'Name: ' + name + ' Description: ' + description
# Build the 'all_text' column row by row (axis=1), with a tqdm progress bar.
recipes['all_text'] = recipes.progress_apply(combine_all_text, axis=1)

  0%|          | 0/56811 [00:00<?, ?it/s]

In [14]:
# Display the short text column (pandas abbreviates the long Series).
recipes['all_text']

2         Name: Best Lemonade Description: This is from ...
6         Name: Warm Chicken A La King Description: I co...
11        Name: Chicken Breasts Lombardi Description: Ma...
16        Name: Carrot Cake Description: This is one of ...
18        Name: Buttermilk Pie Description: This recipe ...
                                ...                        
520270    Name: Creamy Tuscan Garlic Chicken Description...
520291    Name: Nif's Easy Korean Beef and Rice Descript...
520386    Name: Cuban Mojo Potatoes Description: Make an...
520411    Name: Dutch Fried Potatoes (Gebakken Aardappel...
520475    Name: Cinnamony Sweet Moroccan Orange Salad De...
Name: all_text, Length: 56811, dtype: object

In [15]:
# Flag the rows whose 'all_text' value is a real string. combine_all_text
# returns NaN for rows it cannot build, so those rows are flagged False.
# The mask is computed once and reused for both the check and the drop.
has_text = recipes['all_text'].apply(lambda value: isinstance(value, str)).astype(bool)

# True only when every row produced a string; False means some rows will be dropped.
print(has_text.all())

# Drop the rows without usable text, in place, so later cells only see string values.
recipes.drop(recipes.index[~has_text], inplace=True)

False


In [16]:
# Save the recipes frame, including the derived text columns, as a parquet file
recipes.to_parquet('recipes_with_all_text.parquet')

In [17]:
# Keep only the two derived text columns in their own frame.
just_all_text = pd.DataFrame(recipes[['all_text', 'all_text_full']])

In [18]:
# Write the two text columns to all_text.parquet.
just_all_text.to_parquet('all_text.parquet')

In [19]:
# Preview the first ten rows of the two text columns.
just_all_text.head(10)

Unnamed: 0,all_text,all_text_full
2,Name: Best Lemonade Description: This is from ...,Name: Best Lemonade Description: This is from ...
6,Name: Warm Chicken A La King Description: I co...,Name: Warm Chicken A La King Description: I co...
11,Name: Chicken Breasts Lombardi Description: Ma...,Name: Chicken Breasts Lombardi Description: Ma...
16,Name: Carrot Cake Description: This is one of ...,Name: Carrot Cake Description: This is one of ...
18,Name: Buttermilk Pie Description: This recipe ...,Name: Buttermilk Pie Description: This recipe ...
20,Name: Low-Fat Burgundy Beef & Vegetable Stew D...,Name: Low-Fat Burgundy Beef & Vegetable Stew D...
24,"Name: Black Bean, Corn, and Tomato Salad Descr...","Name: Black Bean, Corn, and Tomato Salad Descr..."
27,Name: Black Coffee Barbecue Sauce Description:...,Name: Black Coffee Barbecue Sauce Description:...
32,Name: Chicken and Dumplings Description: Make ...,Name: Chicken and Dumplings Description: Make ...
33,Name: Brownie Pudding Description: Make and sh...,Name: Brownie Pudding Description: Make and sh...


In [20]:
# Read all_text.parquet (written by an earlier cell) back into just_all_text
# NOTE(review): pandas is already imported in the first cell; this re-import presumably
# lets the notebook be resumed from here in a fresh kernel — confirm.
import pandas as pd
just_all_text = pd.read_parquet('all_text.parquet')

In [21]:
# Remove every row that has a NaN in any column, modifying just_all_text in place
just_all_text.dropna(inplace=True)

In [22]:
# Overwrite all_text.parquet with the NaN-free just_all_text frame
just_all_text.to_parquet('all_text.parquet')