Read the JSON files and convert them to CSV

In [None]:
import zipfile
import pandas as pd
import json
import os

In [None]:

# Unzip the raw recipe archive into a working directory.
zip_path = '/content/recipes_raw.zip'
extract_path = '/content/recipes_raw'
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Convert each extracted JSON file to a CSV with the same base name.
# sorted(): os.listdir returns entries in arbitrary order, so sort to keep
# the processing order (and the log below) stable between runs.
for filename in sorted(os.listdir(extract_path)):
    if not filename.endswith('.json'):
        continue

    json_path = os.path.join(extract_path, filename)
    # Be explicit about the encoding instead of relying on the platform default.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Assuming JSON structure is a dict of recipes: only the values (the
    # recipe records) are kept, the keys are discarded.
    df = pd.DataFrame(list(data.values()))

    # splitext swaps only the real extension; str.replace would also rewrite
    # any other '.json' occurring inside the file name.
    csv_path = os.path.join(extract_path, os.path.splitext(filename)[0] + '.csv')
    df.to_csv(csv_path, index=False)
    print(f"Converted {filename} to {csv_path}")


Converted recipes_raw_nosource_epi.json to /content/recipes_raw/recipes_raw_nosource_epi.csv
Converted recipes_raw_nosource_fn.json to /content/recipes_raw/recipes_raw_nosource_fn.csv
Converted recipes_raw_nosource_ar.json to /content/recipes_raw/recipes_raw_nosource_ar.csv


Read data from each CSV and drop unnecessary data

In [None]:
# List the column names of every CSV in the working folder.
csv_folder = '/content/recipes_raw'
for file in sorted(os.listdir(csv_folder)):
    if file.endswith('.csv'):
        path = os.path.join(csv_folder, file)
        # nrows=0 parses only the header row; the column names are all that
        # is needed here, so there is no reason to load the full file.
        header_df = pd.read_csv(path, nrows=0)
        print(f"\nColumns in {file}:")
        print(header_df.columns.tolist())  # Shows all column names



Columns in recipes_raw_nosource_epi.csv:
['ingredients', 'picture_link', 'instructions', 'title']

Columns in recipes_raw_nosource_fn.csv:
['instructions', 'ingredients', 'title', 'picture_link']

Columns in recipes_raw_nosource_ar.csv:
['title', 'ingredients', 'instructions', 'picture_link']


In [None]:
cols_to_drop = ['picture_link']

# Prefixes of the files this notebook itself writes into csv_folder
# ("cleaned_*", "final_*", "all_recipes_combined*"). Skipping them keeps the
# cell idempotent: re-running it no longer produces "cleaned_cleaned_*" files.
generated_prefixes = ('cleaned_', 'final_', 'all_recipes_')

for file in sorted(os.listdir(csv_folder)):
    if not file.endswith('.csv') or file.startswith(generated_prefixes):
        continue

    path = os.path.join(csv_folder, file)
    df = pd.read_csv(path)

    # errors='ignore' tolerates files that do not have the column at all.
    df = df.drop(columns=cols_to_drop, errors='ignore')

    # Save to new cleaned file
    new_path = os.path.join(csv_folder, f"cleaned_{file}")
    df.to_csv(new_path, index=False)
    print(f"Saved cleaned file to {new_path}")

Saved cleaned file to /content/recipes_raw/cleaned_recipes_raw_nosource_epi.csv
Saved cleaned file to /content/recipes_raw/cleaned_recipes_raw_nosource_fn.csv
Saved cleaned file to /content/recipes_raw/cleaned_recipes_raw_nosource_ar.csv


Check whether any CSV file has null values in any column

In [None]:
# Report, for every "cleaned_*.csv", which columns still hold missing values.
for file in os.listdir(csv_folder):
    is_cleaned_csv = file.startswith('cleaned_') and file.endswith('.csv')
    if not is_cleaned_csv:
        continue

    path = os.path.join(csv_folder, file)
    df = pd.read_csv(path)
    null_counts = df.isna().sum()

    if not null_counts.any():
        print(f"\n{file} has no null values.")
        continue

    # Show only the columns that actually contain nulls.
    print(f"\n{file} has null values:")
    print(null_counts[null_counts > 0])


cleaned_recipes_raw_nosource_fn.csv has null values:
instructions    411
ingredients     237
title           289
dtype: int64

cleaned_recipes_raw_nosource_ar.csv has null values:
title           280
ingredients     280
instructions    280
dtype: int64

cleaned_recipes_raw_nosource_epi.csv has null values:
instructions    23
dtype: int64


Drop the rows from each CSV that have missing data in any column

In [None]:
# For each "cleaned_*.csv", keep only the complete rows and write the result
# next to it as "final_<original name>".
cleaned_files = [
    name for name in os.listdir(csv_folder)
    if name.startswith('cleaned_') and name.endswith('.csv')
]

for file in cleaned_files:
    path = os.path.join(csv_folder, file)
    df = pd.read_csv(path)

    # A row is removed as soon as any of its columns is NaN.
    df_cleaned = df.dropna()

    new_path = os.path.join(csv_folder, "final_" + file)
    df_cleaned.to_csv(new_path, index=False)
    print(f"Dropped rows with NaNs and saved to {new_path}")

Dropped rows with NaNs and saved to /content/recipes_raw/final_cleaned_recipes_raw_nosource_fn.csv
Dropped rows with NaNs and saved to /content/recipes_raw/final_cleaned_recipes_raw_nosource_ar.csv
Dropped rows with NaNs and saved to /content/recipes_raw/final_cleaned_recipes_raw_nosource_epi.csv


Combining all 3 csvs to process the whole data

In [None]:
# Collect every "final_*.csv" written by the NaN-dropping step.
final_dfs = []

# sorted(): os.listdir returns names in arbitrary order, which would make the
# row order of the combined file differ from run to run.
for file in sorted(os.listdir(csv_folder)):
    if file.startswith('final_') and file.endswith('.csv'):
        path = os.path.join(csv_folder, file)
        df = pd.read_csv(path)
        final_dfs.append(df)

# Concatenate all DataFrames; ignore_index gives the result one fresh 0..n-1 index.
combined_df = pd.concat(final_dfs, ignore_index=True)

# Save combined file
combined_df.to_csv(os.path.join(csv_folder, 'all_recipes_combined.csv'), index=False)
print("Combined all CSVs into all_recipes_combined.csv")

Combined all CSVs into all_recipes_combined.csv


Check for duplicate or NA data

In [None]:
df = pd.read_csv('/content/recipes_raw/all_recipes_combined.csv')

# Print the count explicitly: a bare expression is only displayed when it is
# the last statement of a cell, so the original check was silently discarded.
print(f"Duplicate rows found: {df.duplicated().sum()}")

# Keep the first occurrence of each fully identical row.
df_cleaned = df.drop_duplicates()

# Save the cleaned version without duplicates
df_cleaned.to_csv('/content/recipes_raw/all_recipes_combined_cleaned.csv', index=False)
print("Removed duplicates and saved to all_recipes_combined_cleaned.csv")

Removed duplicates and saved to all_recipes_combined_cleaned.csv


check for null data

In [None]:
df = pd.read_csv('/content/recipes_raw/all_recipes_combined_cleaned.csv')
df.isna().sum()

Unnamed: 0,0
instructions,0
ingredients,0
title,0


Add a cuisine column to the above data using a pre-trained model (loaded via transformers)

Loading the data set

In [None]:
df = pd.read_csv('/content/recipes_raw/all_recipes_combined_cleaned.csv')
df.head()

Unnamed: 0,instructions,ingredients,title
0,Toss ingredients lightly and spoon into a butt...,"['1/2 cup celery, finely chopped', '1 small gr...",Grammie Hamblet's Deviled Crab
1,Watch how to make this recipe.\nSprinkle the s...,"['2 pounds skirt steak, cut into 1/2-inch dice...",Infineon Raceway Baked Beans
2,"In a large saucepan, let the beans soak in eno...","['1 1/2 cups dried black beans, picked over an...",Southwestern Black Bean Dip
3,Watch how to make this recipe.\nPreheat the ov...,"['1 1/4 pounds ground chuck', 'One 15-ounce ca...",Sour Cream Noodle Bake
4,Special equipment: sushi mat\nCook the brown r...,"['1 cup rice, brown, medium-grain, cooked', '1...",Sushi Renovation


Loading the pre-trained data model

In [None]:
# Unzip the archive that holds the cuisine data.
# Note: this rebinds zip_path / extract_path, which the first cell of the
# notebook used for the raw recipes archive.
zip_path = '/content/cuisine.zip'
extract_path = '/content/cuisine'
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)


Load all columns of the cuisine CSV

In [None]:
cusine_df = pd.read_csv('/content/cuisine/recipes_82k.csv')
print(cusine_df.columns.tolist())  # Shows all columns

['category', 'cooking_method', 'cuisine', 'image', 'ingredients', 'prep_time', 'recipe_name', 'serves', 'tags']


In [None]:
cusine_df.head()

Unnamed: 0,category,cooking_method,cuisine,image,ingredients,prep_time,recipe_name,serves,tags
0,,['Set the racks in the middle and upper thirds...,['American'],https://www.skinnytaste.com/wp-content/uploads...,"['1 tablespoons extra virgin olive oil', '1 cu...",20 minutes,Mediterranean Sea Bass,4 servings,"Dairy Free, Gluten Free, Keto Recipes, Kid Fri..."
1,,['Place the eggs in the air fryer basket and c...,['American'],https://www.skinnytaste.com/wp-content/uploads...,"['4 large eggs', 'Salt (black pepper, everythi...",15 minutes,Air Fryer Hard Boiled Eggs,4 eggs,"Air Fryer Recipes, Dairy Free, Gluten Free, Ke..."
2,,"['Air Fryer directions:', 'Preheat air fryer t...",['American'],https://www.skinnytaste.com/wp-content/uploads...,"['olive oil spray', '4 about 5 ounce each salm...",5 minutes,Air Fryer Basil-Parmesan Salmon,4 servings,"Air Fryer Recipes, Gluten Free, Keto Recipes, ..."
3,,"['Preheat the oven to 400F.', 'Pour 2 tablespo...",['American'],https://www.skinnytaste.com/wp-content/uploads...,['1/2 cup freshly grated Parmesan (not pre-gra...,15 minutes,Everything Parmesan Crisps,4 servings,"Gluten Free, Keto Recipes, Kid Friendly, Low C..."
4,,['Cook potatoes in a large pot of salted water...,['American'],https://www.skinnytaste.com/wp-content/uploads...,['3 1/2 pounds new potatoes (about 10 peeled a...,10 minutes,Potato and Green Bean Salad,12 servings,"Dairy Free, Gluten Free, Kid Friendly, Vegetar..."


In [None]:
cols_to_drop = ['image','category']
cusine_csv_folder = '/content/cuisine'

for file in sorted(os.listdir(cusine_csv_folder)):
    # Skip the outputs of this very cell, so that a re-run does not turn
    # "cleaned_x.csv" into "cleaned_cleaned_x.csv".
    if not file.endswith('.csv') or file.startswith('cleaned_'):
        continue

    path = os.path.join(cusine_csv_folder, file)
    df = pd.read_csv(path)

    # Remove the unused columns (ignoring any that are absent), then drop
    # every row that still contains a missing value.
    df = df.drop(columns=cols_to_drop, errors='ignore').dropna()

    # Save to new cleaned file
    new_path = os.path.join(cusine_csv_folder, f"cleaned_{file}")
    df.to_csv(new_path, index=False)
    print(f"Saved cleaned file to {new_path}")

Saved cleaned file to /content/cuisine/cleaned_recipes_82k.csv


In [None]:
import shutil
# Copy the cleaned cuisine CSV into the Drive folder.
# NOTE(review): the destination is under /content/drive, but the
# drive.mount(...) cell appears further down in the notebook. On a fresh
# kernel the mount has to run before this cell.
shutil.copy('/content/cuisine/cleaned_recipes_82k.csv',
            '/content/drive/MyDrive/recipes/cleaned_recipes_82k.csv')

'/content/drive/MyDrive/recipes/cleaned_recipes_82k.csv'

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True) # Force remount to refresh credentials

Mounted at /content/drive


In [None]:
import zipfile
import pandas as pd
import json
import os
import shutil
import csv

In [None]:
# Load the actual dataset from Drive.
# on_bad_lines='skip' drops rows the parser cannot read instead of failing,
# and escapechar='\\' makes a backslash escape the following character.
# (The statement was indented under column-0 comments, which raises
# "IndentationError: unexpected indent"; it now starts at column 0.)
df = pd.read_csv('/content/drive/MyDrive/recipes/all_recipes_combined_cleaned.csv', on_bad_lines='skip', quoting=csv.QUOTE_MINIMAL, escapechar='\\')


In [None]:
cusine_df = pd.read_csv('/content/drive/MyDrive/recipes/cleaned_recipes_82k.csv')

In [None]:
cusine_df.rename(columns={'cooking_method': 'instructions'}, inplace=True)

In [None]:
cusine_df.head()

Unnamed: 0,instructions,cuisine,ingredients,prep_time,recipe_name,serves,tags
0,['Set the racks in the middle and upper thirds...,['American'],"['1 tablespoons extra virgin olive oil', '1 cu...",20 minutes,Mediterranean Sea Bass,4 servings,"Dairy Free, Gluten Free, Keto Recipes, Kid Fri..."
1,['Place the eggs in the air fryer basket and c...,['American'],"['4 large eggs', 'Salt (black pepper, everythi...",15 minutes,Air Fryer Hard Boiled Eggs,4 eggs,"Air Fryer Recipes, Dairy Free, Gluten Free, Ke..."
2,"['Air Fryer directions:', 'Preheat air fryer t...",['American'],"['olive oil spray', '4 about 5 ounce each salm...",5 minutes,Air Fryer Basil-Parmesan Salmon,4 servings,"Air Fryer Recipes, Gluten Free, Keto Recipes, ..."
3,"['Preheat the oven to 400F.', 'Pour 2 tablespo...",['American'],['1/2 cup freshly grated Parmesan (not pre-gra...,15 minutes,Everything Parmesan Crisps,4 servings,"Gluten Free, Keto Recipes, Kid Friendly, Low C..."
4,['Cook potatoes in a large pot of salted water...,['American'],['3 1/2 pounds new potatoes (about 10 peeled a...,10 minutes,Potato and Green Bean Salad,12 servings,"Dairy Free, Gluten Free, Kid Friendly, Vegetar..."


Get the cuisine column and pass it to the pre-trained model

In [None]:
import ast
def _parse_cuisine_entry(entry):
    """Split one raw 'cuisine' cell into a list of individual cuisine names.

    Cells look like Python list literals (e.g. "['American']"), so they are
    parsed with ast.literal_eval. Entries that are not valid literals, such as
    the hand-written "[European]", fall back to plain bracket stripping.
    Names are additionally split on commas so that a combined value such as
    "American, Greek" yields two labels.
    """
    if not isinstance(entry, str):
        return []

    text = entry.strip()
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # Not a Python literal: treat it as bare text inside optional brackets.
        parsed = [text.strip("[]")]

    if not isinstance(parsed, (list, tuple, set)):
        parsed = [parsed]

    names = []
    for value in parsed:
        names.extend(part.strip(" '\"") for part in str(value).split(','))
    return names


cuisine_column = cusine_df['cuisine']
unique_cuisines = cuisine_column.unique()
raw_list = unique_cuisines.tolist()
# Extra cuisines added by hand on top of those found in the data.
raw_list.extend(['[European]', '[Mediterranean]', '[Thai]','[Asian]','[spanish]'])

# The original stripped brackets and quotes *before* literal_eval, so parsing
# always failed and multi-cuisine strings such as "American, Greek" ended up
# as single candidate labels. Parse the raw strings instead.
flat_list = []
for item in raw_list:
    flat_list.extend(_parse_cuisine_entry(item))

# Remove duplicates and clean whitespace
candidate_labels = sorted(set(c.strip() for c in flat_list if c.strip()))
print(candidate_labels)


['American', 'American, Argentinian', 'American, Cuban, Latin', 'American, Czech, Hungarian', 'American, Greek', 'American, Italian', 'American, Latin', 'American, Mexican', 'American, Tex Mex', 'Asian', 'Asian, Chinese', 'Asian, Hawaiian', 'Brazilian', 'Chinese, Japanese', 'Cuban, Latin', 'European', 'Indian', 'Italian', 'Latin', 'Latin, Mexican', 'Mediterranean', 'Mexican', 'Tex Mex', 'Thai', 'spanish']


Using zero-shot pipeline to assign the cuisine labels to data set

In [None]:
!pip install transformers



In [None]:
import torch
from transformers import pipeline

# Load zero-shot classification pipeline.
# device=0 means "first CUDA GPU". Fall back to the CPU (-1) when no GPU is
# available instead of assuming a GPU runtime.
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("zero-shot-classification", model="valhalla/distilbart-mnli-12-3", device=device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import shutil
shutil.copy('/content/recipes_raw/all_recipes_combined_cleaned.csv',
            '/content/drive/MyDrive/recipes/all_recipes_combined_cleaned.csv')

'/content/drive/MyDrive/recipes/all_recipes_combined_cleaned.csv'

In [None]:
#Load the actulat dataset
df  = pd.read_csv('/content/drive/MyDrive/recipes/all_recipes_combined_cleaned.csv')

In [None]:
df.columns

Index(['instructions', 'ingredients', 'title'], dtype='object')

In [None]:
def combine_text(row):
    """Join a recipe row's title, ingredients and instructions into one string."""
    title, ingredients, steps = row['title'], row['ingredients'], row['instructions']
    return "{}. Ingredients: {}. instructions: {}".format(title, ingredients, steps)

# Build the 'combined' column: one string per recipe, produced row by row
# with combine_text.
df['combined'] = df.apply(combine_text, axis=1)
# Write the frame back to the same Drive CSV it was loaded from, so the file
# now also carries the 'combined' column.
df.to_csv('/content/drive/MyDrive/recipes/all_recipes_combined_cleaned.csv', index=False)

In [None]:
df.head(10)

Unnamed: 0,instructions,ingredients,title,combined
0,Toss ingredients lightly and spoon into a butt...,"['1/2 cup celery, finely chopped', '1 small gr...",Grammie Hamblet's Deviled Crab,Grammie Hamblet's Deviled Crab. Ingredients: [...
1,Watch how to make this recipe.\nSprinkle the s...,"['2 pounds skirt steak, cut into 1/2-inch dice...",Infineon Raceway Baked Beans,Infineon Raceway Baked Beans. Ingredients: ['2...
2,"In a large saucepan, let the beans soak in eno...","['1 1/2 cups dried black beans, picked over an...",Southwestern Black Bean Dip,Southwestern Black Bean Dip. Ingredients: ['1 ...
3,Watch how to make this recipe.\nPreheat the ov...,"['1 1/4 pounds ground chuck', 'One 15-ounce ca...",Sour Cream Noodle Bake,Sour Cream Noodle Bake. Ingredients: ['1 1/4 p...
4,Special equipment: sushi mat\nCook the brown r...,"['1 cup rice, brown, medium-grain, cooked', '1...",Sushi Renovation,"Sushi Renovation. Ingredients: ['1 cup rice, b..."
5,Heat a large nonstick skillet over medium-high...,"['1 tablespoon extra-virgin olive oil', '2 bab...",Middle-Eastern Eggplant Rounds,Middle-Eastern Eggplant Rounds. Ingredients: [...
6,Sprinkle the saffron into 1/4 cup hot water; l...,['1/2 teaspoon lightly crumbled saffron thread...,Saffron Jewel Rice,Saffron Jewel Rice. Ingredients: ['1/2 teaspoo...
7,"If using top-sliced buns, spread softened butt...",['4 top--sliced hot-dog buns (or fashion your ...,Maine Lobster Rolls,Maine Lobster Rolls. Ingredients: ['4 top--sli...
8,Cut around the stems of the peppers (reserving...,"['6 green or red bell peppers', '1/2 cup olive...",Stuffed Peppers with Pilaf,Stuffed Peppers with Pilaf. Ingredients: ['6 g...
9,Preheat the oven to 450 degrees F. Put the cau...,"['1 medium head cauliflower (about 2 pounds), ...",Cauliflower with Brown Butter and Crispy Crumbs,Cauliflower with Brown Butter and Crispy Crumb...


Predict Cuisines using classifier.

In [None]:
import pandas as pd
from tqdm import tqdm
import os
import re

# Matches runs of characters outside the 7-bit ASCII range (0x00-0x7F).
ILLEGAL_CHARACTERS_RE = re.compile(r'[^\x00-\x7F]+')

def clean_text(s):
    """Strip non-ASCII characters from strings; return any other value unchanged."""
    if not isinstance(s, str):
        return s
    return ILLEGAL_CHARACTERS_RE.sub('', s)


# Define path: predictions are checkpointed into the same Drive CSV that
# holds the recipes.
output_file = "/content/drive/MyDrive/recipes/all_recipes_combined_cleaned.csv"

# Resume from the checkpoint on Drive when it exists; otherwise start from the
# in-memory `df`. (The original `else` branch assigned into `df_out` before it
# had been created, which raised NameError.)
if os.path.exists(output_file):
    df_out = pd.read_csv(output_file)
else:
    df_out = df.copy()

# Ensure 'predicted_cuisine' column exists. It is kept as object dtype so that
# string labels can be assigned even when the column was read back as all-NaN.
if 'predicted_cuisine' not in df_out.columns:
    df_out['predicted_cuisine'] = None
df_out['predicted_cuisine'] = df_out['predicted_cuisine'].astype(object)

# Continue right after the last row that already has a prediction.
# Relies on the default 0..n-1 index that read_csv produces.
last_idx = df_out['predicted_cuisine'].last_valid_index()
start_idx = last_idx + 1 if last_idx is not None else 0

batch_size = 4
# Number of batches between checkpoints. Cleaning and rewriting the whole CSV
# after every 4-row batch made each iteration cost a full-file write.
save_every = 50


def _write_checkpoint(frame, path):
    """Strip non-ASCII text from every cell of `frame`, save it to `path`, return the cleaned copy."""
    # Column-wise Series.map replaces the deprecated DataFrame.applymap.
    cleaned = frame.apply(lambda column: column.map(clean_text))
    cleaned.to_csv(path, index=False)
    return cleaned


try:
    # Iterate over df_out (the frame actually read and written), not df.
    batch_starts = range(start_idx, len(df_out), batch_size)
    for batch_no, i in enumerate(tqdm(batch_starts), start=1):
        batch_texts = df_out['combined'].iloc[i:i+batch_size].fillna("").tolist()
        results = classifier(batch_texts, candidate_labels)

        # A single input yields one dict, several inputs yield a list of
        # dicts; the first entry of 'labels' is taken as the prediction.
        if isinstance(results, dict):
            preds = [results['labels'][0]]
        else:
            preds = [res['labels'][0] for res in results]

        # Select the batch rows by position so the final, shorter batch lines
        # up with `preds`.
        batch_rows = df_out.index[i:i+batch_size]
        df_out.loc[batch_rows, 'predicted_cuisine'] = preds

        # Save progress to Drive periodically; df_out itself is not overwritten.
        if batch_no % save_every == 0:
            df_cleaned = _write_checkpoint(df_out, output_file)
finally:
    # Always persist what has been predicted so far, including after an
    # interruption or an error in the middle of the run.
    df_cleaned = _write_checkpoint(df_out, output_file)


  df_cleaned = df_out.applymap(clean_text)
  0%|          | 26/30751 [2:36:50<2647:40:48, 310.22s/it]

In [None]:
# Preview the predictions. The prediction loop writes 'predicted_cuisine' into
# df_out, not into df, so df_out is the frame to inspect.
df_out.head(10)

Unnamed: 0.1,Unnamed: 0,instructions,ingredients,title,combined,predicted_cuisine
0,0,Toss ingredients lightly and spoon into a butt...,"['1/2 cup celery, finely chopped', '1 small gr...",Grammie Hamblet's Deviled Crab,Grammie Hamblet's Deviled Crab. Ingredients: [...,American
1,1,Watch how to make this recipe.\nSprinkle the s...,"['2 pounds skirt steak, cut into 1/2-inch dice...",Infineon Raceway Baked Beans,Infineon Raceway Baked Beans. Ingredients: ['2...,"American, Greek"
2,2,"In a large saucepan, let the beans soak in eno...","['1 1/2 cups dried black beans, picked over an...",Southwestern Black Bean Dip,Southwestern Black Bean Dip. Ingredients: ['1 ...,"American, Cuban, Latin"
3,3,Watch how to make this recipe.\nPreheat the ov...,"['1 1/4 pounds ground chuck', 'One 15-ounce ca...",Sour Cream Noodle Bake,Sour Cream Noodle Bake. Ingredients: ['1 1/4 p...,American
4,4,Special equipment: sushi mat\nCook the brown r...,"['1 cup rice, brown, medium-grain, cooked', '1...",Sushi Renovation,"Sushi Renovation. Ingredients: ['1 cup rice, b...",Asian
5,5,Heat a large nonstick skillet over medium-high...,"['1 tablespoon extra-virgin olive oil', '2 bab...",Middle-Eastern Eggplant Rounds,Middle-Eastern Eggplant Rounds. Ingredients: [...,Mediterranean
6,6,Sprinkle the saffron into 1/4 cup hot water; l...,['1/2 teaspoon lightly crumbled saffron thread...,Saffron Jewel Rice,Saffron Jewel Rice. Ingredients: ['1/2 teaspoo...,"Cuban, Latin"
7,7,"If using top-sliced buns, spread softened butt...",['4 top--sliced hot-dog buns (or fashion your ...,Maine Lobster Rolls,Maine Lobster Rolls. Ingredients: ['4 top--sli...,American
8,8,Cut around the stems of the peppers (reserving...,"['6 green or red bell peppers', '1/2 cup olive...",Stuffed Peppers with Pilaf,Stuffed Peppers with Pilaf. Ingredients: ['6 g...,Mediterranean
9,9,Preheat the oven to 450 degrees F. Put the cau...,"['1 medium head cauliflower (about 2 pounds), ...",Cauliflower with Brown Butter and Crispy Crumbs,Cauliflower with Brown Butter and Crispy Crumb...,Mediterranean


In [None]:
training_df  = pd.read_csv('/content/drive/MyDrive/recipes/cleaned_recipes_82k.csv');

In [None]:
training_df.head(5)

Unnamed: 0,cooking_method,cuisine,ingredients,recipe_name,serves,tags,prep_time_in_minutes,text,cleaned_text
0,['Set the racks in the middle and upper thirds...,['American'],"['1 tablespoons extra virgin olive oil', '1 cu...",Mediterranean Sea Bass,4 servings,"Dairy Free, Gluten Free, Keto Recipes, Kid Fri...",20.0,"['1 tablespoons extra virgin olive oil', '1 cu...",1 tablespoons extra virgin olive oil 1 cup cho...
1,['Place the eggs in the air fryer basket and c...,['American'],"['4 large eggs', 'Salt (black pepper, everythi...",Air Fryer Hard Boiled Eggs,4 eggs,"Air Fryer Recipes, Dairy Free, Gluten Free, Ke...",15.0,"['4 large eggs', 'Salt (black pepper, everythi...",4 large eggs salt black pepper everything bage...
2,"['Air Fryer directions:', 'Preheat air fryer t...",['American'],"['olive oil spray', '4 about 5 ounce each salm...",Air Fryer Basil-Parmesan Salmon,4 servings,"Air Fryer Recipes, Gluten Free, Keto Recipes, ...",5.0,"['olive oil spray', '4 about 5 ounce each salm...",olive oil spray 4 about 5 ounce each salmon fi...
3,"['Preheat the oven to 400F.', 'Pour 2 tablespo...",['American'],['1/2 cup freshly grated Parmesan (not pre-gra...,Everything Parmesan Crisps,4 servings,"Gluten Free, Keto Recipes, Kid Friendly, Low C...",15.0,['1/2 cup freshly grated Parmesan (not pre-gra...,12 cup freshly grated parmesan not pregrated u...
4,['Cook potatoes in a large pot of salted water...,['American'],['3 1/2 pounds new potatoes (about 10 peeled a...,Potato and Green Bean Salad,12 servings,"Dairy Free, Gluten Free, Kid Friendly, Vegetar...",10.0,['3 1/2 pounds new potatoes (about 10 peeled a...,3 12 pounds new potatoes about 10 peeled and c...


In [None]:
# Derive a numeric prep time from the free-text 'prep_time' column
# (e.g. "20 minutes" -> 20.0).
# The guard makes a re-run a no-op once 'prep_time' has been dropped and the
# CSV already carries 'prep_time_in_minutes', instead of raising KeyError.
# NOTE(review): only the first integer is captured, so a value such as
# "1 hour 20 minutes" would be read as 1 - confirm the format of the data.
if 'prep_time' in training_df.columns:
    training_df['prep_time_in_minutes'] = (
        training_df['prep_time']
        .str.extract(r'(\d+)', expand=False)  # raw string: '\d' is an invalid str escape
        .astype(float)
    )

In [None]:
# Drop the raw text column now that the numeric version exists.
# errors='ignore' keeps the cell re-runnable once 'prep_time' is already gone.
training_df = training_df.drop(columns=['prep_time'], errors='ignore')
# Overwrites the cleaned cuisine CSV on Drive with the updated columns.
training_df.to_csv('/content/drive/MyDrive/recipes/cleaned_recipes_82k.csv', index=False)


In [None]:
train_df = pd.read_csv('/content/drive/MyDrive/recipes/cleaned_recipes_82k.csv')

In [None]:
train_df.head(5)

Unnamed: 0,cooking_method,cuisine,ingredients,recipe_name,serves,tags,prep_time_in_minutes,text,cleaned_text
0,['Set the racks in the middle and upper thirds...,['American'],"['1 tablespoons extra virgin olive oil', '1 cu...",Mediterranean Sea Bass,4 servings,"Dairy Free, Gluten Free, Keto Recipes, Kid Fri...",20.0,"['1 tablespoons extra virgin olive oil', '1 cu...",1 tablespoons extra virgin olive oil 1 cup cho...
1,['Place the eggs in the air fryer basket and c...,['American'],"['4 large eggs', 'Salt (black pepper, everythi...",Air Fryer Hard Boiled Eggs,4 eggs,"Air Fryer Recipes, Dairy Free, Gluten Free, Ke...",15.0,"['4 large eggs', 'Salt (black pepper, everythi...",4 large eggs salt black pepper everything bage...
2,"['Air Fryer directions:', 'Preheat air fryer t...",['American'],"['olive oil spray', '4 about 5 ounce each salm...",Air Fryer Basil-Parmesan Salmon,4 servings,"Air Fryer Recipes, Gluten Free, Keto Recipes, ...",5.0,"['olive oil spray', '4 about 5 ounce each salm...",olive oil spray 4 about 5 ounce each salmon fi...
3,"['Preheat the oven to 400F.', 'Pour 2 tablespo...",['American'],['1/2 cup freshly grated Parmesan (not pre-gra...,Everything Parmesan Crisps,4 servings,"Gluten Free, Keto Recipes, Kid Friendly, Low C...",15.0,['1/2 cup freshly grated Parmesan (not pre-gra...,12 cup freshly grated parmesan not pregrated u...
4,['Cook potatoes in a large pot of salted water...,['American'],['3 1/2 pounds new potatoes (about 10 peeled a...,Potato and Green Bean Salad,12 servings,"Dairy Free, Gluten Free, Kid Friendly, Vegetar...",10.0,['3 1/2 pounds new potatoes (about 10 peeled a...,3 12 pounds new potatoes about 10 peeled and c...


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Combine text features: concatenate ingredients, cuisine, recipe name and
# cooking method into one space-separated 'text' column. Missing values are
# replaced by '' first so that a single NaN does not make the whole string NaN.
# NOTE(review): the column is created on train_df, whereas the later cleaning
# cell reads training_df['text'] - confirm which frame is intended.
train_df['text'] = (
    train_df['ingredients'].fillna('') + ' ' +
    train_df['cuisine'].fillna('') + ' ' +
    train_df['recipe_name'].fillna('') +' ' +
    train_df['cooking_method'].fillna('')
)

In [None]:
train_df.head(5)

Unnamed: 0,cooking_method,cuisine,ingredients,recipe_name,serves,tags,prep_time_in_minutes,text
0,['Set the racks in the middle and upper thirds...,['American'],"['1 tablespoons extra virgin olive oil', '1 cu...",Mediterranean Sea Bass,4 servings,"Dairy Free, Gluten Free, Keto Recipes, Kid Fri...",20.0,"['1 tablespoons extra virgin olive oil', '1 cu..."
1,['Place the eggs in the air fryer basket and c...,['American'],"['4 large eggs', 'Salt (black pepper, everythi...",Air Fryer Hard Boiled Eggs,4 eggs,"Air Fryer Recipes, Dairy Free, Gluten Free, Ke...",15.0,"['4 large eggs', 'Salt (black pepper, everythi..."
2,"['Air Fryer directions:', 'Preheat air fryer t...",['American'],"['olive oil spray', '4 about 5 ounce each salm...",Air Fryer Basil-Parmesan Salmon,4 servings,"Air Fryer Recipes, Gluten Free, Keto Recipes, ...",5.0,"['olive oil spray', '4 about 5 ounce each salm..."
3,"['Preheat the oven to 400F.', 'Pour 2 tablespo...",['American'],['1/2 cup freshly grated Parmesan (not pre-gra...,Everything Parmesan Crisps,4 servings,"Gluten Free, Keto Recipes, Kid Friendly, Low C...",15.0,['1/2 cup freshly grated Parmesan (not pre-gra...
4,['Cook potatoes in a large pot of salted water...,['American'],['3 1/2 pounds new potatoes (about 10 peeled a...,Potato and Green Bean Salad,12 servings,"Dairy Free, Gluten Free, Kid Friendly, Vegetar...",10.0,['3 1/2 pounds new potatoes (about 10 peeled a...


In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression


# Step 2: Clean text
# Oven temperatures such as "400f", "350 degrees f", "375 °f" or "180 celsius".
# The trailing \b stops a bare "f" from matching the first letter of a word:
# without it "20 fresh basil" lost "20 f" and became "resh basil".
_TEMPERATURE_RE = re.compile(r'\d{2,4}\s*(?:degrees?(?:\s*f)?|°\s*f|f|celsius)\b')


def clean_text(text):
    """Normalise recipe text for vectorisation.

    Lower-cases the text, removes oven-temperature mentions, strips square
    brackets and every character other than a-z, 0-9 and whitespace, then
    collapses runs of whitespace.

    Args:
        text: The raw text. Null values (None/NaN) give ''; other non-string
            scalars are converted with str() first.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    if not isinstance(text, str):
        if pd.isnull(text):
            return ''
        text = str(text)

    text = text.lower()
    text = _TEMPERATURE_RE.sub('', text)
    text = text.replace('[', '').replace(']', '')
    # Punctuation is deleted, not replaced by a space ("1/2" becomes "12").
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Normalise the combined recipe text into the 'cleaned_text' column.
# NOTE(review): 'text' is built on train_df in an earlier cell, while this
# line reads it from training_df. It only works when the loaded CSV already
# contains a 'text' column - confirm on a fresh kernel.
training_df['cleaned_text'] = training_df['text'].apply(clean_text)


In [None]:
training_df.head(5)

Unnamed: 0,cooking_method,cuisine,ingredients,recipe_name,serves,tags,prep_time_in_minutes,text,cleaned_text
0,['Set the racks in the middle and upper thirds...,['American'],"['1 tablespoons extra virgin olive oil', '1 cu...",Mediterranean Sea Bass,4 servings,"Dairy Free, Gluten Free, Keto Recipes, Kid Fri...",20.0,"['1 tablespoons extra virgin olive oil', '1 cu...",1 tablespoons extra virgin olive oil 1 cup cho...
1,['Place the eggs in the air fryer basket and c...,['American'],"['4 large eggs', 'Salt (black pepper, everythi...",Air Fryer Hard Boiled Eggs,4 eggs,"Air Fryer Recipes, Dairy Free, Gluten Free, Ke...",15.0,"['4 large eggs', 'Salt (black pepper, everythi...",4 large eggs salt black pepper everything bage...
2,"['Air Fryer directions:', 'Preheat air fryer t...",['American'],"['olive oil spray', '4 about 5 ounce each salm...",Air Fryer Basil-Parmesan Salmon,4 servings,"Air Fryer Recipes, Gluten Free, Keto Recipes, ...",5.0,"['olive oil spray', '4 about 5 ounce each salm...",olive oil spray 4 about 5 ounce each salmon fi...
3,"['Preheat the oven to 400F.', 'Pour 2 tablespo...",['American'],['1/2 cup freshly grated Parmesan (not pre-gra...,Everything Parmesan Crisps,4 servings,"Gluten Free, Keto Recipes, Kid Friendly, Low C...",15.0,['1/2 cup freshly grated Parmesan (not pre-gra...,12 cup freshly grated parmesan not pregrated u...
4,['Cook potatoes in a large pot of salted water...,['American'],['3 1/2 pounds new potatoes (about 10 peeled a...,Potato and Green Bean Salad,12 servings,"Dairy Free, Gluten Free, Kid Friendly, Vegetar...",10.0,['3 1/2 pounds new potatoes (about 10 peeled a...,3 12 pounds new potatoes about 10 peeled and c...


In [None]:
training_df.to_csv('/content/drive/MyDrive/recipes/cleaned_recipes_82k.csv', index=False)

In [None]:
training_df = pd.read_csv('/content/drive/MyDrive/recipes/cleaned_recipes_82k.csv')

In [None]:
training_df['prep_time_in_minutes'].describe()

Unnamed: 0,prep_time_in_minutes
count,4996.0
mean,42.190552
std,74.066056
min,1.0
25%,10.0
50%,20.0
75%,45.0
max,671.0


In [None]:
# Keep only the recipes whose prep time is positive and at most 720 minutes.
# Rows with a missing prep time fail both comparisons and are dropped too.
_prep_minutes = training_df['prep_time_in_minutes']
df_filtered = training_df[_prep_minutes.gt(0) & _prep_minutes.le(720)]

In [None]:
df_filtered['prep_time_in_minutes'].describe()

In [None]:
df_filtered.head(3)

Unnamed: 0,cooking_method,cuisine,ingredients,recipe_name,serves,tags,prep_time_in_minutes,text,cleaned_text
0,['Set the racks in the middle and upper thirds...,['American'],"['1 tablespoons extra virgin olive oil', '1 cu...",Mediterranean Sea Bass,4 servings,"Dairy Free, Gluten Free, Keto Recipes, Kid Fri...",20.0,"['1 tablespoons extra virgin olive oil', '1 cu...",1 tablespoons extra virgin olive oil 1 cup cho...
1,['Place the eggs in the air fryer basket and c...,['American'],"['4 large eggs', 'Salt (black pepper, everythi...",Air Fryer Hard Boiled Eggs,4 eggs,"Air Fryer Recipes, Dairy Free, Gluten Free, Ke...",15.0,"['4 large eggs', 'Salt (black pepper, everythi...",4 large eggs salt black pepper everything bage...
2,"['Air Fryer directions:', 'Preheat air fryer t...",['American'],"['olive oil spray', '4 about 5 ounce each salm...",Air Fryer Basil-Parmesan Salmon,4 servings,"Air Fryer Recipes, Gluten Free, Keto Recipes, ...",5.0,"['olive oil spray', '4 about 5 ounce each salm...",olive oil spray 4 about 5 ounce each salmon fi...


In [None]:
df_filtered.to_csv('/content/drive/MyDrive/recipes/cleaned_recipes_82k.csv', index=False)

In [None]:
training_df = pd.read_csv('/content/drive/MyDrive/recipes/cleaned_recipes_82k.csv')

In [None]:
training_df.head(3)

Unnamed: 0,cooking_method,cuisine,ingredients,recipe_name,serves,tags,prep_time_in_minutes,text,cleaned_text
0,['Set the racks in the middle and upper thirds...,['American'],"['1 tablespoons extra virgin olive oil', '1 cu...",Mediterranean Sea Bass,4 servings,"Dairy Free, Gluten Free, Keto Recipes, Kid Fri...",20.0,"['1 tablespoons extra virgin olive oil', '1 cu...",1 tablespoons extra virgin olive oil 1 cup cho...
1,['Place the eggs in the air fryer basket and c...,['American'],"['4 large eggs', 'Salt (black pepper, everythi...",Air Fryer Hard Boiled Eggs,4 eggs,"Air Fryer Recipes, Dairy Free, Gluten Free, Ke...",15.0,"['4 large eggs', 'Salt (black pepper, everythi...",4 large eggs salt black pepper everything bage...
2,"['Air Fryer directions:', 'Preheat air fryer t...",['American'],"['olive oil spray', '4 about 5 ounce each salm...",Air Fryer Basil-Parmesan Salmon,4 servings,"Air Fryer Recipes, Gluten Free, Keto Recipes, ...",5.0,"['olive oil spray', '4 about 5 ounce each salm...",olive oil spray 4 about 5 ounce each salmon fi...


In [None]:
!pip install --upgrade scikit-learn



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np

# Step 1: Split the dataset 50/50 into a training half and a held-out half.
# random_state fixes the shuffle so the split is reproducible.
df_train, df_test = train_test_split(training_df, test_size=0.5, random_state=42)

# Step 2: Build pipeline - TF-IDF over the cleaned text (vocabulary capped at
# 2000 terms) feeding an ordinary least-squares regressor.
# NOTE(review): `pipeline` rebinds the name imported earlier in the notebook
# with `from transformers import pipeline`; re-running the classifier cell
# after this one would call the sklearn object instead.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=2000)),
    ('model', LinearRegression())
])

# Step 3: Train on the training half.
X_train = df_train['cleaned_text']
y_train = df_train['prep_time_in_minutes'].astype(float)

pipeline.fit(X_train, y_train)

# Step 4: Predict on the held-out half.
X_test = df_test['cleaned_text']
y_test_true = df_test['prep_time_in_minutes'].astype(float)
y_test_pred = pipeline.predict(X_test)

# Step 5: Evaluate with RMSE, in the same unit as the target (minutes).
mse = mean_squared_error(y_test_true, y_test_pred) # mean squared error
rmse = np.sqrt(mse) # square root of the MSE gives the RMSE
print(f"Validation RMSE: {rmse:.2f} minutes")

Validation RMSE: 350.75 minutes


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest variant of the TF-IDF regressor, scored on the same 50/50 hold-out
# (same test_size and random_state as the linear baseline cell).
df_train, df_test = train_test_split(
    training_df,
    test_size=0.5,
    random_state=42,
)

pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=2000)),
    (
        'model',
        RandomForestRegressor(
            n_estimators=200,
            max_depth=30,
            min_samples_split=5,
            random_state=42,
        ),
    ),
])

# Training half: cleaned text in, prep time (float minutes) out.
X_train = df_train['cleaned_text']
y_train = df_train['prep_time_in_minutes'].astype(float)

# Held-out half used only for scoring.
X_test = df_test['cleaned_text']
y_test_true = df_test['prep_time_in_minutes'].astype(float)

# Fit, then predict the held-out half.
y_test_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report RMSE (square root of the MSE) in minutes.
mse = mean_squared_error(y_test_true, y_test_pred)
rmse = np.sqrt(mse)
print(f"Validation RMSE: {rmse:.2f} minutes")

Validation RMSE: 65.31 minutes


In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.base import TransformerMixin

# Step 1: Custom transformer to convert sparse TF-IDF matrix to dense
from sklearn.base import BaseEstimator


class ToDenseTransformer(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that turns a sparse matrix into a dense array.

    Inherits from ``BaseEstimator`` as well as ``TransformerMixin`` (mixin first,
    as scikit-learn's estimator guidelines ask) so the step provides
    ``get_params``/``set_params`` and can be cloned along with the pipeline.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X.toarray()`` when available, otherwise ``X`` unchanged."""
        # Input that has no .toarray() (e.g. an already dense array) is passed
        # through instead of raising AttributeError.
        return X.toarray() if hasattr(X, "toarray") else X

# Step 2: Split data (same 50/50 hold-out and seed as the previous model cells)
df_train, df_test = train_test_split(
    training_df,
    test_size=0.5,
    random_state=42,
)
X_train, X_test = df_train['cleaned_text'], df_test['cleaned_text']
y_train = df_train['prep_time_in_minutes'].astype(float)
y_test_true = df_test['prep_time_in_minutes'].astype(float)

# Step 3: Build pipeline -- TF-IDF (top 3000 terms) -> dense array -> gradient boosting.
# The `to_dense` step sits between the vectoriser and the regressor, presumably
# because the regressor does not accept the sparse TF-IDF matrix directly.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=3000)),
    ('to_dense', ToDenseTransformer()),
    (
        'model',
        HistGradientBoostingRegressor(
            max_iter=200,
            max_depth=10,
            learning_rate=0.1,
            random_state=42,
        ),
    ),
])

# Step 4: Train, then Step 5: predict the held-out half
y_test_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Step 6: Evaluate -- RMSE in minutes, derived from the MSE
mse = mean_squared_error(y_test_true, y_test_pred)
rmse = np.sqrt(mse)
print(f"Validation RMSE: {rmse:.2f} minutes")


Validation RMSE: 68.42 minutes


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# ---- Step 1: Split into short and long recipes ----
# Cut-off (in minutes) between "short" and "long" recipes. It is defined once so the
# two filters and the two report labels below cannot drift apart.
# NOTE(review): the saved `df_long.head(3)` output further down lists prep times of
# 85, 240 and 150 minutes, which a `> 300` filter cannot produce -- the saved outputs
# appear to come from a different threshold. Re-run top to bottom to confirm.
LONG_RECIPE_THRESHOLD_MIN = 300

prep_minutes = training_df['prep_time_in_minutes']
# Rows with a missing prep time satisfy neither comparison and land in neither frame.
df_short = training_df[prep_minutes <= LONG_RECIPE_THRESHOLD_MIN]
df_long = training_df[prep_minutes > LONG_RECIPE_THRESHOLD_MIN]

# ---- Step 2: Split each into train/test ----
df_short_train, df_short_test = train_test_split(df_short, test_size=0.2, random_state=42)
df_long_train, df_long_test = train_test_split(df_long, test_size=0.2, random_state=42)

# ---- Step 3: Build pipeline ----
def build_pipeline():
    """Return a fresh, unfitted TF-IDF -> RandomForestRegressor pipeline.

    Every call builds new estimator objects, so the short and long models
    share hyper-parameters but no fitted state.
    """
    return Pipeline([
        ('tfidf', TfidfVectorizer(max_features=3000)),
        ('model', RandomForestRegressor(
            n_estimators=200,
            max_depth=30,
            min_samples_split=5,
            random_state=42
        ))
    ])

pipeline_short = build_pipeline()
pipeline_long = build_pipeline()

# ---- Step 4: Train both models ----
pipeline_short.fit(df_short_train['cleaned_text'], df_short_train['prep_time_in_minutes'])
pipeline_long.fit(df_long_train['cleaned_text'], df_long_train['prep_time_in_minutes'])

# ---- Step 5: Predict and Evaluate ----
# SHORT
y_short_true = df_short_test['prep_time_in_minutes']
y_short_pred = pipeline_short.predict(df_short_test['cleaned_text'])
mse_short = mean_squared_error(y_short_true, y_short_pred)
rmse_short = np.sqrt(mse_short)  # RMSE in minutes

# LONG
y_long_true = df_long_test['prep_time_in_minutes']
y_long_pred = pipeline_long.predict(df_long_test['cleaned_text'])
mse_long = mean_squared_error(y_long_true, y_long_pred)
rmse_long = np.sqrt(mse_long)  # RMSE in minutes

# ---- Step 6: Report ----
# Labels are built from the same constant as the filters, so they always match.
print(f"Short recipes RMSE (≤{LONG_RECIPE_THRESHOLD_MIN} min): {rmse_short:.2f} minutes")
print(f"Long recipes RMSE (>{LONG_RECIPE_THRESHOLD_MIN} min): {rmse_long:.2f} minutes")


Short recipes RMSE (≤300 min): 31.83 minutes
Long recipes RMSE (>300 min): 120.21 minutes


In [None]:
# Log-target variant of the long-recipe model.
# A fresh pipeline is fitted here instead of re-fitting `pipeline_long` in place.
# That keeps `pipeline_long` predicting minutes (not log-minutes), so this cell and
# the evaluation cell above can be re-run in any order without corrupting each other.
pipeline_long_log = build_pipeline()

y_train_log = np.log1p(df_long_train['prep_time_in_minutes'])
pipeline_long_log.fit(df_long_train['cleaned_text'], y_train_log)

# Predict on the log scale, then invert log1p with expm1 to get back to minutes
y_pred_log = pipeline_long_log.predict(df_long_test['cleaned_text'])
y_pred = np.expm1(y_pred_log)

# Evaluate against the untransformed held-out targets
mse_long = mean_squared_error(y_long_true, y_pred)
rmse_long = np.sqrt(mse_long)  # RMSE in minutes

print(f"Updated Long recipes RMSE: {rmse_long:.2f} minutes")

Updated Long recipes RMSE: 121.70 minutes


In [None]:
df_long.head(3)

Unnamed: 0,cooking_method,cuisine,ingredients,recipe_name,serves,tags,prep_time_in_minutes,text,cleaned_text
84,"['To make the chicken:', 'Cut the chicken thig...",['Mediterranean'],"['1 pound boneless (skinless chicken thighs)',...",Chicken Tzatziki Bowl,4 servings,"Gluten Free, Kid Friendly",85.0,"['1 pound boneless (skinless chicken thighs)',...",1 pound boneless skinless chicken thighs 14 cu...
92,"['Slow Cooker Directions:', 'Place the onions ...",['American'],"['For the sauce', '1 onion (chopped)', '2 14-o...",Moroccan Meatballs,4 servings,"Dairy Free, Freezer Meals, Gluten Free, Paleo,...",240.0,"['For the sauce', '1 onion (chopped)', '2 14-o...",for the sauce 1 onion chopped 2 14ounce cans n...
100,"['For the cookies:', 'In a large mixing bowl, ...",['American'],"['3 tbsp unsalted butter (softened)', '3/4 cup...",Gingerbread Christmas Tree Cookies,60 cookies,"Kid Friendly, Vegetarian Meals",150.0,"['3 tbsp unsalted butter (softened)', '3/4 cup...",3 tbsp unsalted butter softened 34 cup firmly ...


In [None]:
from collections import Counter
import re

# Concatenate the cooking-method text of every long recipe into one string.
# Note this reads the `cooking_method` column, not `cleaned_text`.
method_texts = df_long['cooking_method'].dropna().astype(str)
long_texts = " ".join(method_texts.tolist())

# Lower-case, then extract runs of word characters as tokens
word_pattern = re.compile(r'\b\w+\b')
tokens = word_pattern.findall(long_texts.lower())

# Tally every token and keep the 50 most frequent (no stop-word filtering is applied)
token_counts = Counter(tokens)
common_long_words = token_counts.most_common(50)

# One "<word>: <count>" line per entry, most frequent first
for word, count in common_long_words:
    print(f"{word}: {count}")


the: 11074
and: 8379
a: 5126
to: 4553
in: 3845
with: 3102
of: 2309
until: 2287
minutes: 1964
1: 1861
for: 1785
add: 1711
2: 1386
on: 1372
into: 1172
heat: 1120
or: 1078
about: 1068
over: 1030
bowl: 880
pan: 869
salt: 864
oven: 848
place: 838
directions: 831
it: 788
water: 773
mixture: 724
butter: 712
cook: 706
at: 687
remove: 683
sugar: 665
3: 664
4: 651
oil: 650
is: 648
from: 639
medium: 638
top: 618
degrees: 591
large: 585
pepper: 577
then: 574
bake: 574
dough: 551
inch: 549
cool: 546
stir: 529
cover: 496


In [None]:
# Load the combined recipe dataset from Drive; a later cell applies the prep-time model to it.
df_actual = pd.read_csv('/content/drive/MyDrive/recipes/all_recipes_combined_cleaned_final.csv')

In [None]:
df_actual.head(2)

Unnamed: 0.1,Unnamed: 0,instructions,ingredients,title,combined,predicted_cuisine
0,0,Toss ingredients lightly and spoon into a butt...,"['1/2 cup celery, finely chopped', '1 small gr...",Grammie Hamblet's Deviled Crab,Grammie Hamblet's Deviled Crab. Ingredients: [...,American
1,1,Watch how to make this recipe.\nSprinkle the s...,"['2 pounds skirt steak, cut into 1/2-inch dice...",Infineon Raceway Baked Beans,Infineon Raceway Baked Beans. Ingredients: ['2...,"American, Greek"


In [None]:
import re

# 1. Clean text function (same as you used during training)
def clean_text(text):
    """Normalise raw recipe text into a lower-case, alphanumeric, single-spaced string.

    Missing values (``None``/``NaN``) yield ``''``. Otherwise the text is
    lower-cased, temperature mentions are removed, everything except
    ``a-z``, ``0-9`` and whitespace is dropped, whitespace runs are collapsed
    to one space, and the result is stripped.
    """
    if pd.isnull(text):
        return ''

    lowered = text.lower()

    # Remove a 2-4 digit number followed by a temperature unit.
    # NOTE(review): the bare `f` alternative has no word boundary, so a quantity
    # followed by a word starting with "f" ("12 fresh eggs") also loses "12 f".
    # Left as is so that inference-time cleaning stays identical to training.
    without_temps = re.sub(r'\d{2,4}\s*(degrees?|f|°f|celsius)', '', lowered)

    # Strip list brackets, then any remaining character outside a-z / 0-9 / whitespace.
    unbracketed = without_temps.replace('[', '').replace(']', '')
    alnum_only = re.sub(r'[^a-z0-9\s]', '', unbracketed)

    # Collapse whitespace runs and trim both ends.
    return re.sub(r'\s+', ' ', alnum_only).strip()

# 2. Build the model input: the `combined` column, blanks for missing values, cleaned
combined_text = df_actual['combined'].fillna('')
df_actual['cleaned_text'] = combined_text.map(clean_text)

# 3. Score every recipe with the short-recipe model and keep one decimal place.
# NOTE(review): only `pipeline_short` is used here, i.e. the model fitted on the
# "short" subset -- confirm that applying it to all recipes is intended.
raw_minutes = pipeline_short.predict(df_actual['cleaned_text'])
df_actual['predicted_prep_time'] = raw_minutes.round(1)

# 4. Write the result locally, then copy it to Drive
local_csv = "predicted_recipes_with_prep_time.csv"
df_actual.to_csv(local_csv, index=False)
shutil.copy(local_csv, "/content/drive/MyDrive/predicted_recipes_with_prep_time.csv")



'/content/drive/MyDrive/predicted_recipes_with_prep_time.csv'

In [None]:
# Reload the predictions CSV from the Drive root, replacing the in-memory frame.
df_actual =  pd.read_csv('/content/drive/MyDrive/predicted_recipes_with_prep_time.csv')

In [None]:
df_actual.columns.tolist()

['Unnamed: 0',
 'instructions',
 'ingredients',
 'title',
 'combined',
 'predicted_cuisine',
 'cleaned_text',
 'predicted_prep_time']

In [None]:
df_actual['predicted_prep_time'].describe()

Unnamed: 0,predicted_prep_time
count,5864.0
mean,41.95191
std,12.58208
min,15.0
25%,32.975
50%,40.3
75%,49.6
max,101.2


In [None]:
df_actual.count()

Unnamed: 0,0
Unnamed: 0,5864
instructions,5864
ingredients,5864
title,5864
combined,5864
predicted_cuisine,5864
cleaned_text,5864
predicted_prep_time,5864


In [None]:
df_actual.head(3)

Unnamed: 0.1,Unnamed: 0,instructions,ingredients,title,combined,predicted_cuisine,cleaned_text,predicted_prep_time
0,0,Toss ingredients lightly and spoon into a butt...,"['1/2 cup celery, finely chopped', '1 small gr...",Grammie Hamblet's Deviled Crab,Grammie Hamblet's Deviled Crab. Ingredients: [...,American,grammie hamblets deviled crab ingredients 12 c...,38.3
1,1,Watch how to make this recipe.\nSprinkle the s...,"['2 pounds skirt steak, cut into 1/2-inch dice...",Infineon Raceway Baked Beans,Infineon Raceway Baked Beans. Ingredients: ['2...,"American, Greek",infineon raceway baked beans ingredients 2 pou...,39.0
2,2,"In a large saucepan, let the beans soak in eno...","['1 1/2 cups dried black beans, picked over an...",Southwestern Black Bean Dip,Southwestern Black Bean Dip. Ingredients: ['1 ...,"American, Cuban, Latin",southwestern black bean dip ingredients 1 12 c...,43.1


In [None]:
# NOTE(review): `df` is loaded here but no later cell in this part of the notebook
# references it; the following cells keep working on `df_actual`. Confirm it is needed.
df = pd.read_csv('/content/drive/MyDrive/recipes/all_recipes_combined_cleaned_final.csv')

In [None]:
import numpy as np

# Round each prediction up to the next multiple of 5 minutes (values already on a
# multiple stay put): count whole 5-minute blocks with ceil, then scale back.
five_minute_blocks = np.ceil(df_actual['predicted_prep_time'] / 5.0)
df_actual['predicted_prep_time_rounded'] = (five_minute_blocks * 5).astype(int)


In [None]:
df_actual['predicted_prep_time_rounded'].describe()

Unnamed: 0,predicted_prep_time_rounded
count,5864.0
mean,44.389495
std,12.674936
min,15.0
25%,35.0
50%,45.0
75%,50.0
max,105.0


In [None]:
# Write the frame to a local CSV (without the index) and copy it to Drive.
# Note the destination is the `recipes/` sub-folder, whereas the earlier save of this
# file name went to the Drive root.
df_actual.to_csv("predicted_recipes_with_prep_time.csv", index=False)
shutil.copy("predicted_recipes_with_prep_time.csv", "/content/drive/MyDrive/recipes/predicted_recipes_with_prep_time.csv")

'/content/drive/MyDrive/recipes/predicted_recipes_with_prep_time.csv'

In [None]:
import re
from fractions import Fraction

def estimate_servings(ingredients_list):
    """Heuristically estimate how many people a recipe serves from its ingredients.

    Quantities followed by a recognised unit are converted to "people":

    * ``thighs`` / ``breasts`` / ``eggs`` / ``pieces``: one person each
    * ``pounds`` / ``lbs``: two people per unit
    * ``cup`` / ``cups``: one person per 1.5 cups
    * ``slices``: matched by the pattern but contributes nothing
    * ``serving`` / ``servings``: the stated amount is returned immediately,
      rounded, and is *not* clamped

    Args:
        ingredients_list: iterable of ingredient strings. A single plain string
            is also accepted and treated as one ingredient line.

    Returns:
        int: the explicit serving count if one is present, otherwise the summed
        estimate rounded and clamped to the range 2..8.
    """
    # Joining a bare string would interleave spaces between its characters and
    # destroy every "<amount> <unit>" pattern, so wrap it in a list first.
    if isinstance(ingredients_list, str):
        ingredients_list = [ingredients_list]

    text = " ".join(ingredients_list).lower()
    num_people = 0

    # Amount is a mixed number ("1 1/2"), a fraction ("1/2") or an integer ("2"),
    # optionally followed by whitespace and then one of the known units.
    matches = re.findall(r'(\d+\s\d+/\d+|\d+/\d+|\d+)\s*(cup|cups|thighs|breasts|eggs|pieces|pounds|lbs|slices|servings?)', text)

    for amt_str, unit in matches:
        try:
            # "1 1/2" -> Fraction(1) + Fraction(1, 2) -> 1.5
            amt_val = float(sum(Fraction(s) for s in amt_str.split()))
        except (ValueError, ZeroDivisionError):
            # e.g. a zero denominator such as "1/0": count the amount as nothing
            amt_val = 0

        if unit in ['thighs', 'breasts', 'eggs', 'pieces']:
            num_people += amt_val
        elif unit in ['pounds', 'lbs']:
            num_people += amt_val * 2
        elif unit in ['cup', 'cups']:
            num_people += amt_val / 1.5
        elif 'serv' in unit:
            # An explicit serving count overrides the heuristic entirely.
            return round(amt_val)

    return max(2, min(8, round(num_people)))

In [None]:
df_actual.head(3)

Unnamed: 0.1,Unnamed: 0,instructions,ingredients,title,combined,predicted_cuisine,cleaned_text,predicted_prep_time,predicted_prep_time_rounded,estimated_servings
0,0,Toss ingredients lightly and spoon into a butt...,"['1/2 cup celery, finely chopped', '1 small gr...",Grammie Hamblet's Deviled Crab,Grammie Hamblet's Deviled Crab. Ingredients: [...,American,grammie hamblets deviled crab ingredients 12 c...,38.3,40,2
1,1,Watch how to make this recipe.\nSprinkle the s...,"['2 pounds skirt steak, cut into 1/2-inch dice...",Infineon Raceway Baked Beans,Infineon Raceway Baked Beans. Ingredients: ['2...,"American, Greek",infineon raceway baked beans ingredients 2 pou...,39.0,40,4
2,2,"In a large saucepan, let the beans soak in eno...","['1 1/2 cups dried black beans, picked over an...",Southwestern Black Bean Dip,Southwestern Black Bean Dip. Ingredients: ['1 ...,"American, Cuban, Latin",southwestern black bean dip ingredients 1 12 c...,43.1,45,5


In [None]:
df_actual['estimated_servings'].describe()

Unnamed: 0,estimated_servings
count,5864.0
mean,3.177694
std,1.952132
min,2.0
25%,2.0
50%,2.0
75%,4.0
max,10.0


In [None]:
# Write the frame to a local CSV (without the index) and copy it to the Drive `recipes/` folder.
# NOTE(review): the preview above already shows an `estimated_servings` column, but no
# visible cell assigns it (estimate_servings is defined, never applied) -- confirm that
# the cell creating the column was not deleted, otherwise a fresh run will not have it.
df_actual.to_csv("estimated_recipes_servings.csv", index=False)
shutil.copy("estimated_recipes_servings.csv", "/content/drive/MyDrive/recipes/estimated_recipes_servings.csv")

'/content/drive/MyDrive/recipes/estimated_recipes_servings.csv'

In [None]:
# Reload the servings CSV from the Drive `recipes/` folder, replacing the in-memory frame.
df_actual = pd.read_csv('/content/drive/MyDrive/recipes/estimated_recipes_servings.csv')

In [None]:
# Allergen category -> keywords searched for in ingredient text.
# Keys are the labels written to the output; values are lower-case keywords.
allergen_keywords = {
    'milk': [
        # generic dairy terms
        'milk', 'butter', 'cream', 'cheese', 'yogurt',
        'ghee', 'whey', 'casein', 'lactose',
        # named cheeses
        'parmesan', 'mozzarella', 'gruyère', 'gorgonzola', 'mascarpone', 'feta',
        'ricotta', 'cheddar', 'brie', 'camembert', 'provolone', 'romano',
        'blue cheese', 'goat cheese', 'paneer', 'cotija', 'queso fresco',
        'neufchâtel', 'havarti', 'colby', 'swiss', 'american cheese',
        'monterey jack',
    ],
    'eggs': [
        'egg', 'eggs', 'egg yolk', 'egg white',
        'mayonnaise', 'mayo', 'albumin', 'meringue',
        'lecithin', 'egg powder', 'egg solids', 'aioli',
        'ovalbumin', 'surimi',
    ],
    'peanuts': ['peanut', 'peanuts'],
    'tree_nuts': [
        'almond', 'walnut', 'cashew', 'pecan', 'hazelnut',
        'pistachio', 'macadamia', 'nut',
    ],
    'soy': ['soy', 'soybean', 'tofu', 'edamame', 'miso', 'tamari'],
    'wheat': [
        'wheat', 'flour', 'bread', 'gluten', 'semolina',
        'spelt', 'durum', 'barley', 'rye', 'baguettes',
        'farina', 'couscous', 'bulgur', 'einkorn', 'emmer',
        'kamut', 'matzo', 'seitan', 'triticale', 'graham flour',
        'crackers', 'breadcrumbs',
    ],
    'fish': [
        'fish', 'salmon', 'tuna', 'cod', 'trout',
        'anchovy', 'tilapia', 'bass',
    ],
    'shellfish': [
        'shrimp', 'crab', 'lobster', 'scallop',
        'clams', 'mussels', 'oysters',
    ],
}

In [None]:
def detect_allergens(ingredients_list, keyword_map=None):
    """Return the allergen categories whose keywords occur in the ingredients.

    Matching is a case-insensitive *substring* test of each keyword against
    each ingredient line, so it deliberately over-reports rather than
    under-reports (with the module table, 'nut' also hits "peanut butter" and
    'egg' also hits "eggplant").

    Args:
        ingredients_list: iterable of ingredient strings.
        keyword_map: optional ``{allergen: [keywords]}`` mapping; defaults to
            the module-level ``allergen_keywords``.

    Returns:
        list[str]: matching allergen names in the iteration order of
        ``keyword_map``, or ``['none']`` when nothing matches.
    """
    if keyword_map is None:
        keyword_map = allergen_keywords

    # Lower-case each line once instead of once per allergen category.
    lowered = [ingredient.lower() for ingredient in ingredients_list]

    # Each category is visited exactly once, so the result cannot contain
    # duplicates. It is returned as built (no set round-trip) to keep the order
    # stable from run to run -- set iteration order for strings is not.
    found = [
        allergen
        for allergen, keywords in keyword_map.items()
        if any(keyword in item for item in lowered for keyword in keywords)
    ]
    return found if found else ['none']


In [None]:
import ast

# The `ingredients` column holds Python-literal list text (e.g. "['a', 'b']");
# parse it back into real lists. Non-string cells (such as NaN) become [].
def _parse_ingredients(cell):
    """Parse one `ingredients` cell with ast.literal_eval; non-strings give []."""
    if not isinstance(cell, str):
        return []
    return ast.literal_eval(cell)

df_actual['ingredients_list'] = df_actual['ingredients'].apply(_parse_ingredients)

In [None]:
# Tag each recipe with the allergen categories found in its parsed ingredient lines;
# detect_allergens returns ['none'] when no keyword matches.
df_actual['detected_allergens'] = df_actual['ingredients_list'].apply(detect_allergens)

In [None]:
# Write the frame, including `ingredients_list` and `detected_allergens`, to a local CSV
# (without the index) and copy it to the Drive `recipes/` folder.
df_actual.to_csv("recipes_with_allergens_detected.csv", index=False)
shutil.copy("recipes_with_allergens_detected.csv", "/content/drive/MyDrive/recipes/recipes_with_allergens_detected.csv")

'/content/drive/MyDrive/recipes/recipes_with_allergens_detected.csv'

In [None]:
import ast

# Flag rows whose parsed ingredients are a list with no items, then count the flags
is_empty_list = df_actual['ingredients_list'].apply(
    lambda value: isinstance(value, list) and not value
)
empty_count = is_empty_list.sum()

print(f"Rows with empty ingredient lists: {empty_count}")


Rows with empty ingredient lists: 90


In [None]:
# Keep only rows whose parsed ingredients are a non-empty list, then renumber the index
has_ingredients = df_actual['ingredients_list'].apply(
    lambda value: isinstance(value, list) and len(value) > 0
)
df_actual = df_actual.loc[has_ingredients].reset_index(drop=True)


In [None]:
# Save again under the same file names, overwriting the earlier export now that rows
# with empty ingredient lists have been removed.
df_actual.to_csv("recipes_with_allergens_detected.csv", index=False)
shutil.copy("recipes_with_allergens_detected.csv", "/content/drive/MyDrive/recipes/recipes_with_allergens_detected.csv")

'/content/drive/MyDrive/recipes/recipes_with_allergens_detected.csv'

In [None]:
df_actual.columns.tolist()

['Unnamed: 0',
 'instructions',
 'ingredients',
 'title',
 'combined',
 'predicted_cuisine',
 'cleaned_text',
 'predicted_prep_time',
 'predicted_prep_time_rounded',
 'estimated_servings',
 'ingredients_list',
 'detected_allergens']

In [None]:
# Remove the leftover CSV index column, the unrounded prediction and the parsed
# ingredient lists before the final export.
# Note: re-running this cell on the already-trimmed frame raises KeyError.
helper_columns = ['Unnamed: 0', 'predicted_prep_time', 'ingredients_list']
df_actual = df_actual.drop(columns=helper_columns)

In [None]:
# Write the trimmed frame as the final dataset (without the index) and copy it to the
# Drive `recipes/` folder.
df_actual.to_csv("final_dataset_nlp.csv", index=False)
shutil.copy("final_dataset_nlp.csv", "/content/drive/MyDrive/recipes/final_dataset_nlp.csv")

'/content/drive/MyDrive/recipes/final_dataset_nlp.csv'