In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset
recipe_dataset_file_path = "../input/full_recipes_dataset.csv"
recipe_dataset = pd.read_csv(recipe_dataset_file_path)
recipe_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 335 entries, 0 to 334
Data columns (total 58 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   recipe_id            335 non-null    int64  
 1   30_mins              335 non-null    int64  
 2   breakfast            335 non-null    int64  
 3   cookies              335 non-null    int64  
 4   cottage_cheese       335 non-null    int64  
 5   desserts             335 non-null    int64  
 6   dinner               335 non-null    int64  
 7   lunch                335 non-null    int64  
 8   meal_prep            335 non-null    int64  
 9   sauces_seasoning     335 non-null    int64  
 10  sides_appetizers     335 non-null    int64  
 11  calories             331 non-null    float64
 12  carbohydrates        324 non-null    float64
 13  cholesterol          198 non-null    float64
 14  cook_time            286 non-null    float64
 15  course               331 non-null    obj

In [3]:
# Drop empty rows and rename column
recipe_dataset.rename(columns={"sides_appetizers": "snack"}, inplace=True) 
recipe_dataset = recipe_dataset.dropna(subset=["title"])
recipe_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 331 entries, 0 to 334
Data columns (total 58 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   recipe_id            331 non-null    int64  
 1   30_mins              331 non-null    int64  
 2   breakfast            331 non-null    int64  
 3   cookies              331 non-null    int64  
 4   cottage_cheese       331 non-null    int64  
 5   desserts             331 non-null    int64  
 6   dinner               331 non-null    int64  
 7   lunch                331 non-null    int64  
 8   meal_prep            331 non-null    int64  
 9   sauces_seasoning     331 non-null    int64  
 10  snack                331 non-null    int64  
 11  calories             331 non-null    float64
 12  carbohydrates        324 non-null    float64
 13  cholesterol          198 non-null    float64
 14  cook_time            286 non-null    float64
 15  course               331 non-null    object 


In [4]:
columns_string = "recipe_id,30_mins,breakfast,cookies,cottage_cheese,desserts,dinner,lunch,meal_prep,sauces_seasoning,snack,calories,carbohydrates,cholesterol,cook_time,course,cuisine,description,fat,fiber,full_name,image_url,ingredient_count,ingredients,instructions,monounsaturated_fat,notes,polyunsaturated_fat,potassium,prep_time,protein,saturated_fat,serving,sodium,sugar,title,dairy_free,gluten_free,4th_july,christmas,cinco_de_mayo,easter,fathers_day,labor_day,memorial_day,mothers_day,thanksgiving,valentienes_day,beef,chicken,pork,recipe_turkey,seafood,fall,pumpkin,spring,summer,winter,meal_type"
columns_list = columns_string.split(",")
print(columns_list)

['recipe_id', '30_mins', 'breakfast', 'cookies', 'cottage_cheese', 'desserts', 'dinner', 'lunch', 'meal_prep', 'sauces_seasoning', 'snack', 'calories', 'carbohydrates', 'cholesterol', 'cook_time', 'course', 'cuisine', 'description', 'fat', 'fiber', 'full_name', 'image_url', 'ingredient_count', 'ingredients', 'instructions', 'monounsaturated_fat', 'notes', 'polyunsaturated_fat', 'potassium', 'prep_time', 'protein', 'saturated_fat', 'serving', 'sodium', 'sugar', 'title', 'dairy_free', 'gluten_free', '4th_july', 'christmas', 'cinco_de_mayo', 'easter', 'fathers_day', 'labor_day', 'memorial_day', 'mothers_day', 'thanksgiving', 'valentienes_day', 'beef', 'chicken', 'pork', 'recipe_turkey', 'seafood', 'fall', 'pumpkin', 'spring', 'summer', 'winter', 'meal_type']


In [5]:
# Combine binary flags for meals
meal_columns = ["breakfast", "dinner", "lunch", "snack", "desserts"]

recipe_dataset["meal_type"] = recipe_dataset.apply(
    lambda row: [meal for meal in meal_columns if row[meal]],
    axis=1
)

# Remove meal types
recipe_dataset = recipe_dataset.drop(columns=[col for col in meal_columns if col in recipe_dataset.columns])

In [6]:
# Combine binary flags for season
holiday_columns = ['4th_july',
       'christmas', 'cinco_de_mayo', 'easter', 'fathers_day', 'labor_day',
       'memorial_day', 'mothers_day', 'thanksgiving', 'valentienes_day']

recipe_dataset["holiday"] = recipe_dataset.apply(
    lambda row: [meal for meal in holiday_columns if row[meal]],
    axis=1
)

# Remove meal types
recipe_dataset = recipe_dataset.drop(columns=[col for col in holiday_columns if col in recipe_dataset.columns])

In [7]:
# Combine binary flags for season
season_columns = ['fall','spring', 'summer', 'winter']

recipe_dataset["season"] = recipe_dataset.apply(
    lambda row: [meal for meal in season_columns if row[meal]],
    axis=1
)

# Remove meal types
recipe_dataset = recipe_dataset.drop(columns=[col for col in season_columns if col in recipe_dataset.columns])

In [8]:
recipe_dataset.rename(columns={"recipe_turkey": "turkey"}, inplace=True)

In [9]:
# Combine binary flags for season
protein_columns = [ 'beef', 'chicken', 'pork', 'turkey', 'seafood']

recipe_dataset["protein"] = recipe_dataset.apply(
    lambda row: [meal for meal in protein_columns if row[meal]],
    axis=1
)

# Remove meal types
recipe_dataset = recipe_dataset.drop(columns=[col for col in protein_columns if col in recipe_dataset.columns])

In [10]:
columns_to_drop_list = ['30_mins', 'cookies', 'cottage_cheese', 'meal_prep', 'sauces_seasoning', 'cholesterol',
       'cook_time', 'description', 'fiber', 'full_name', 'image_url', 'instructions', 'monounsaturated_fat', 
       'notes', 'polyunsaturated_fat', 'potassium', 'prep_time', 'saturated_fat', 'serving','sodium', 'sugar',]
recipe_dataset = recipe_dataset.drop(columns=[col for col in columns_to_drop_list if col in recipe_dataset.columns])

In [11]:
import ast 

def parse_ingredients(ingredient_str):
    return ast.literal_eval(ingredient_str)

def extract_ingredient_names(ingredient_list):
    names = []
    for ingredient in ingredient_list:
        if ingredient.get('name'):
            names.append(ingredient['name'])
    return names

recipe_dataset['parsed_ingredients'] = recipe_dataset['ingredients'].apply(parse_ingredients)

recipe_dataset['ingredient_names'] = recipe_dataset['parsed_ingredients'].apply(extract_ingredient_names)


In [12]:
columns_to_drop_list = [ 'course', 'cuisine', 'ingredient_count', 'ingredients',
        'pumpkin','parsed_ingredients']
recipe_dataset = recipe_dataset.drop(columns=[col for col in columns_to_drop_list if col in recipe_dataset.columns])

print(recipe_dataset.columns)
recipe_dataset.to_csv("../row/row_recipes.csv", index=False)

Index(['recipe_id', 'calories', 'carbohydrates', 'fat', 'protein', 'title',
       'dairy_free', 'gluten_free', 'meal_type', 'holiday', 'season',
       'ingredient_names'],
      dtype='object')


In [13]:
unwanted_columns_for_classification = ['recipe_id', 'calories', 'carbohydrates', 'fat', 'protein',
       'dairy_free', 'gluten_free', 'meal_type', 'holiday', 'season']
recipe_classifier_dataset = recipe_dataset.drop(unwanted_columns_for_classification, axis=1)
print(recipe_classifier_dataset.columns)
recipe_classifier_dataset.to_csv("../output/recipe_classifier_final_version.csv", index=False)

Index(['title', 'ingredient_names'], dtype='object')


In [14]:
# Load recipe classified dataset
classified_recipe_dataset_file_path = "../output/recipe_classified_phase_1.csv"
classified_dataset = pd.read_csv(classified_recipe_dataset_file_path)

# Merge and handle column name conflicts
merged_dataset = recipe_dataset.merge(
    classified_dataset[['title', 'diet_type', 'ingredient_names']],
    on='title',
    how='left',
    suffixes=('', '_classified')
)

# Drop the redundant ingredient_names_classified
if 'ingredient_names_classified' in merged_dataset.columns:
    merged_dataset.drop('ingredient_names_classified', axis=1, inplace=True)

# Print the cleaned DataFrame
print(merged_dataset.info())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337 entries, 0 to 336
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   recipe_id         337 non-null    int64  
 1   calories          337 non-null    float64
 2   carbohydrates     330 non-null    float64
 3   fat               324 non-null    float64
 4   protein           337 non-null    object 
 5   title             337 non-null    object 
 6   dairy_free        337 non-null    int64  
 7   gluten_free       337 non-null    int64  
 8   meal_type         337 non-null    object 
 9   holiday           337 non-null    object 
 10  season            337 non-null    object 
 11  ingredient_names  337 non-null    object 
 12  diet_type         337 non-null    object 
dtypes: float64(3), int64(3), object(7)
memory usage: 34.4+ KB
None


In [15]:
merged_dataset.to_csv("../output/recipe_classified_phase_2.csv", index=False)

In [16]:
# Combine binary flags for season
diet_prefrences_columns = [ 'gluten_free', 'dairy_free']

merged_dataset["diet_prefrences"] = merged_dataset.apply(
    lambda row: [meal for meal in diet_prefrences_columns if row[meal]],
    axis=1
)

# Remove meal types
merged_dataset = merged_dataset.drop(columns=[col for col in diet_prefrences_columns if col in merged_dataset.columns])

In [17]:
merged_dataset.columns

Index(['recipe_id', 'calories', 'carbohydrates', 'fat', 'protein', 'title',
       'meal_type', 'holiday', 'season', 'ingredient_names', 'diet_type',
       'diet_prefrences'],
      dtype='object')

In [18]:
unwanted_columns_for_merging_full = ['title', 'calories', 'carbohydrates', 'fat']
recipe_merging_dataset = merged_dataset.drop(unwanted_columns_for_merging_full, axis=1)
print(recipe_merging_dataset.columns)
recipe_merging_dataset.to_csv("../output/recipe_classified_phase_3.csv", index=False)

Index(['recipe_id', 'protein', 'meal_type', 'holiday', 'season',
       'ingredient_names', 'diet_type', 'diet_prefrences'],
      dtype='object')
