In [1]:
# Import libraries
import ast
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

In [2]:
# Import recipes dataset
# A forward slash keeps this relative path valid on every OS and avoids a
# backslash escape sequence inside the string literal.
recipes = pd.read_csv('data/RAW_recipes.csv')
# Count the missing values in each column
recipes.isna().sum()

name                 1
id                   0
minutes              0
contributor_id       0
submitted            0
tags                 0
nutrition            0
n_steps              0
steps                0
description       4979
ingredients          0
n_ingredients        0
dtype: int64

The 'name' column has 1 null value and 'description' has 4979; every other column is complete.

In [3]:
# Discard every row that contains a missing value, then keep only the columns used later.
# NOTE(review): rows are filtered while 'description' is still present, so recipes that
# lack only a description are removed as well — confirm this is intended.
unused_columns = ['contributor_id', 'submitted', 'tags', 'steps', 'description']
recipes = recipes.dropna(how='any').drop(columns=unused_columns)
# Summarise the remaining columns, their non-null counts and dtypes
recipes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 226657 entries, 0 to 231636
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   name           226657 non-null  object
 1   id             226657 non-null  int64 
 2   minutes        226657 non-null  int64 
 3   nutrition      226657 non-null  object
 4   n_steps        226657 non-null  int64 
 5   ingredients    226657 non-null  object
 6   n_ingredients  226657 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 13.8+ MB


Split nutrition into different columns 

In [4]:
# Preview the first rows; 'nutrition' is still a single string column at this point
recipes.head()

Unnamed: 0,name,id,minutes,nutrition,n_steps,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,"[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,"[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,"[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,"[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,"['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,"[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [5]:
# Names of the seven nutrition values, in the order they appear in each
# 'nutrition' entry (taken from the dataset's table description)
nutritions = ['calories', 'total fat', 'sugar', 'sodium', 'protein', 'saturated fat', 'carbohydrates']
# Each 'nutrition' entry is a string such as "[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]".
# Strip the enclosing brackets, split on commas (dropping surrounding whitespace) and
# convert every piece to float. Converting the whole piece, instead of extracting a
# "digits.digits" substring, also handles integers, negatives and exponent notation.
nutrition_values = (
    recipes['nutrition']
    .str.strip('[] ')
    .str.split(r'\s*,\s*', expand=True, regex=True)
    .astype(float)
)
# Assigning to the list of names fails loudly if an entry does not hold exactly
# len(nutritions) values.
recipes[nutritions] = nutrition_values
recipes.drop(columns=['nutrition'], inplace=True)

Clean the ingredient strings into lists and save the unique ingredient names (groundwork for label encoding the ingredients for training)

In [7]:
def clean_ingredients(ingredients):
    """Convert the string form of an ingredient list into a list of names.

    Parameters
    ----------
    ingredients : str
        Text such as "['winter squash', 'mexican seasoning']".

    Returns
    -------
    list of str
        The ingredient names, in their original order.
    """
    try:
        # Parse the text as a Python list literal so that names written with double
        # quotes (e.g. "baker's chocolate") or containing ", " stay intact.
        parsed = ast.literal_eval(ingredients)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]
    # Fallback for text that is not a valid list literal: remove brackets and
    # single quotes, then split on ", ".
    stripped = ingredients.replace('[', '').replace(']', '').replace("'", '')
    return stripped.split(', ')

# Apply the cleaning function to 'ingredients' column
recipes['ingredients_cleaned'] = recipes['ingredients'].apply(clean_ingredients)

# Collect every distinct ingredient in first-seen order. Dict keys give constant-time
# duplicate detection, unlike a repeated `not in` scan of a growing list.
ingredients_list = list(dict.fromkeys(
    ingredient
    for ingredients in recipes['ingredients_cleaned']
    for ingredient in ingredients
))
# Save list to CSV (a forward slash keeps the relative path valid on every OS)
pd.DataFrame(ingredients_list).to_csv('data/ingredients_list.csv', index=False)

In [None]:
# def map_ingredients(df):
#     # Map each ingredient to a unique integer
#     ingredients_map = {}
#     # Add ingredients to the map by iterating over df
#     for ingredients in df['ingredients_cleaned']:
#         for ingredient in ingredients:
#             if ingredient not in ingredients_map:
#                 ingredients_map[ingredient] = len(ingredients_map)
#     return ingredients_map

# # Create ingredients map
# ingredient_map = map_ingredients(recipes)
# def encode_ingredients(ingredients):
#     # Encode the ingredients using the created map
#     return [ingredient_map[ingredient] for ingredient in ingredients]
# # Apply the encoding function to 'ingredients_cleaned' column
# recipes['ingredients_encoded'] = recipes['ingredients_cleaned'].apply(encode_ingredients)


In [None]:
# Preview the first rows of the frame after the nutrition split and ingredient cleaning
recipes.head()

Unnamed: 0,name,id,minutes,n_steps,ingredients,n_ingredients,calories,total fat,sugar,sodium,protein,saturated fat,carbohydrates,ingredients_cleaned
0,arriba baked winter squash mexican style,137739,55,11,"['winter squash', 'mexican seasoning', 'mixed ...",7,51.5,0.0,13.0,0.0,2.0,0.0,4.0,"[winter squash, mexican seasoning, mixed spice..."
1,a bit different breakfast pizza,31490,30,9,"['prepared pizza crust', 'sausage patty', 'egg...",6,173.4,18.0,0.0,17.0,22.0,35.0,1.0,"[prepared pizza crust, sausage patty, eggs, mi..."
2,all in the kitchen chili,112140,130,6,"['ground beef', 'yellow onions', 'diced tomato...",13,269.8,22.0,32.0,48.0,39.0,27.0,5.0,"[ground beef, yellow onions, diced tomatoes, t..."
3,alouette potatoes,59389,45,11,"['spreadable cheese with garlic and herbs', 'n...",11,368.1,17.0,10.0,2.0,14.0,8.0,20.0,"[spreadable cheese with garlic and herbs, new ..."
4,amish tomato ketchup for canning,44061,190,5,"['tomato juice', 'apple cider vinegar', 'sugar...",8,352.9,1.0,337.0,23.0,3.0,0.0,28.0,"[tomato juice, apple cider vinegar, sugar, sal..."
