In [1]:
import pandas as pd
import string
from functools import reduce
from operator import add
import ast
import re
import spacy


# Data Pre-processing

In [2]:
# Load the recipe dataset; the first CSV column becomes the row index.
df = pd.read_csv(
    'archive/Food Ingredients and Recipe Dataset with Image Name Mapping.csv',
    index_col=0,
)

In [3]:
# Summarise the raw frame: schema and non-null counts, then nulls per column.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
print("DataFrame Info:")
df.info()

print("\nMissing Values:")
print(df.isnull().sum())


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 13501 entries, 0 to 13500
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Title                13496 non-null  object
 1   Ingredients          13501 non-null  object
 2   Instructions         13493 non-null  object
 3   Image_Name           13501 non-null  object
 4   Cleaned_Ingredients  13501 non-null  object
dtypes: object(5)
memory usage: 632.9+ KB
None

Missing Values:
Title                  5
Ingredients            0
Instructions           8
Image_Name             0
Cleaned_Ingredients    0
dtype: int64


In [4]:
# Drop every row that has a missing value in any column.
# The result is rebound to `df` instead of mutating with inplace=True, so the
# step reads as an ordinary assignment and can be chained later if needed.
df = df.dropna()

# Confirm that missing values have been dropped.
# info() prints its own report and returns None; it is therefore not wrapped
# in print(), which would append a stray "None" to the output.
print("DataFrame Info after dropping missing values:")
df.info()

DataFrame Info after dropping missing values:
<class 'pandas.core.frame.DataFrame'>
Index: 13493 entries, 0 to 13500
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Title                13493 non-null  object
 1   Ingredients          13493 non-null  object
 2   Instructions         13493 non-null  object
 3   Image_Name           13493 non-null  object
 4   Cleaned_Ingredients  13493 non-null  object
dtypes: object(5)
memory usage: 632.5+ KB
None


In [5]:
# Display the (rows, columns) tuple of the current frame.
df.shape

(13493, 5)

In [6]:
def _digit_punct_only_labels(column):
    """Return the index labels of entries made up solely of digits and ASCII punctuation.

    An empty string also qualifies, because ``all()`` over no characters is True.
    """
    return [
        label
        for label, text in column.items()
        if all(ch.isdigit() or ch in string.punctuation for ch in text)
    ]


# Rows whose ingredients, title or instructions contain no alphabetic content
nc_ingred_index = _digit_punct_only_labels(df['Cleaned_Ingredients'])
nc_title_index = _digit_punct_only_labels(df['Title'])
nc_instr_index = _digit_punct_only_labels(df['Instructions'])

# Number of flagged rows per column (ingredients, title, instructions)
index_list = [nc_ingred_index, nc_title_index, nc_instr_index]
num_to_drop = list(map(len, index_list))

# Merge the three lists into one set so a row flagged twice is counted once
inds_to_drop = set().union(*index_list)
print(len(inds_to_drop))

# Remove the flagged rows and renumber the remaining rows from 0
new_df = df.drop(index=inds_to_drop).reset_index(drop=True)
print(new_df.shape)

6
(13487, 5)


In [7]:
# Removing recipes whose Cleaned_Ingredients entry has a length below 20.
# NOTE(review): this step was originally described as filtering on
# instructions, yet the length test is applied to Cleaned_Ingredients —
# confirm which column is intended before changing the filter.
empty_instr_ind = new_df.index[new_df['Cleaned_Ingredients'].str.len() < 20].tolist()
new_df = new_df.drop(index=empty_instr_ind).reset_index(drop=True)

print(new_df.shape)
new_df.isna().sum()

(13480, 5)


Title                  0
Ingredients            0
Instructions           0
Image_Name             0
Cleaned_Ingredients    0
dtype: int64

# Tokenising

In [8]:
# Parse each stringified ingredient list into a real Python list.
# ast.literal_eval only accepts strings (or AST nodes), so the isinstance guard
# leaves already-parsed values untouched; without it, running this cell a
# second time in the same kernel raises instead of being a no-op.
new_df['Cleaned_Ingredients'] = new_df['Cleaned_Ingredients'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
# Make sure every title is a plain string before it is concatenated later on
new_df['Title'] = new_df['Title'].astype(str)

In [9]:
# Preview the first five titles (head() defaults to 5 rows)
new_df['Title'].head()

0    Miso-Butter Roast Chicken With Acorn Squash Pa...
1                      Crispy Salt and Pepper Potatoes
2                          Thanksgiving Mac and Cheese
3                   Italian Sausage and Bread Stuffing
4                                         Newton's Law
Name: Title, dtype: object

In [10]:
# Inspect the parsed ingredient list of the first recipe (scalar lookup)
new_df.at[0, 'Cleaned_Ingredients']

['1 (3½–4-lb.) whole chicken',
 '2¾ tsp. kosher salt, divided, plus more',
 '2 small acorn squash (about 3 lb. total)',
 '2 Tbsp. finely chopped sage',
 '1 Tbsp. finely chopped rosemary',
 '6 Tbsp. unsalted butter, melted, plus 3 Tbsp. room temperature',
 '¼ tsp. ground allspice',
 'Pinch of crushed red pepper flakes',
 'Freshly ground black pepper',
 '⅓ loaf good-quality sturdy white bread, torn into 1" pieces (about 2½ cups)',
 '2 medium apples (such as Gala or Pink Lady; about 14 oz. total), cored, cut into 1" pieces',
 '2 Tbsp. extra-virgin olive oil',
 '½ small red onion, thinly sliced',
 '3 Tbsp. apple cider vinegar',
 '1 Tbsp. white miso',
 '¼ cup all-purpose flour',
 '2 Tbsp. unsalted butter, room temperature',
 '¼ cup dry white wine',
 '2 cups unsalted chicken broth',
 '2 tsp. white miso',
 'Kosher salt',
 'freshly ground pepper']

In [11]:
# Flatten each ingredient list into a single '; '-separated string
new_df['Ingredients_Text'] = new_df['Cleaned_Ingredients'].map('; '.join)
new_df['Ingredients_Text'].head()


0    1 (3½–4-lb.) whole chicken; 2¾ tsp. kosher sal...
1    2 large egg whites; 1 pound new potatoes (abou...
2    1 cup evaporated milk; 1 cup whole milk; 1 tsp...
3    1 (¾- to 1-pound) round Italian loaf, cut into...
4    1 teaspoon dark brown sugar; 1 teaspoon hot wa...
Name: Ingredients_Text, dtype: object

In [12]:
# Number of entries in each recipe's ingredient list
new_df['Ingredients_Count'] = new_df['Cleaned_Ingredients'].map(len)

In [18]:
# head() is not the last expression in this cell, so its result is not displayed
new_df.head()
# Persist the processed frame (index included) for later use
new_df.to_csv(path_or_buf='Processed_Recipees.csv')

In [14]:
# One free-text document per recipe: title, then ingredients, then instructions,
# each separated by a single space
all_text = (
    new_df['Title']
    + ' '
    + new_df['Ingredients_Text']
    + ' '
    + new_df['Instructions']
)
all_text.loc[0]

'Miso-Butter Roast Chicken With Acorn Squash Panzanella 1 (3½–4-lb.) whole chicken; 2¾ tsp. kosher salt, divided, plus more; 2 small acorn squash (about 3 lb. total); 2 Tbsp. finely chopped sage; 1 Tbsp. finely chopped rosemary; 6 Tbsp. unsalted butter, melted, plus 3 Tbsp. room temperature; ¼ tsp. ground allspice; Pinch of crushed red pepper flakes; Freshly ground black pepper; ⅓ loaf good-quality sturdy white bread, torn into 1" pieces (about 2½ cups); 2 medium apples (such as Gala or Pink Lady; about 14 oz. total), cored, cut into 1" pieces; 2 Tbsp. extra-virgin olive oil; ½ small red onion, thinly sliced; 3 Tbsp. apple cider vinegar; 1 Tbsp. white miso; ¼ cup all-purpose flour; 2 Tbsp. unsalted butter, room temperature; ¼ cup dry white wine; 2 cups unsalted chicken broth; 2 tsp. white miso; Kosher salt; freshly ground pepper Pat chicken dry with paper towels, season all over with 2 tsp. salt, and tie legs together with kitchen twine. Let sit at room temperature 1 hour.\nMeanwhile, 

In [17]:
# Persist the combined recipe documents
all_text.to_csv(path_or_buf="All_text.csv")

In [20]:
def clean_text(documents, unicode_aware=False):
    """Strip punctuation, digits, newlines and redundant spaces from each document.

    Parameters
    ----------
    documents : iterable of str
        Raw text documents.
    unicode_aware : bool, default False
        When False, only ASCII punctuation (``string.punctuation``) and
        characters matched by the regex ``\\d`` are removed, so characters such
        as '½', '°' or '–' are kept. When True, every character whose Unicode
        general category is punctuation (P*), symbol (S*) or number (N*) is
        removed as well.

    Returns
    -------
    list of str
        One cleaned string per input document, in input order.

    Notes
    -----
    Punctuation is deleted rather than replaced by a space, so hyphenated
    words are fused (e.g. 'extra-sharp' becomes 'extrasharp').
    """
    import unicodedata  # function-scope import keeps this definition self-contained

    # Loop-invariant: build the deletion table once instead of once per document
    ascii_punct_table = str.maketrans('', '', string.punctuation)

    def _is_noise(ch):
        # First letter of the general category: P = punctuation, S = symbol, N = number
        return unicodedata.category(ch)[0] in 'PSN'

    cleaned_text = []
    for doc in documents:
        doc = doc.translate(ascii_punct_table)  # Remove ASCII punctuation
        doc = re.sub(r'\d+', '', doc)  # Remove digits
        if unicode_aware:
            # Remove the remaining non-ASCII punctuation, symbols and fractions
            doc = ''.join(ch for ch in doc if not _is_noise(ch))
        doc = doc.replace('\n', ' ')  # Newlines become spaces
        doc = doc.strip()  # Remove leading/trailing whitespace
        doc = re.sub(' +', ' ', doc)  # Collapse runs of spaces
        cleaned_text.append(doc)
    return cleaned_text

# Clean every recipe document, then persist the cleaned documents
cleaned_text = clean_text(all_text)
pd.Series(data=cleaned_text).to_csv(path_or_buf='Cleaned_Text.csv')

In [16]:
# Raw document with index label 2, for comparison with its cleaned version
all_text.loc[2]

'Thanksgiving Mac and Cheese 1 cup evaporated milk; 1 cup whole milk; 1 tsp. garlic powder; 1 tsp. onion powder; 1 tsp. smoked paprika; ½ tsp. freshly ground black pepper; 1 tsp. kosher salt, plus more; 2 lb. extra-sharp cheddar, coarsely grated; 4 oz. full-fat cream cheese; 1 lb. elbow macaroni Place a rack in middle of oven; preheat to 400°. Bring evaporated milk and whole milk to a bare simmer in a large saucepan over medium heat. Whisk in garlic powder, onion powder, paprika, pepper, and 1 tsp. salt. Working in batches, whisk in three fourths of the cheddar, then all of the cream cheese.\nMeanwhile, bring a large pot of generously salted water to a boil (it should have a little less salt than seawater). Cook macaroni, stirring occasionally, until very al dente, about 4 minutes. Drain in a colander.\nAdd macaroni to cheese sauce in pan and mix until well coated. Evenly spread out half of macaroni mixture in a 13x9" baking dish. Sprinkle half of remaining cheddar evenly over. Layer r

In [17]:
# Third cleaned document (list position 2), after clean_text has been applied
cleaned_text[2]

'Thanksgiving Mac and Cheese cup evaporated milk cup whole milk tsp garlic powder tsp onion powder tsp smoked paprika ½ tsp freshly ground black pepper tsp kosher salt plus more lb extrasharp cheddar coarsely grated oz fullfat cream cheese lb elbow macaroni Place a rack in middle of oven preheat to ° Bring evaporated milk and whole milk to a bare simmer in a large saucepan over medium heat Whisk in garlic powder onion powder paprika pepper and tsp salt Working in batches whisk in three fourths of the cheddar then all of the cream cheese Meanwhile bring a large pot of generously salted water to a boil it should have a little less salt than seawater Cook macaroni stirring occasionally until very al dente about minutes Drain in a colander Add macaroni to cheese sauce in pan and mix until well coated Evenly spread out half of macaroni mixture in a x baking dish Sprinkle half of remaining cheddar evenly over Layer remaining macaroni mixture on top and sprinkle with remaining cheddar Bake un

# Strip down the text as much as possible. In this case, that means lemmatizing words and removing stop words. The goal here is not text prediction but similarity measurement and keyword extraction, neither of which requires the semantic granularity that stop words and non-lemmatized word forms provide.

In [18]:
# Trial run: load the small English spaCy pipeline and try lemmatisation plus
# stop-word removal on a single cleaned document
nlp = spacy.load("en_core_web_sm")

' '.join(
    token.lemma_
    for token in nlp(cleaned_text[2])
    if not token.is_stop
)

'Thanksgiving Mac Cheese cup evaporate milk cup milk tsp garlic powder tsp onion powder tsp smoke paprika ½ tsp freshly grind black pepper tsp kosher salt plus lb extrasharp cheddar coarsely grated oz fullfat cream cheese lb elbow macaroni Place rack middle oven preheat ° bring evaporate milk milk bare simmer large saucepan medium heat Whisk garlic powder onion powder paprika pepper tsp salt Working batch whisk fourth cheddar cream cheese bring large pot generously salt water boil little salt seawater Cook macaroni stir occasionally al dente minute Drain colander Add macaroni cheese sauce pan mix coated Evenly spread half macaroni mixture x bake dish Sprinkle half remain cheddar evenly Layer remain macaroni mixture sprinkle remain cheddar Bake cheese melt minute let cool slightly serve'

In [19]:
# Tokenizing function that lemmatizes words and removes stop words
def text_tokenizer(documents):
    """Lemmatize every document and drop its stop words.

    Parameters
    ----------
    documents : iterable of str
        Texts to process with the module-level spaCy pipeline ``nlp``.

    Returns
    -------
    list of str
        One string per input document, in input order, holding the
        space-joined lemmas of its non-stop-word tokens.
    """
    # nlp.pipe streams the texts through the pipeline in batches, which avoids
    # the per-call overhead of running nlp(doc) on one document at a time
    return [
        ' '.join(token.lemma_ for token in parsed if not token.is_stop)
        for parsed in nlp.pipe(documents)
    ]

In [20]:
# Lemmatize and remove stop words from every cleaned document
tokenized_text = text_tokenizer(cleaned_text)

# Saving the tokenized text to a new file so that the tokenization step (which takes too long) does not have to be run again.

In [21]:
# Persist the tokenized documents, then show two of them separated by blank lines
pd.Series(data=tokenized_text).to_csv(path_or_buf='tokenized_text.csv')

print(tokenized_text[0], "\n" * 3, tokenized_text[2], sep="\n")

MisoButter Roast Chicken Acorn Squash Panzanella ½–lb chicken ¾ tsp kosher salt divide plus small acorn squash lb total Tbsp finely chop sage Tbsp finely chop rosemary Tbsp unsalted butter melt plus tbsp room temperature ¼ tsp ground allspice Pinch crushed red pepper flake freshly ground black pepper ⅓ loaf goodquality sturdy white bread tear piece ½ cup medium apple Gala Pink Lady oz total core cut piece Tbsp extravirgin olive oil ½ small red onion thinly slice Tbsp apple cider vinegar Tbsp white miso ¼ cup allpurpose flour Tbsp unsalted butter room temperature ¼ cup dry white wine cup unsalte chicken broth tsp white miso Kosher salt freshly ground pepper Pat chicken dry paper towel season tsp salt tie leg kitchen twine let sit room temperature hour halve squash scoop seed run vegetable peeler ridge squash half remove skin cut half ½thick wedge arrange rimmed baking sheet Combine sage rosemary Tbsp melt butter large bowl pour half mixture squash bake sheet Sprinkle squash allspice red