### Static Data Collection (RecipeNLG Dataset)

In [1]:
# ! pip install pandas numpy spacy

# python -m spacy download en_core_web_sm

In [2]:
import pandas as pd
import numpy as np
import re
import spacy

In [3]:
receipenlg = pd.read_csv("/Users/sandhyakilari/Desktop/MLOps Project/dataset/RecipeNLG_dataset.csv")

In [4]:
receipenlg.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [5]:
receipenlg.reset_index(drop=True, inplace=True)

In [6]:
receipenlg.duplicated().sum()

0

In [7]:
receipenlg.dtypes

Unnamed: 0      int64
title          object
ingredients    object
directions     object
link           object
source         object
NER            object
dtype: object

In [8]:
receipenlg.shape

(2231142, 7)

In [9]:
receipenlg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2231142 entries, 0 to 2231141
Data columns (total 7 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   Unnamed: 0   int64 
 1   title        object
 2   ingredients  object
 3   directions   object
 4   link         object
 5   source       object
 6   NER          object
dtypes: int64(1), object(6)
memory usage: 119.2+ MB


In [10]:
receipenlg.describe(include=object)

Unnamed: 0,title,ingredients,directions,link,source,NER
count,2231141,2231142,2231142,2231142,2231142,2231142
unique,1312870,2226362,2211644,2231142,2,2133496
top,Chicken Casserole,"[""1 c. peanut butter"", ""1 c. sugar"", ""1 egg""]","[""Mix all ingredients together.""]",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,[]
freq,4099,28,274,1,1643098,573


In [11]:
receipenlg.columns

Index(['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source',
       'NER'],
      dtype='object')

### Data cleaning

In [12]:
# Drop unnamed index column
receipenlg.drop(columns=["Unnamed: 0"], inplace=True)

In [13]:
# Remove duplicates
receipenlg.drop_duplicates(inplace=True)

In [14]:
# Drop rows where essential columns are missing
receipenlg.dropna(subset=["title", "ingredients", "directions"], inplace=True)

In [15]:
# Clean text fields
for col in ["title", "ingredients", "directions", "link", "source", "NER"]:
    receipenlg[col] = receipenlg[col].astype(str).str.strip()
    receipenlg[col] = receipenlg[col].apply(lambda x: re.sub(r"\s+", " ", x))  # Remove excessive spaces
    receipenlg[col] = receipenlg[col].apply(lambda x: re.sub(r"[^\w\s.,-]", "", x))  # Remove unwanted characters
    receipenlg[col] = receipenlg[col].str.lower()

In [16]:
receipenlg.head()

Unnamed: 0,title,ingredients,directions,link,source,NER
0,no-bake nut cookies,"1 c. firmly packed brown sugar, 12 c. evaporat...","in a heavy 2-quart saucepan, mix brown sugar, ...",www.cookbooks.comrecipe-details.aspxid44874,gathered,"brown sugar, milk, vanilla, nuts, butter, bite..."
1,jewell balls chicken,"1 small jar chipped beef, cut up, 4 boned chic...","place chipped beef on bottom of baking dish., ...",www.cookbooks.comrecipe-details.aspxid699419,gathered,"beef, chicken breasts, cream of mushroom soup,..."
2,creamy corn,"2 16 oz. pkg. frozen corn, 1 8 oz. pkg. cream ...","in a slow cooker, combine all ingredients. cov...",www.cookbooks.comrecipe-details.aspxid10570,gathered,"frozen corn, cream cheese, butter, garlic powd..."
3,chicken funny,"1 large whole chicken, 2 10 12 oz. cans chicke...","boil and debone chicken., put bite size pieces...",www.cookbooks.comrecipe-details.aspxid897570,gathered,"chicken, chicken gravy, cream of mushroom soup..."
4,reeses cupscandy,"1 c. peanut butter, 34 c. graham cracker crumb...",combine first four ingredients and press in 13...,www.cookbooks.comrecipe-details.aspxid659239,gathered,"peanut butter, graham cracker crumbs, butter, ..."


In [17]:
receipenlg.dtypes

title          object
ingredients    object
directions     object
link           object
source         object
NER            object
dtype: object

In [19]:
receipenlg.isnull().sum()

title          0
ingredients    0
directions     0
link           0
source         0
NER            0
dtype: int64

In [20]:
receipenlg.shape

(2231141, 6)