In [75]:
import pandas as pd
import requests
import io

In [100]:
# Henry's function for reading in the data
def read_data() -> pd.DataFrame:
    """
    Reads data from a predefined URL into a Pandas DataFrame.

    The CSV is downloaded with ``requests`` and parsed with ``pd.read_csv``,
    using the first column of the file as the index.

    Returns:
    --------
    pd.DataFrame
        A DataFrame containing the data from the specified URL.

    Raises:
    -------
    ValueError:
        If the data cannot be retrieved or parsed. The underlying
        exception is attached as ``__cause__``.

    Example:
    --------
    >>> df = read_data()
    >>> print(df.head())
    """
    url = "https://raw.githubusercontent.com/the-pudding/data/master/cookies/choc_chip_cookie_ingredients.csv"

    try:
        # Without a timeout a stalled connection would block the cell indefinitely.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Raise (as documented) instead of printing and silently returning None,
        # which would only surface later as an obscure error on `None`.
        raise ValueError(f"Error fetching data: {e}") from e

    try:
        return pd.read_csv(io.StringIO(response.text), index_col=0)
    except (pd.errors.ParserError, pd.errors.EmptyDataError) as e:
        raise ValueError(f"Error parsing data: {e}") from e

In [101]:
# Download the ingredient data once; the cells below work from this frame.
raw_data = read_data()

In [102]:
# Display the loaded frame (the rendered output elides the middle rows).
raw_data

Unnamed: 0,Ingredient,Text,Recipe_Index,Rating,Quantity,Unit
1,all purpose flour,3.0 cups all purpose flour,AR_1,0.920725,3.000000,cup
2,all purpose flour,2.8000000000000003 cups all purpose flour,AR_10,0.905162,2.800000,cup
3,all purpose flour,1.1076923076923078 cups all purpose flour,AR_101,0.600000,1.107692,cup
4,all purpose flour,3.333333333333333 cups sifted all purpose flour,AR_102,0.937500,3.333333,cup
5,all purpose flour,2.0 cups all purpose flour,AR_103,0.881250,2.000000,cup
...,...,...,...,...,...,...
1295,milk chocolate chip,4.0 cups semi sweet or milk chocolate chips,Misc_46,,4.000000,cup
1298,milk chocolate chip,0.888888888889 cup milk chocolate chips,Misc_63,,0.888800,cup
1299,milk chocolate chip,2.66666666667 cups semisweet or milk chocolate...,Misc_70,,2.666600,cup
1300,milk chocolate chip,2.66666666667 cups milk chocolate chips,Misc_74,,2.666600,cup


In [78]:
# Summarize the Ingredient column: distinct count first, then per-ingredient frequency.
ingredient_column = raw_data["Ingredient"]
print("\nUnique Ingredients:", ingredient_column.nunique())
print(ingredient_column.value_counts())


Unique Ingredients: 68
Ingredient
egg                    212
vanilla                196
all purpose flour      193
baking soda            187
sugar                  175
                      ... 
peanut butter chips      1
xanthan gum              1
white pepper             1
wheat                    1
zucchini                 1
Name: count, Length: 68, dtype: int64


In [79]:
# Distinct ingredient names; the category lists below are built from these.
unique_ingredients = raw_data["Ingredient"].unique()

In [80]:
# Every ingredient whose name contains the substring "flour".
flour_types = [name for name in unique_ingredients if "flour" in name]
flour_types

['all purpose flour',
 'bread flour',
 'brown rice flour',
 'cake flour',
 'flour',
 'wheat flour']

In [81]:
# Every ingredient whose name contains the substring "chocolate".
# This is a plain substring match, so 'chocolate raisin' is picked up as well.
chocolate_types = [name for name in unique_ingredients if "chocolate" in name]
chocolate_types

['chocolate raisin',
 'bittersweet chocolate chip',
 'semisweet chocolate chip',
 'dark chocolate chip',
 'milk chocolate chip',
 'white chocolate chip']

In [82]:
# Sweeteners: anything with "sugar" in its name, followed by a hand-picked list
# of sweeteners whose names do not contain that substring.
other_sweeteners = ["corn syrup", "honey", "molasses", "applesauce"]
sugar_named = [name for name in unique_ingredients if "sugar" in name]
sweetener_types = sugar_named + other_sweeteners
sweetener_types


['sugar',
 'light brown sugar',
 'dark brown sugar',
 'corn syrup',
 'honey',
 'molasses',
 'applesauce']

In [83]:
# Fats are listed by hand rather than matched by substring.
fat_types = ["butter", "margarine", "shortening", "vegetable oil"]
fat_types

['butter', 'margarine', 'shortening', 'vegetable oil']

In [None]:
# Ingredients already assigned to a named group (chocolate, flour, sweetener, egg, fat).
non_other_ingredients = [
    *chocolate_types,
    *flour_types,
    *sweetener_types,
    "egg",
    *fat_types,
]
# Whatever is left over, kept in the order it appears in unique_ingredients.
other_ingredients = [
    name for name in unique_ingredients if name not in non_other_ingredients
]

In [84]:
def sub_categorize_ingredient(ingredient, flour_types, sweetener_types, fat_types, chocolate_types):
    """
    Categorizes an ingredient into a specific subcategory based on predefined ingredient types.

    Categories are checked in a fixed order (flour, sweetener, fat, egg,
    chocolate); the first match wins, so an ingredient that appears in more
    than one list is assigned to the earliest category in that order.

    Parameters
    ----------
    ingredient : str
        The ingredient to categorize.
    flour_types : list of str
        A list of ingredients considered as flour.
    sweetener_types : list of str
        A list of ingredients considered as sweeteners.
    fat_types : list of str
        A list of ingredients considered as fats.
    chocolate_types : list of str
        A list of ingredients considered as chocolate.

    Returns
    -------
    str
        The category of the ingredient. Possible values:
        - "flour" if the ingredient is in `flour_types`
        - "sweetener" if the ingredient is in `sweetener_types`
        - "fat" if the ingredient is in `fat_types`
        - "egg" if the ingredient is "egg"
        - "chocolate" if the ingredient is in `chocolate_types`
        - "other" if the ingredient does not match any category

    Examples
    --------
    >>> flour_list = ["all purpose flour", "whole wheat flour"]
    >>> sweetener_list = ["sugar", "honey"]
    >>> fat_list = ["butter", "oil"]
    >>> chocolate_list = ["cocoa powder", "chocolate chips"]

    >>> sub_categorize_ingredient("all purpose flour", flour_list, sweetener_list, fat_list, chocolate_list)
    'flour'

    >>> sub_categorize_ingredient("honey", flour_list, sweetener_list, fat_list, chocolate_list)
    'sweetener'

    >>> sub_categorize_ingredient("egg", flour_list, sweetener_list, fat_list, chocolate_list)
    'egg'

    >>> sub_categorize_ingredient("cocoa powder", flour_list, sweetener_list, fat_list, chocolate_list)
    'chocolate'

    >>> sub_categorize_ingredient("vanilla extract", flour_list, sweetener_list, fat_list, chocolate_list)
    'other'
    """
    if ingredient in flour_types:
        return "flour"
    elif ingredient in sweetener_types:
        return "sweetener"
    elif ingredient in fat_types:
        return "fat"
    elif ingredient == "egg":
        return "egg"
    # Membership test, not equality: comparing a string to the list itself is
    # always False and would send every chocolate ingredient to "other".
    elif ingredient in chocolate_types:
        return "chocolate"
    else:
        return "other"

In [89]:
# Work on a copy so the columns added below do not end up on raw_data.
data_with_categories = raw_data.copy()

In [90]:
# Label every row with the subcategory of its ingredient.
data_with_categories["subcategory"] = data_with_categories["Ingredient"].apply(
    lambda ingredient: sub_categorize_ingredient(
        ingredient, flour_types, sweetener_types, fat_types, chocolate_types
    )
)


In [92]:
# Preview the first rows, now including the subcategory column.
data_with_categories.head()

Unnamed: 0,Ingredient,Text,Recipe_Index,Rating,Quantity,Unit,subcategory
1,all purpose flour,3.0 cups all purpose flour,AR_1,0.920725,3.0,cup,flour
2,all purpose flour,2.8000000000000003 cups all purpose flour,AR_10,0.905162,2.8,cup,flour
3,all purpose flour,1.1076923076923078 cups all purpose flour,AR_101,0.6,1.107692,cup,flour
4,all purpose flour,3.333333333333333 cups sifted all purpose flour,AR_102,0.9375,3.333333,cup,flour
5,all purpose flour,2.0 cups all purpose flour,AR_103,0.88125,2.0,cup,flour


In [93]:
def categorize_subcategory(subcategory):
    """
    Map an ingredient subcategory onto the coarser 'basic' / 'special' split.

    Parameters
    ----------
    subcategory : str
        The subcategory of the ingredient (e.g., "flour", "sweetener", "chocolate").

    Returns
    -------
    str or None
        - "basic" for "flour", "sweetener", "fat" or "egg".
        - "special" for "chocolate" or "other".
        - None for any value outside those two groups.

    Examples
    --------
    >>> categorize_subcategory("flour")
    'basic'

    >>> categorize_subcategory("chocolate")
    'special'

    >>> categorize_subcategory("spice") is None
    True
    """
    if subcategory in ("flour", "sweetener", "fat", "egg"):
        return "basic"
    if subcategory in ("chocolate", "other"):
        return "special"
    # Unknown subcategories fall through to an explicit None.
    return None

In [94]:
# Collapse each subcategory into the basic/special split, then preview the result.
data_with_categories["category"] = (
    data_with_categories["subcategory"].apply(categorize_subcategory)
)
data_with_categories.head()

Unnamed: 0,Ingredient,Text,Recipe_Index,Rating,Quantity,Unit,subcategory,category
1,all purpose flour,3.0 cups all purpose flour,AR_1,0.920725,3.0,cup,flour,basic
2,all purpose flour,2.8000000000000003 cups all purpose flour,AR_10,0.905162,2.8,cup,flour,basic
3,all purpose flour,1.1076923076923078 cups all purpose flour,AR_101,0.6,1.107692,cup,flour,basic
4,all purpose flour,3.333333333333333 cups sifted all purpose flour,AR_102,0.9375,3.333333,cup,flour,basic
5,all purpose flour,2.0 cups all purpose flour,AR_103,0.88125,2.0,cup,flour,basic


In [96]:
# Save the categorized data; to_csv writes the index as the first column by default.
data_with_categories.to_csv("../data/processed/processed_cookie_data.csv")

In [99]:
# NOTE(review): no visible cell writes ../data/raw/raw_cookie_data.csv, so this
# read depends on the file already existing on disk — confirm where it is produced.
pd.read_csv("../data/raw/raw_cookie_data.csv", index_col=0)

Unnamed: 0,Ingredient,Text,Recipe_Index,Rating,Quantity,Unit
1,all purpose flour,3.0 cups all purpose flour,AR_1,0.920725,3.000000,cup
2,all purpose flour,2.8000000000000003 cups all purpose flour,AR_10,0.905162,2.800000,cup
3,all purpose flour,1.1076923076923078 cups all purpose flour,AR_101,0.600000,1.107692,cup
4,all purpose flour,3.333333333333333 cups sifted all purpose flour,AR_102,0.937500,3.333333,cup
5,all purpose flour,2.0 cups all purpose flour,AR_103,0.881250,2.000000,cup
...,...,...,...,...,...,...
1295,milk chocolate chip,4.0 cups semi sweet or milk chocolate chips,Misc_46,,4.000000,cup
1298,milk chocolate chip,0.888888888889 cup milk chocolate chips,Misc_63,,0.888800,cup
1299,milk chocolate chip,2.66666666667 cups semisweet or milk chocolate...,Misc_70,,2.666600,cup
1300,milk chocolate chip,2.66666666667 cups milk chocolate chips,Misc_74,,2.666600,cup
