In [75]:
import pandas as pd
import requests
import io

In [76]:
def read_data(
    url: str = "https://raw.githubusercontent.com/the-pudding/data/master/cookies/choc_chip_cookie_ingredients.csv",
    timeout: float = 30.0,
) -> pd.DataFrame:
    """
    Downloads a CSV file over HTTP and loads it into a Pandas DataFrame.

    The first CSV column is used as the DataFrame index.

    Parameters:
    -----------
    url : str, optional
        Address of the CSV file. Defaults to the chocolate-chip-cookie
        ingredients dataset, so calling ``read_data()`` with no arguments
        behaves as before.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that an
        unresponsive host cannot block the notebook indefinitely.

    Returns:
    --------
    pd.DataFrame
        A DataFrame containing the data from the specified URL.

    Raises:
    -------
    ValueError:
        If the data cannot be retrieved (connection error, timeout, or a
        non-success HTTP status). The original request exception is attached
        as ``__cause__``.

    Example:
    --------
    >>> df = read_data()
    >>> print(df.head())
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions instead of parsing an error page.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Fail loudly: printing and implicitly returning None would only
        # postpone the failure to the first cell that indexes the result.
        raise ValueError(f"Error fetching data: {e}") from e

    return pd.read_csv(io.StringIO(response.text), index_col=0)

In [77]:
# Fetch the dataset once; the cells below all read from `raw_data`.
raw_data = read_data()

In [78]:
# Summarise the "Ingredient" column: distinct count first, then per-value frequencies.
ingredient_column = raw_data["Ingredient"]
print("\nUnique Ingredients:", ingredient_column.nunique())
print(ingredient_column.value_counts())


Unique Ingredients: 68
Ingredient
egg                    212
vanilla                196
all purpose flour      193
baking soda            187
sugar                  175
                      ... 
peanut butter chips      1
xanthan gum              1
white pepper             1
wheat                    1
zucchini                 1
Name: count, Length: 68, dtype: int64


In [79]:
# Distinct values of the "Ingredient" column; the category cells below
# filter this collection by substring.
unique_ingredients = raw_data["Ingredient"].unique()

In [80]:
# Every distinct ingredient whose name contains the substring "flour".
flour_types = [name for name in unique_ingredients if "flour" in name]
flour_types

['all purpose flour',
 'bread flour',
 'brown rice flour',
 'cake flour',
 'flour',
 'wheat flour']

In [81]:
# Every distinct ingredient whose name contains the substring "chocolate".
chocolate_types = [name for name in unique_ingredients if "chocolate" in name]
chocolate_types

['chocolate raisin',
 'bittersweet chocolate chip',
 'semisweet chocolate chip',
 'dark chocolate chip',
 'milk chocolate chip',
 'white chocolate chip']

In [82]:
# Sweeteners = every ingredient whose name contains "sugar", followed by a
# hand-picked list of sweeteners that do not have "sugar" in their name.
other_sweeteners = ["corn syrup", "honey", "molasses", "applesauce"]
sweetener_types = [name for name in unique_ingredients if "sugar" in name]
sweetener_types.extend(other_sweeteners)
sweetener_types


['sugar',
 'light brown sugar',
 'dark brown sugar',
 'corn syrup',
 'honey',
 'molasses',
 'applesauce']

In [83]:
# Fats are listed by hand rather than matched by substring.
fat_types = [
    "butter",
    "margarine",
    "shortening",
    "vegetable oil",
]
fat_types

['butter', 'margarine', 'shortening', 'vegetable oil']

In [None]:
# Everything already assigned to a named category (chocolate, flour, sweetener, egg, fat).
non_other_ingredients = [
    *chocolate_types,
    *flour_types,
    *sweetener_types,
    "egg",
    *fat_types,
]
# Whatever is left over after removing the categorized names.
other_ingredients = [
    name for name in unique_ingredients if name not in non_other_ingredients
]

In [None]:
def sub_categorize_ingredient(
    ingredient: str,
    flour_types: list[str],
    sweetener_types: list[str],
    fat_types: list[str],
    chocolate_types: list[str],
) -> str:
    """
    Categorizes an ingredient into a specific subcategory based on predefined ingredient types.

    Categories are checked in a fixed order (flour, sweetener, fat, egg,
    chocolate); the first match wins, so an ingredient listed in more than
    one collection is reported under the earliest of them.

    Parameters
    ----------
    ingredient : str
        The ingredient to categorize.
    flour_types : list of str
        A list of ingredients considered as flour.
    sweetener_types : list of str
        A list of ingredients considered as sweeteners.
    fat_types : list of str
        A list of ingredients considered as fats.
    chocolate_types : list of str
        A list of ingredients considered as chocolate.

    Returns
    -------
    str
        The category of the ingredient. Possible values:
        - "flour" if the ingredient is in `flour_types`
        - "sweetener" if the ingredient is in `sweetener_types`
        - "fat" if the ingredient is in `fat_types`
        - "egg" if the ingredient is "egg"
        - "chocolate" if the ingredient is in `chocolate_types`
        - "other" if the ingredient does not match any category

    Examples
    --------
    >>> flour_list = ["all purpose flour", "whole wheat flour"]
    >>> sweetener_list = ["sugar", "honey"]
    >>> fat_list = ["butter", "oil"]
    >>> chocolate_list = ["cocoa powder", "chocolate chips"]

    >>> sub_categorize_ingredient("all purpose flour", flour_list, sweetener_list, fat_list, chocolate_list)
    'flour'

    >>> sub_categorize_ingredient("honey", flour_list, sweetener_list, fat_list, chocolate_list)
    'sweetener'

    >>> sub_categorize_ingredient("egg", flour_list, sweetener_list, fat_list, chocolate_list)
    'egg'

    >>> sub_categorize_ingredient("cocoa powder", flour_list, sweetener_list, fat_list, chocolate_list)
    'chocolate'

    >>> sub_categorize_ingredient("vanilla extract", flour_list, sweetener_list, fat_list, chocolate_list)
    'other'
    """
    if ingredient in flour_types:
        return "flour"
    if ingredient in sweetener_types:
        return "sweetener"
    if ingredient in fat_types:
        return "fat"
    if ingredient == "egg":
        return "egg"
    # Membership test, not equality: comparing a string to the whole list with
    # `==` is always False and would send every chocolate ingredient to "other".
    if ingredient in chocolate_types:
        return "chocolate"
    return "other"

In [15]:
# Copy the raw frame so later edits to `data_with_subcategories` leave `raw_data` untouched.
data_with_subcategories = raw_data.copy()

In [None]:
# TODO(review): unfinished cell — the column label is an empty string, which
# presumably is not a real column and would raise KeyError; fill in the intended column.
data_with_subcategories['']