In [10]:
import pandas as pd
import requests
import io

In [11]:
def read_data(
    url: str = "https://raw.githubusercontent.com/the-pudding/data/master/cookies/choc_chip_cookie_ingredients.csv",
    timeout: float = 30.0,
) -> pd.DataFrame:
    """
    Reads CSV data from a URL into a Pandas DataFrame.

    Parameters:
    -----------
    url : str, optional
        Location of the CSV file. Defaults to the chocolate-chip-cookie
        ingredients dataset.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang the kernel indefinitely.

    Returns:
    --------
    pd.DataFrame
        A DataFrame containing the data from the specified URL, with the
        first CSV column used as the index.

    Raises:
    -------
    ValueError:
        If the data cannot be retrieved or parsed.

    Example:
    --------
    >>> df = read_data()
    >>> print(df.head())
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        # Fail loudly instead of printing and returning None: a None result
        # only surfaces later as a confusing error in downstream cells.
        raise ValueError(f"Error fetching data: {exc}") from exc

    try:
        return pd.read_csv(io.StringIO(response.text), index_col=0)
    except (pd.errors.ParserError, pd.errors.EmptyDataError) as exc:
        raise ValueError(f"Error parsing data: {exc}") from exc

In [12]:
# Fetch the dataset once; the cells below read from `raw_data`.
raw_data = read_data()

In [16]:
# Summarise the "Ingredient" column: number of distinct values, then how often
# each one occurs.
ingredient_column = raw_data["Ingredient"]
print("\nUnique Ingredients:", ingredient_column.nunique())
print(ingredient_column.value_counts())


Unique Ingredients: 68
Ingredient
egg                    212
vanilla                196
all purpose flour      193
baking soda            187
sugar                  175
                      ... 
peanut butter chips      1
xanthan gum              1
white pepper             1
wheat                    1
zucchini                 1
Name: count, Length: 68, dtype: int64


In [22]:
# Distinct ingredient names; the category cells below filter this array.
ingredient_names = raw_data["Ingredient"]
unique_ingredients = ingredient_names.unique()
unique_ingredients

array(['all purpose flour', 'almond extract', 'almonds', 'applesauce',
       'baking powder', 'baking soda', 'bourbon', 'bread flour',
       'brown rice flour', 'butter', 'cake flour', 'cake mix', 'cinnamon',
       'coconut', 'coconut extract', 'cookie mix', 'coriander',
       'corn syrup', 'cornstarch', 'cream', 'crispy rice', 'egg',
       'espresso', 'flour', 'graham cracker', 'honey', 'instant coffee',
       'lemon juice', 'liquer', 'macadmia', 'maple', 'margarine',
       'marshmallows', 'milk', 'chocolate raisin', 'molasses', 'nestle',
       'nutmeg', 'nuts', 'oat', 'peanut butter', 'peanut butter chips',
       'pecan', 'pudding mix', 'raisins', 'salt', 'shortening',
       'sour cream', 'sugar', 'tartar', 'toffee', 'vanilla',
       'vegetable oil', 'vinegar', 'walnut', 'water', 'wheat',
       'wheat flour', 'white pepper', 'xanthan gum', 'zucchini',
       'light brown sugar', 'dark brown sugar',
       'bittersweet chocolate chip', 'semisweet chocolate chip',
       'd

In [21]:
# Every ingredient whose name contains the substring "flour".
flour_types = [name for name in unique_ingredients if "flour" in name]

flour_types

['all purpose flour',
 'bread flour',
 'brown rice flour',
 'cake flour',
 'flour',
 'wheat flour']

In [23]:
# Every ingredient whose name contains the substring "chocolate".
chocolate_types = [name for name in unique_ingredients if "chocolate" in name]
chocolate_types

['chocolate raisin',
 'bittersweet chocolate chip',
 'semisweet chocolate chip',
 'dark chocolate chip',
 'milk chocolate chip',
 'white chocolate chip']

In [None]:
# Sweeteners are anything containing "sugar" plus a short list of non-sugar
# sweeteners; an ingredient is kept if ANY keyword is a substring of its name.
other_sweeteners = ["corn syrup", "honey", "molasses", "applesauce"]
sweetener_keywords = ["sugar", *other_sweeteners]
sweetener_types = [
    name
    for name in unique_ingredients
    if any(keyword in name for keyword in sweetener_keywords)
]
sweetener_types


['corn syrup',
 'honey',
 'molasses',
 'sugar',
 'light brown sugar',
 'dark brown sugar']

In [None]:
# Collect the fat ingredients by substring match, like the other category cells.
# The keywords live in their own list: previously the result list was seeded
# with sweetener names and also used as the keyword list, so it collected
# sweeteners (duplicated) rather than fats.
fat_keywords = ["butter", "margarine", "shortening", "oil"]
# NOTE(review): "butter" also matches the "peanut butter" items — confirm
# whether those should count as fats.
fats = [
    name
    for name in unique_ingredients
    if any(keyword in name for keyword in fat_keywords)
]
fats

In [50]:
# All ingredients that already belong to a category (egg is its own category).
sub = [*chocolate_types, *flour_types, *sweetener_types, "egg"]

In [51]:
# Ingredients that have not been assigned to any category yet.
left_over = [
    ingredient
    for ingredient in unique_ingredients
    if ingredient not in sub
]
left_over

['almond extract',
 'almonds',
 'applesauce',
 'baking powder',
 'baking soda',
 'bourbon',
 'butter',
 'cake mix',
 'cinnamon',
 'coconut',
 'coconut extract',
 'cookie mix',
 'coriander',
 'cornstarch',
 'cream',
 'crispy rice',
 'espresso',
 'graham cracker',
 'instant coffee',
 'lemon juice',
 'liquer',
 'macadmia',
 'maple',
 'margarine',
 'marshmallows',
 'milk',
 'nestle',
 'nutmeg',
 'nuts',
 'oat',
 'peanut butter',
 'peanut butter chips',
 'pecan',
 'pudding mix',
 'raisins',
 'salt',
 'shortening',
 'sour cream',
 'tartar',
 'toffee',
 'vanilla',
 'vegetable oil',
 'vinegar',
 'walnut',
 'water',
 'wheat',
 'white pepper',
 'xanthan gum',
 'zucchini']

In [15]:
# Work on an independent copy so the raw frame is left untouched.
data_with_subcategories = raw_data.copy(deep=True)

In [None]:
# NOTE(review): unfinished cell — the column key is an empty string, so this
# lookup raises KeyError unless the frame has a column literally named "".
# TODO: fill in the intended column name.
data_with_subcategories['']