In [8]:
import pandas as pd

# Load the dataset from your data folder
df = pd.read_csv('data/recipes_600dataset.csv')

# Get a summary of the data (column names, data types, missing values).
# `info()` prints its report itself, so it can sit anywhere in the cell.
df.info()

# Display the first 5 rows to see what the data looks like.
# This is deliberately the cell's final expression: a notebook only renders a
# bare expression when it is the last statement, so `df.head()` placed
# mid-cell is evaluated and silently discarded.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 641 entries, 0 to 640
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   recipe_name  641 non-null    object 
 1   prep_time    641 non-null    object 
 2   cook_time    641 non-null    object 
 3   total_time   641 non-null    object 
 4   ingredients  641 non-null    object 
 5   directions   641 non-null    object 
 6   rating       641 non-null    float64
 7   nutrition    641 non-null    object 
 8   img_src      641 non-null    object 
dtypes: float64(1), object(8)
memory usage: 45.2+ KB


In [9]:
import re
import pandas as pd

In [10]:
# Prototype the nutrition parsing on the first recipe's raw nutrition text
sample_nutrition_string = df['nutrition'].iloc[0]

# Each hit is a (name, amount-with-unit) pair; only whole-number g / mg amounts match
nutrients = re.findall(r'([A-Za-z\s]+)\s(\d+g|\d+mg)', sample_nutrition_string)

# The name group may carry surrounding whitespace, so trim it before using it as a key
nutrition_dict = dict((name.strip(), amount) for name, amount in nutrients)
print(nutrition_dict)

{'Total Fat': '19g', 'Saturated Fat': '9g', 'Cholesterol': '31mg', 'Sodium': '124mg', 'Total Carbohydrate': '52g', 'Dietary Fiber': '3g', 'Protein': '2g', 'Vitamin C': '5mg', 'Calcium': '23mg', 'Iron': '1mg', 'Potassium': '156mg'}


In [11]:
# Prototype the ingredient split on the first recipe: cut the text at every ", ".
# Anything that follows a comma becomes its own entry, including trailing
# qualifiers that belong to the previous ingredient.
sample_ingredients_string = df['ingredients'].iloc[0]
ingredients_list = sample_ingredients_string.split(sep=', ')
print(ingredients_list)

['8 small Granny Smith apples', 'or as needed', '½ cup unsalted butter', '3 tablespoons all-purpose flour', '½ cup white sugar', '½ cup packed brown sugar', '¼ cup water', '1 (9 inch) double-crust pie pastry', 'thawed']


DEFINE CLEANING AND PARSING FUNCTIONS

In [16]:
import re

def convert_to_minutes(time_str):
    """Convert a free-form duration string to a whole number of minutes.

    Every number in the string is read together with the first letter that
    follows it (optional whitespace in between, case-insensitive):

    * ``d...`` (day, days)            -> days
    * ``h...`` (h, hr, hrs, hour(s))  -> hours
    * ``m...`` or no unit letter      -> minutes

    The per-number results are summed, so ``'1 hrs 30 mins'``, ``'1h30m'`` and
    ``'1 hour 30 minutes'`` all give 90, and ``'1 day 2 hrs'`` gives 1560.

    Args:
        time_str: Duration text. Non-string values (e.g. NaN from pandas) and
            empty / whitespace-only strings are treated as "no time".

    Returns:
        int: Total minutes. 0 when the input is not a usable string or
        contains no numbers. Fractions of a minute are truncated per component.
    """
    if not isinstance(time_str, str) or not time_str.strip():
        return 0

    # Only the first letter after a number decides the unit, so spelled-out
    # forms ("hours", "mins", ...) need no normalisation beforehand.
    minutes_per_unit = {'d': 24 * 60, 'h': 60, 'm': 1, '': 1}

    total_minutes = 0
    for value, unit in re.findall(r'(\d*\.?\d+)\s*([dhm]?)', time_str.lower()):
        # The pattern only captures text float() accepts, so no error handling
        # is needed here.
        scaled = float(value) * minutes_per_unit[unit]
        # Binary floats can land a hair below the exact product
        # (2.05 * 60 == 122.99999999999999); rounding to 6 places removes that
        # noise so the truncation below cannot lose a whole minute.
        total_minutes += int(round(scaled, 6))

    return total_minutes

In [18]:
# Spot-check convert_to_minutes on a spread of formats:
# abbreviated units, compact "1h30m", a bare number, spelled-out words, decimals
test_cases = [
    '1 hrs 30 mins', '45 mins', '2 hrs',
    '1h30m', '90',
    '1 hour 30 minutes', '2.5 hrs',
]

for test in test_cases:
    minutes = convert_to_minutes(test)
    print("'{}' -> {} minutes".format(test, minutes))

'1 hrs 30 mins' -> 90 minutes
'45 mins' -> 45 minutes
'2 hrs' -> 120 minutes
'1h30m' -> 90 minutes
'90' -> 90 minutes
'1 hour 30 minutes' -> 90 minutes
'2.5 hrs' -> 150 minutes


In [30]:
def parse_nutrition(nutrition_str):
    """
    Parse a nutrition string into a dict mapping nutrient name -> numeric amount.

    Looks for "<name> <number><unit>" fragments where the unit is 'g' or 'mg'
    and keeps only the number (e.g. "Total Fat 19g" -> {'Total Fat': 19.0}).
    The unit itself is discarded, and text that does not fit the pattern
    (percentages, values with other units) is ignored.

    NOTE: this notebook defines ``parse_nutrition`` again in a later cell; when
    the cells run top to bottom, that later definition replaces this one.

    Args:
        nutrition_str: Raw nutrition text. Any non-string value yields {}.

    Returns:
        dict: {nutrient name (whitespace-trimmed): amount as float}. If a name
        occurs more than once, the last occurrence wins.
    """
    if not isinstance(nutrition_str, str):
        return {}

    # The setup below must live at function level: nested under the early
    # return above it would never run, leaving `matches` unbound for every
    # string input.
    nutrients = {}
    # Name (letters/whitespace), one whitespace, a well-formed number, the unit.
    # The number group only admits text float() can parse, so a stray token
    # like ".." is skipped instead of raising ValueError.
    pattern = re.compile(r'([A-Za-z\s]+)\s(\d+\.?\d*|\.\d+)(g|mg)')
    matches = pattern.findall(nutrition_str)

    for match in matches:
        # match is a tuple of (nutrient_name, value, unit); index into it
        # rather than treating the whole tuple as a string.
        nutrient_name = match[0].strip()
        nutrients[nutrient_name] = float(match[1])

    return nutrients

In [34]:
def parse_nutrition(nutrition_str):
    """
    Parses the nutrition string into a dictionary of key nutrient values.
    Extracts only the numerical part of the value (e.g., '19' from '19g').

    Only "<name> <number><unit>" fragments with a unit of 'g' or 'mg' are
    picked up; the unit is dropped and everything else in the text is ignored.

    Args:
        nutrition_str: Raw nutrition text. Any non-string value yields {}.

    Returns:
        dict: {nutrient name (whitespace-trimmed): amount as float}. If a name
        occurs more than once, the last occurrence wins.
    """
    if not isinstance(nutrition_str, str):
        return {}

    # Name (letters/whitespace), one whitespace, a number, then the unit.
    # The number group accepts exactly the spellings float() can parse
    # ("19", "19.", "1.5", ".5"). A looser character class such as [\d.]+
    # would also capture junk like ".." or "1.2.3" and make float() raise,
    # aborting a whole DataFrame.apply over one malformed row.
    pattern = re.compile(r'([A-Za-z\s]+)\s(\d+\.?\d*|\.\d+)(g|mg)')

    nutrients = {}
    for raw_name, amount, _unit in pattern.findall(nutrition_str):
        # The name group can include whitespace left over from the separator.
        nutrients[raw_name.strip()] = float(amount)

    return nutrients

# Smoke-test the parser on the first recipe's nutrition text
print(parse_nutrition(df['nutrition'].iat[0]))

{'Total Fat': 19.0, 'Saturated Fat': 9.0, 'Cholesterol': 31.0, 'Sodium': 124.0, 'Total Carbohydrate': 52.0, 'Dietary Fiber': 3.0, 'Protein': 2.0, 'Vitamin C': 5.0, 'Calcium': 23.0, 'Iron': 1.0, 'Potassium': 156.0}


LOAD AND TRANSFORM THE DATASET

In [36]:
# Read the recipes CSV from the 'data' subfolder into `df`
df = pd.read_csv(filepath_or_buffer='data/recipes_600dataset.csv')
print('Dataset loaded successfully.')

# Comma-based ingredient splitter used by the transformation step below
def parse_ingredients(ingredients_str):
    """
    Split an ingredients string on commas into a list of trimmed segments.

    Segments that are empty or whitespace-only are dropped. Every comma is a
    split point, so a qualifier that follows a comma becomes its own entry.
    Any non-string input produces an empty list.
    """
    if not isinstance(ingredients_str, str):
        return []
    # Trim first, then keep only the segments that still have content
    trimmed = (segment.strip() for segment in ingredients_str.split(','))
    return [segment for segment in trimmed if segment]

# --- Derived time and ingredient columns ---
# Each raw duration string gets a numeric *_mins companion column, and the raw
# ingredient text gets a list-valued companion column.
df = df.assign(
    prep_time_mins=df['prep_time'].apply(convert_to_minutes),
    cook_time_mins=df['cook_time'].apply(convert_to_minutes),
    total_time_mins=df['total_time'].apply(convert_to_minutes),
    ingredients_list=df['ingredients'].apply(parse_ingredients),
)

# --- Nutrition columns ---
# One dict per recipe, expanded to one column per nutrient name, then renamed
# to script-friendly identifiers. Nutrients missing from the mapping keep the
# name they were parsed with.
nutrition_df = (
    df['nutrition']
    .apply(parse_nutrition)
    .apply(pd.Series)
    .rename(columns={
        'Total Fat': 'fat_total_g',
        'Saturated Fat': 'fat_saturated_g',
        'Cholesterol': 'cholesterol_mg',
        'Sodium': 'sodium_mg',
        'Total Carbohydrate': 'carbs_total_g',
        'Dietary Fiber': 'carbs_fiber_g',
        'Protein': 'protein_g',
        'Total Sugars': 'sugar_g',
        'Vitamin C': 'vitamin_c_mg',
        'Calcium': 'calcium_mg',
        'Iron': 'iron_mg',
        'Potassium': 'potassium_mg',
    })
)

# Attach the nutrition columns side by side with the recipe columns
df = pd.concat((df, nutrition_df), axis='columns')

print("\nData cleaning and feature engineering complete.")

# --- Show a sample of the cleaned DataFrame ---
# A handful of columns that together exercise every transformation above
display_columns = [
    'recipe_name', 'total_time_mins', 'ingredients_list',
    'fat_total_g', 'protein_g', 'carbs_total_g', 'sugar_g',
]

print("\nDisplaying a sample of the new, cleaned DataFrame:")
print(df.head()[display_columns])

Dataset loaded successfully.

Data cleaning and feature engineering complete.

Displaying a sample of the new, cleaned DataFrame:
                         recipe_name  total_time_mins  \
0          Apple Pie by Grandma Ople               90   
1        Sarah's Homemade Applesauce               25   
2                        Apple Crisp               75   
3                  Apple Pie Filling              160   
4  Easy Apple Crisp with Oat Topping               60   

                                    ingredients_list  fat_total_g  protein_g  \
0  [8 small Granny Smith apples, or as needed, ½ ...         19.0        2.0   
1  [4  apples - peeled, cored and chopped, ¾ cup ...          0.0        0.0   
2  [10 cups all-purpose apples, peeled, cored and...          8.0        2.0   
3  [18 cups thinly sliced apples, 3 tablespoons l...          0.0        0.0   
4  [6  apples - peeled, cored, and sliced, 2 tabl...         25.0        5.0   

   carbs_total_g  sugar_g  
0           52.0  