In [32]:
import pandas as pd
from collections import defaultdict, Counter

In [33]:
# Read the raw menu table and discard the 'Unnamed: 0' column if the CSV carries one
# (errors='ignore' makes the drop a no-op when the column is absent).
df_raw = (
    pd.read_csv('../data/raw_menu_table.csv')
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

In [34]:
# Preview the first rows of the freshly loaded table to check its columns.
df_raw.head()

Unnamed: 0,restaurant_id,menu_id,category,food_name,food_description,food_price
0,1,1,Smoothies,J’ Ti`’z Smoothie,"Tropical fruit blend, dragon fruit mix, mango,...",5.49
1,1,1,Smoothies,Ti`’z Fruity Thang Smoothie,"Tropical fruit blend, dragon fruit mix, craisi...",5.49
2,1,1,Smoothies,Ashunti`Way Smoothie,"Fruit n greens, mango bananas, tropical fruit ...",5.49
3,1,1,Smoothies,Jimmy Jam Smoothie,"Berries n kale, strawberries, bananas, blueber...",5.49
4,1,1,Smoothies,J’ Ti`’z Tastey Ice Smoothie,"Tropical fruit blend, dragon fruit, pineapple,...",5.49


In [21]:
# This cell expects df_raw to have been created by an earlier cell.
# Drop the 'Unnamed: 0' column when present; errors='ignore' makes this a no-op otherwise.
df_raw = df_raw.drop(columns=['Unnamed: 0'], errors='ignore')

# Distinct values found in the 'category' column.
categories = list(df_raw['category'].unique())

# Keywords that identify each meal type. Key order matters: categorize_item walks the
# dictionary in order and stops at the first meal type with a matching keyword, so
# 'Side' (listed under both 'snack' and 'dinner-side') always resolves to 'snack'.
meal_type_keywords = {
    'breakfast': [
        'Breakfast', 'Morning', 'Eggs', 'Pancake',
    ],
    'snack': [
        'Appetizer', 'Starter', 'Side', 'Small',
    ],
    'dinner-main': [
        'Pizza', 'Sandwich', 'Feast', 'Lunch', 'Entree',
    ],
    'dinner-side': [
        'Soup', 'Salad', 'Side',
    ],
    'dessert': [
        'Dessert', 'Sweets', 'Cake', 'Pastry',
    ],
    'drinks': [
        'Coffee', 'Smoothie', 'Tea', 'Espresso',
    ],
}

# Keywords used as a rough proxy for calorie level; as above, the first level
# with a matching keyword wins.
calorie_keywords = {
    'high': [
        'Stuffed', 'Signature', 'Pizza', 'Feast', 'Dessert',
    ],
    'medium': [
        'Sandwich', 'Soup', 'Lunch', 'Grilled',
    ],
    'low': [
        'Tea', 'Smoothie', 'Salad', 'Keto',
    ],
}

def _first_matching_label(item, keyword_map, default='Unknown'):
    """Return the first key in keyword_map that has a keyword occurring in item, else default."""
    for label, keywords in keyword_map.items():
        if any(keyword in item for keyword in keywords):
            return label
    return default


def categorize_item(item, type_keywords, calorie_keywords):
    """Assign a meal type and a calorie level to a category name by keyword lookup.

    Matching is a case-sensitive substring test. Each mapping is scanned in its
    own key order and the first key with a matching keyword wins.

    Parameters
    ----------
    item : str
        Category name to classify. Any non-string value (for example the float
        NaN that pandas uses for a missing category) is treated as unclassifiable
        instead of raising ``TypeError`` on the substring test.
    type_keywords : dict[str, list[str]]
        Meal type -> keywords that indicate it.
    calorie_keywords : dict[str, list[str]]
        Calorie level -> keywords that indicate it.

    Returns
    -------
    tuple[str, str]
        ``(meal_type, calorie)``; either element is ``'Unknown'`` when no
        keyword matched.
    """
    # `keyword in item` only makes sense for strings; NaN/None would raise TypeError.
    if not isinstance(item, str):
        return 'Unknown', 'Unknown'

    meal_type = _first_matching_label(item, type_keywords)
    calorie = _first_matching_label(item, calorie_keywords)
    return meal_type, calorie

# Map every distinct category to its (meal_type, calorie) label pair.
# The default factory returns the same ('Unknown', 'Unknown') pair that is used as the
# fallback for unlabeled categories, so looking up a missing key yields a value of the
# same shape as a stored one (a list default could not be unpacked into two labels).
expected_result = defaultdict(lambda: ('Unknown', 'Unknown'))
for category in categories:
    expected_result[category] = categorize_item(category, meal_type_keywords, calorie_keywords)

def apply_labels(category):
    """Look up the label pair for a category and return it as a two-element Series.

    The pair is read from the module-level ``expected_result`` mapping; a category
    that is not present there is reported as ('Unknown', 'Unknown').
    Element 0 of the returned Series is the meal type, element 1 the calorie level.
    """
    fallback = ('Unknown', 'Unknown')
    label_pair = expected_result.get(category, fallback)
    meal_type, calorie = label_pair
    return pd.Series([meal_type, calorie])

# Attach the per-category labels to df_raw as two new columns.
label_columns = ['meal_type', 'calories']
df_raw[label_columns] = df_raw['category'].apply(apply_labels)

# Show the first rows so the new columns can be checked by eye.
print(df_raw.head())

# A row counts as labeled only when neither column holds the 'Unknown' placeholder.
has_meal_type = df_raw['meal_type'] != 'Unknown'
has_calories = df_raw['calories'] != 'Unknown'
labeled_entries = len(df_raw[has_meal_type & has_calories])

# Size of the full table, used as the denominator.
total_entries = len(df_raw)

# Share of rows that received both labels.
ratio = labeled_entries / total_entries

print(f"Ratio of labeled entries (both meal_type and calories) to total entries: {ratio:.2%}")


   restaurant_id  menu_id   category                     food_name  \
0              1        1  Smoothies             J’ Ti`’z Smoothie   
1              1        1  Smoothies   Ti`’z Fruity Thang Smoothie   
2              1        1  Smoothies          Ashunti`Way Smoothie   
3              1        1  Smoothies            Jimmy Jam Smoothie   
4              1        1  Smoothies  J’ Ti`’z Tastey Ice Smoothie   

                                    food_description  food_price meal_type  \
0  Tropical fruit blend, dragon fruit mix, mango,...        5.49    drinks   
1  Tropical fruit blend, dragon fruit mix, craisi...        5.49    drinks   
2  Fruit n greens, mango bananas, tropical fruit ...        5.49    drinks   
3  Berries n kale, strawberries, bananas, blueber...        5.49    drinks   
4  Tropical fruit blend, dragon fruit, pineapple,...        5.49    drinks   

  calories  
0      low  
1      low  
2      low  
3      low  
4      low  
Ratio of labeled entries (both m

In [None]:
import re

# Load the labelled menu table (decoded as ISO-8859-1) and discard the 'Unnamed: 0'
# column if present. NOTE: this rebinds df_raw, replacing the frame loaded earlier.
df_raw = pd.read_csv('../data/Labelled_Menu_Table.csv', encoding='ISO-8859-1')
df_raw = df_raw.drop(columns=['Unnamed: 0'], errors='ignore')

# Keep only rows with a nonzero estimated_calories and a food_type other than "Unknown".
# .copy() gives df_clean its own data, so the column assignment performed on it later
# does not raise SettingWithCopyWarning or depend on view-vs-copy behaviour.
df_clean = df_raw[(df_raw['estimated_calories'] != 0) & (df_raw['food_type'] != "Unknown")].copy()

# Matches every character that is NOT an ASCII letter, digit, whitespace,
# hyphen, single quote or double quote.
_DISALLOWED_CHARS = re.compile(r'[^a-zA-Z0-9\s\-\'\"]')


def clean_text(text):
    """Return text with every character outside the allowed set removed.

    Allowed characters are ASCII letters, digits, whitespace, '-', ' and ".
    Removed characters are deleted, not replaced, so surrounding whitespace
    is left as it was.
    """
    return _DISALLOWED_CHARS.sub('', text)

# Cast food_name to str first so clean_text always receives a string, then strip
# the disallowed characters from every name.
df_clean['food_name'] = df_clean['food_name'].astype(str).map(clean_text)