<a href="https://colab.research.google.com/github/NoraHK3/DataSciProject/blob/main/good_standard_dish_names.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re

# Read the English-translated Saudi food dataset into a DataFrame
df = pd.read_csv('SaudiFoodFile_english_FIXED.csv')

# Capture the dimensions and a short preview before any cleaning is applied
initial_shape = df.shape
preview_rows = df.head()

# Report what was loaded
print("Initial data shape:", initial_shape)
print("\nFirst few rows:")
print(preview_rows)

# Task 1: Clean dish names - remove extra descriptions

# Parenthesised asides such as "(half chicken)". These must be stripped while
# the parentheses are still in the string, i.e. before punctuation is replaced.
_PARENTHETICAL_RE = re.compile(r'\([^)]*\)')

# Descriptive words/phrases that are not part of the dish name itself.
# Every pattern is anchored on word boundaries so it cannot fire inside a
# longer word (e.g. "for" inside "comfort", "easy" inside "greasy").
# The input is lower-cased before matching, so no IGNORECASE flag is needed.
_DESCRIPTOR_RES = tuple(
    re.compile(pattern)
    for pattern in (
        r'\bfor saudi national day\b',
        r'\bhow to make\b',
        # One pattern covers "saudi", "the saudi", "saudi style", "saudi-style"
        # so a shorter alternative can never leave "the"/"style" behind.
        r'\b(?:the )?saudi(?:[- ]style)?\b',
        r'\btraditional\b',
        r'\bmethod for\b',
        r'\baccording to\b',
        r'\bwith\b.*',  # drop "with ..." and everything after it
        r'\bfor\b.*',   # drop "for ..." and everything after it
        r'\bthe hijazi way\b',
        r'\bhijazi\b',
        r'\brecipes?\b',
        r'\beasy\b',
        r'\bauthentic\b',
        r'\bcopycat\b',
        r'\bslow[- ]?roast(?:ed)?\b',
        r'\bno[- ]bake\b',
        r'\bhealthy\b',
        r'\bvegetarian\b',
        r'\bstuffed\b',
        r'\bbaked\b',
        r'\bgrilled\b',
        r'\broasted\b',
        r'\bcreamy\b',
        r'\bspiced\b',
        r'\bmiddle eastern\b',
    )
)

_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')

# Measurement / portion descriptions. "with rice" is not listed because the
# generic "with ..." descriptor above already removes it.
_PORTION_RES = tuple(
    re.compile(pattern)
    for pattern in (
        r'\bwhole grain\b',
        r'\bhalf a piece\b',
        r'\bhalf piece\b',
        r'\bquarter\b',
        r'\bone person\b',
        r'\bperson\b',
        r'\bplain\b',
        r'\bwithout rice\b',
    )
)


def clean_dish_name(name):
    """
    Remove extra descriptions from dish names like 'for Saudi National Day',
    'how to make', 'Saudi style', etc.

    Processing order:
      1. lower-case and trim the input,
      2. drop parenthesised asides,
      3. drop descriptive words/phrases (whole words only),
      4. replace punctuation with spaces,
      5. drop portion/measurement phrases,
      6. collapse whitespace and title-case the result.

    Args:
        name: The raw dish name. Non-string values (for example NaN coming
            from a pandas column with missing entries) are accepted.

    Returns:
        The cleaned, title-cased dish name. An empty string is returned for
        non-string input or when nothing is left after cleaning.
    """
    if not isinstance(name, str):
        return ''

    cleaned_name = name.lower().strip()

    # Parentheticals first: once punctuation is stripped they can no longer
    # be recognised and their contents would leak into the name.
    cleaned_name = _PARENTHETICAL_RE.sub(' ', cleaned_name)

    for descriptor_re in _DESCRIPTOR_RES:
        cleaned_name = descriptor_re.sub(' ', cleaned_name)

    # Remove punctuation; replacing with a space keeps adjacent words apart
    cleaned_name = _PUNCTUATION_RE.sub(' ', cleaned_name)

    for portion_re in _PORTION_RES:
        cleaned_name = portion_re.sub(' ', cleaned_name)

    # Final cleanup
    cleaned_name = _WHITESPACE_RE.sub(' ', cleaned_name).strip()

    # Title case for consistency
    return cleaned_name.title()

# Task 2: Standardize dish name variations

# Canonical spelling -> regex fragments for the spelling variants that should
# be rewritten to it. Fragments are matched as whole words, case-insensitively.
_CANONICAL_VARIANTS = (
    ('Kabsa', (r'kabsah?', r'kbsa')),
    ('Kleja', (r'kleija', r'kulaija', r'klija')),
    ('Shakshuka', (r'shaksoka', r'shakshuka', r'shaksuka')),
    ('Basbousa', (r'basbousa', r'basbosa')),
    ('Jareesh', (r'jareesh', r'jarish', r'greesh', r'groats')),
    ('Maqshush', (r'maqshoosh', r'maqshush')),
    ('Mutabak', (r'mutabbaq', r'mutabak')),
    ('Saleek', (r'saleeq', r'saliq', r'saleek', r'sulait?')),
    ('Mamoul', (r'maamoul', r'mamoul')),
    ('Madhbi', (r'madhbi',)),
    ('Madhghut', (r'madghog', r'madjou?h')),
    ('Madfun', (r'madfoon', r'madfoun')),
    ('Mandi', (r'mandi',)),
    ('Zurbian', (r'zurbian', r'zerbian')),
    # NOTE(review): "shurbian" is mapped to Shrimp, as in the original table;
    # confirm it is not meant to be a Zurbian spelling.
    ('Shrimp', (r'shrimp', r'shurbian')),
    ('Sambusa', (r'sambosa', r'sambousek', r'samosa')),
    ('Maqluba', (r'magloba', r'maqluba', r'makloubeh')),
    ('Musaqa', (r'moussaka', r'moussaqa', r'musakaa')),
    ('Mulukhiyah', (r'molokhia', r'molokhiya', r'mulukhiyah')),
    ('Marqouq', (r'margog', r'marqouk', r'marqooq')),
    ('Mataziz', (r'matazeez',)),
    ('Muqalqal', (r'mogalgal', r'mqalqal')),
    ('Hamees', (r'hemees', r'hemen')),
    ('Muhalabiya', (r'mohalabiya', r'mohala')),
    ('Kunafa', (r'kunafa', r'knafeh')),
    ('Sabeeb', (r'sabeeb', r'sabib')),
    ('Tahini', (r'taheena', r'tainna', r'tahini')),
    ('Fatteh', (r'fatteh', r'fateh')),
    ('Freekeh', (r'freekeh', r'freekey')),
    ('Hashu', (r'hashweh', r'hashu')),
    ('Mujaddara', (r'mujadara', r'mujaddara')),
    # The separator is optional and may be an apostrophe or whitespace:
    # names that went through punctuation stripping arrive as "Za Atar".
    ('Zaatar', (r"za['\u2019\s]?atar",)),
)

# One compiled whole-word pattern per canonical name, built once at import.
_STANDARDIZATION_RULES = tuple(
    (re.compile(r'\b(?:' + '|'.join(variants) + r')\b', re.IGNORECASE), canonical)
    for canonical, variants in _CANONICAL_VARIANTS
)


def standardize_dish_name(name):
    """
    Standardize variations of dish names (kabsa/kabsah/kbsa -> Kabsa).

    Each known spelling variant that appears as a whole word is replaced,
    case-insensitively, by its canonical spelling. Text that matches no
    variant is left untouched.

    Args:
        name: The dish name to standardize. Non-string values are accepted.

    Returns:
        The name with known variants replaced by their canonical spelling.
        Non-string input is returned unchanged.
    """
    if not isinstance(name, str):
        return name

    standardized_name = name
    for variant_re, canonical in _STANDARDIZATION_RULES:
        standardized_name = variant_re.sub(canonical, standardized_name)

    return standardized_name

# Apply cleaning and standardization
print("\nApplying data cleaning...")

# Create cleaned dish names (two temporary columns on df; they are dropped
# from the saved copy further down)
df['cleaned_dish_name'] = df['dish_name'].apply(clean_dish_name)
df['standardized_dish_name'] = df['cleaned_dish_name'].apply(standardize_dish_name)

# Show before and after examples for the first rows (at most 10)
print("\nName cleaning examples:")
sample_size = min(10, len(df))
sample_rows = df[['dish_name', 'cleaned_dish_name', 'standardized_dish_name']].head(sample_size)
for original, cleaned, standardized in sample_rows.itertuples(index=False, name=None):
    print(f"Original: {original}")
    print(f"Cleaned: {cleaned}")
    print(f"Standardized: {standardized}")
    print("-" * 50)

# Show most common dish names after standardization
print("\nMost common standardized dish names:")
print(df['standardized_dish_name'].value_counts().head(20))

# Check for remaining variations
print("\nChecking for remaining variations (sample):")
unique_names = df['standardized_dish_name'].unique()
for name in sorted(unique_names)[:30]:  # Show first 30
    print(f"  - {name}")

# Save the cleaned data
df_cleaned = df.copy()
# Keep the raw name next to the standardized one so the mapping stays auditable
df_cleaned['dish_name_original'] = df['dish_name']
df_cleaned['dish_name'] = df['standardized_dish_name']

# Drop temporary columns
df_cleaned = df_cleaned.drop(columns=['cleaned_dish_name', 'standardized_dish_name'])

print(f"\nFinal data shape: {df_cleaned.shape}")
print("\nFirst few rows of cleaned data:")
print(df_cleaned[['dish_name_original', 'dish_name']].head(15))

# Save to new CSV file
output_filename = 'SaudiFoodFile_cleaned.csv'
df_cleaned.to_csv(output_filename, index=False)
print(f"\nCleaned data saved to: {output_filename}")

# Additional analysis: Show name standardization results
print("\n" + "="*80)
print("NAME STANDARDIZATION SUMMARY")
print("="*80)

# Group the distinct original names under the standardized name they map to.
# Originals are collected in sets (duplicates ignored) and each group is
# sorted once at the end, instead of re-sorting on every insertion.
# Dict insertion order keeps groups in first-seen order.
grouped_originals = {}
for orig, new in zip(df['dish_name'], df_cleaned['dish_name']):
    grouped_originals.setdefault(new, set()).add(orig)
name_groups = {new: sorted(origs) for new, origs in grouped_originals.items()}

print("\nStandardization groups (showing first 15 groups):")
# Only show names that had variations, limited to the first 15 such groups
groups_with_variations = [
    (standardized_name, original_names)
    for standardized_name, original_names in name_groups.items()
    if len(original_names) > 1
]
for standardized_name, original_names in groups_with_variations[:15]:
    print(f"\n{standardized_name}:")
    for orig_name in original_names:
        print(f"  - {orig_name}")

Initial data shape: (285, 4)

First few rows:
                                dish_name  \
0        Traditional Hijazi almond coffee   
1  Hejaz Shakshuka for Saudi National Day   
2       Saudi meat kabsa and daqoos salad   
3                How to make Saudi kleija   
4               Saudi style chicken kabsa   

                                     classifications  \
0                         loafs | cinnamon | coconut   
1                               egg | cheese | bread   
2  tomatoes | hot green pepper | salt | cumin | r...   
3    dates | haw | cinnamon | ginger | summit | eggs   
4  saffron | haw | cinnamon | mixed spices | whit...   

                                          image_file scrape_date  
0        images/traditional_hejazi_almond_coffee.jpg    30-09-25  
1  images/Shakshuka_Hejazia_for_Saudi_National_Da...    30-09-25  
2       images/Saudi_meat_kabsa_and_dakous_salad.jpg    30-09-25  
3  images/how_to_work_the_college_of_Saudi Arabia...    30-09-25  
4         i