<a href="https://colab.research.google.com/github/NoraHK3/DataSciProject/blob/main/all_preivious_%2B_images_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd
import re
import os
import numpy as np

# Read the cleaned dataset that the rest of this cell works on
df = pd.read_csv('SaudiFoodFile_cleaned.csv')

# Overview: dimensions plus a preview of the two columns used below
print("Initial data shape:", df.shape)
print("\nFirst few rows:")
preview_columns = ['dish_name', 'image_file']
print(df[preview_columns].head())

# Inspect dish_name for missing or non-string entries before deriving
# filenames from it
dish_name_column = df['dish_name']
non_string_mask = dish_name_column.apply(lambda value: not isinstance(value, str))
non_string_sample = df[non_string_mask].head()
print(f"\nData types: {dish_name_column.dtype}")
print(f"Missing values in dish_name: {dish_name_column.isna().sum()}")
print(f"Non-string values sample: {non_string_sample}")

# Function to create clean image filename from dish name
def create_image_filename(dish_name, original_image_file):
    """
    Build a filesystem-friendly image filename from a dish name.

    The dish name is lowercased, stripped of characters other than word
    characters, whitespace and dashes, and runs of whitespace/dashes are
    collapsed into a single underscore. The extension of the original image
    file is kept unchanged.

    Duplicate filenames are NOT resolved here; callers must disambiguate
    rows that map to the same name.

    Args:
        dish_name: Dish name to convert. Missing (NaN/None) or non-string
            values fall back to the original image's base name.
        original_image_file: Current image path. Supplies the extension and
            the fallback name. Missing or non-string values are treated as
            an empty path.

    Returns:
        The cleaned name followed by the original extension. The name part
        can be empty only when neither input yields usable characters.
    """
    def _slugify(text):
        # Drop punctuation, then collapse whitespace/dashes to underscores.
        text = re.sub(r'[^\w\s-]', '', text.lower())
        text = re.sub(r'[-\s]+', '_', text)
        # Surrounding whitespace or dashes would otherwise leave stray
        # underscores at the ends of the filename.
        return text.strip('_')

    # A NaN path (float) would make os.path raise TypeError.
    original_path = original_image_file if isinstance(original_image_file, str) else ""
    original_stem, file_extension = os.path.splitext(os.path.basename(original_path))

    # NaN and None are not str, so the isinstance check covers missing names.
    clean_name = _slugify(dish_name) if isinstance(dish_name, str) else ""

    # Fall back to the original file name when the dish name is missing or
    # consists only of characters that get stripped (avoids a bare ".jpg").
    if not clean_name:
        clean_name = _slugify(original_stem)

    return f"{clean_name}{file_extension}"

# Derive a base filename for every row from its dish name and image path
print("\nCreating standardized image filenames...")


def _base_filename_for_row(row):
    """Return the standardized base filename for one dataframe row."""
    return create_image_filename(row['dish_name'], row['image_file'])


# Row-wise application; duplicates are resolved in a later step
df['base_image_file'] = df.apply(_base_filename_for_row, axis=1)

# Handle duplicates by adding incremental IDs
print("\nHandling duplicate image filenames...")

# duplicate_count maps each base filename to the number of rows that
# produced it; the duplicate report further down reads it.
duplicate_count = {}

# Every filename already handed out. Checking against it guarantees that a
# generated "<name>_<n>" never clashes with another row's own base name
# (e.g. rows "kabsa", "kabsa" and a dish literally called "kabsa 2").
assigned_names = set()
new_image_files = []

for base_name in df['base_image_file']:
    occurrence = duplicate_count.get(base_name, 0) + 1
    duplicate_count[base_name] = occurrence

    final_name = base_name
    if occurrence > 1 or final_name in assigned_names:
        # Add ID to duplicate (before extension); the first repeat gets _2.
        name_without_ext, ext = os.path.splitext(base_name)
        suffix = max(occurrence, 2)
        final_name = f"{name_without_ext}_{suffix}{ext}"
        # Skip over IDs that are already taken by another row.
        while final_name in assigned_names:
            suffix += 1
            final_name = f"{name_without_ext}_{suffix}{ext}"

    assigned_names.add(final_name)
    new_image_files.append(final_name)

# Single positional column assignment instead of per-row df.at writes
df['new_image_file'] = new_image_files

# Preview the first rows: dish name, old image path, new filename
print("\nImage filename standardization examples:")
sample_size = min(20, len(df))
sample_rows = df.iloc[:sample_size]
separator = "-" * 60
for dish, old_image, new_image in zip(
    sample_rows['dish_name'],
    sample_rows['image_file'],
    sample_rows['new_image_file'],
):
    print(f"Dish: {dish}")
    print(f"Original image: {old_image}")
    print(f"New image: {new_image}")
    print(separator)

# Keep only the base filenames that more than one row produced
duplicates = {}
for base_name, occurrences in duplicate_count.items():
    if occurrences > 1:
        duplicates[base_name] = occurrences

if not duplicates:
    print("\nNo duplicate image names found!")
else:
    print(f"\nFound {len(duplicates)} image names with duplicates:")
    for base_name, occurrences in list(duplicates.items())[:15]:
        print(f"  - {base_name}: {occurrences} occurrences")

    # For a handful of duplicated names, list the filename each row received
    print("\nExamples of duplicate resolution:")
    for duplicate_name in list(duplicates)[:10]:
        same_base = df.loc[df['base_image_file'] == duplicate_name]
        print(f"\n{duplicate_name}:")
        for new_image, dish in zip(same_base['new_image_file'], same_base['dish_name']):
            print(f"  - {new_image} (from: {dish})")

# Work on a copy so the intermediate columns stay available on df
df_final = df.copy()

# Preserve the source path in a new trailing column, then overwrite
# image_file in place so it keeps its original column position
df_final['image_file_original'] = df_final['image_file']
df_final['image_file'] = df_final.pop('new_image_file')

# Remove the remaining helper column
del df_final['base_image_file']

print(f"\nFinal data shape: {df_final.shape}")

# Write the result without the index column
output_filename = 'SaudiFoodFile_final_cleaned.csv'
df_final.to_csv(output_filename, index=False)
print(f"\nFinal cleaned data saved to: {output_filename}")

# Headline numbers for the renaming step
banner = "=" * 80
print(f"\n{banner}")
print("IMAGE FILENAME STANDARDIZATION SUMMARY")
print(banner)

total_dishes = len(df_final)
unique_original_names = df['image_file'].nunique()
unique_new_names = df_final['image_file'].nunique()
print(f"Total dishes: {total_dishes}")
print(f"Unique original image names: {unique_original_names}")
print(f"Unique new image names: {unique_new_names}")
print(f"Duplicates handled: {len(duplicates)}")

# For the 15 most frequent dish names, list every filename assigned to them
print("\nMost common dish names and their new image files:")
common_dishes = df_final['dish_name'].value_counts().head(15)
for dish, count in common_dishes.items():
    print(f"\n{dish} (appears {count} times):")
    is_this_dish = df_final['dish_name'] == dish
    for img in df_final.loc[is_this_dish, 'image_file']:
        print(f"  - {img}")

# Flag dish names shorter than three characters
print("\nChecking for problematic dish names:")
if 'dish_name' in df_final.columns:
    short_names = df_final[df_final['dish_name'].str.len() < 3]
else:
    short_names = pd.DataFrame()

if len(short_names) > 0:
    print("Very short dish names found:")
    for short_dish, short_image in zip(short_names['dish_name'], short_names['image_file']):
        print(f"  - '{short_dish}' -> {short_image}")

# Print the first 30 mappings so the renaming can be checked by eye
print("\nComplete filename mapping (first 30 entries):")
print("Dish Name -> Original Image -> New Image")
mapping_preview = df_final.iloc[:min(30, len(df_final))]
for raw_name, original_image, new_image in zip(
    mapping_preview['dish_name'],
    mapping_preview['image_file_original'],
    mapping_preview['image_file'],
):
    # Non-string entries (e.g. NaN) are shown with a placeholder label
    dish_name = raw_name if isinstance(raw_name, str) else "MISSING_NAME"
    print(f"{dish_name} -> {original_image} -> {new_image}")

# Additional: list every row whose dish name is missing
missing_dish_names = df_final[df_final['dish_name'].isna()]
if len(missing_dish_names) > 0:
    print(f"\nWARNING: Found {len(missing_dish_names)} rows with missing dish names:")
    for idx, original_image, new_image in zip(
        missing_dish_names.index,
        missing_dish_names['image_file_original'],
        missing_dish_names['image_file'],
    ):
        print(f"  - Row {idx}: Original image: {original_image}, New image: {new_image}")

Initial data shape: (285, 5)

First few rows:
                     dish_name  \
0                Almond Coffee   
1              Hejaz Shakshuka   
2  Meat Kabsa And Daqoos Salad   
3                        Kleja   
4                Chicken Kabsa   

                                          image_file  
0        images/traditional_hejazi_almond_coffee.jpg  
1  images/Shakshuka_Hejazia_for_Saudi_National_Da...  
2       images/Saudi_meat_kabsa_and_dakous_salad.jpg  
3  images/how_to_work_the_college_of_Saudi Arabia...  
4         images/Kabsa_chicken_style_Saudi_style.jpg  

Data types: object
Missing values in dish_name: 1
Non-string values sample:     dish_name classifications         image_file scrape_date  \
131       NaN    unclassified  images/creamy.png    30-09-25   

    dish_name_original  
131             Creamy  

Creating standardized image filenames...

Handling duplicate image filenames...

Image filename standardization examples:
Dish: Almond Coffee
Original image: imag