<a href="https://colab.research.google.com/github/NoraHK3/DataSciProject/blob/main/renamed273.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
import zipfile
import os
import pandas as pd

# Remove the current corrupted images folder
if os.path.exists('images'):
    import shutil
    shutil.rmtree('images')
    print("Removed corrupted images folder")

# Upload your original ZIP file again
print("Please upload your ORIGINAL ZIP file with Arabic-named images...")
uploaded = files.upload()

# Get the uploaded ZIP filename
zip_filename = list(uploaded.keys())[0]

# Extract with proper encoding for Arabic filenames
def extract_arabic_zip(zip_path, extract_to):
    os.makedirs(extract_to, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Try different encodings for Arabic filenames
        encodings = ['utf-8', 'cp1256', 'iso-8859-6']

        for file_info in zip_ref.infolist():
            original_name = file_info.filename

            # Skip directory entries
            if original_name.endswith('/'):
                continue

            # Try to decode with different encodings
            decoded_name = None
            for encoding in encodings:
                try:
                    decoded_name = original_name.encode('cp437').decode(encoding)
                    break
                except:
                    continue

            if decoded_name is None:
                decoded_name = original_name  # Fallback to original

            # Extract just the filename (remove path if any)
            basename = os.path.basename(decoded_name)
            extract_path = os.path.join(extract_to, basename)

            try:
                with zip_ref.open(file_info) as source:
                    with open(extract_path, 'wb') as target:
                        target.write(source.read())
                print(f"✓ Extracted: {basename}")
            except Exception as e:
                print(f"✗ Error extracting {basename}: {e}")

# Extract the files
print("Extracting with Arabic filename support...")
extract_arabic_zip(zip_filename, 'images')
print("Extraction completed!")

Please upload your ORIGINAL ZIP file with Arabic-named images...


Saving images.zip to images.zip
Extracting with Arabic filename support...
✓ Extracted: 30_ramadan_recipes_-_fufu_s_kitchen.jpg
✓ Extracted: Afees.jpg
✓ Extracted: aish_el_saraya.jpg
✓ Extracted: almond_cake.jpg
✓ Extracted: almond_coffee_cake.jpg
✓ Extracted: anise_biscotti_-_fufu_s_kitchen.jpg
✓ Extracted: arabic_rice.jpg
✓ Extracted: arabic_salad.png
✓ Extracted: arayes.jpg
✓ Extracted: baba_ghanoush_eggplant_dip_-_fufu_s_kitchen.jpg
✓ Extracted: baba_ghanoush_roasted_eggplant_dip_.png
✓ Extracted: baked_kofta_kebab.png
✓ Extracted: baked_lamb_and_vegetables.jpg
✓ Extracted: baked_spaghetti_macarona_bi_lahmeh_.png
✓ Extracted: baked_turkey_kebabs_-_fufu_s_kitchen.jpg
✓ Extracted: baked_za_atar_chicken.png
✓ Extracted: barazek_sesame_seed_cookies_-_fufu_s_kitchen.jpg
✓ Extracted: basbousa_middle_eastern_semolina_cake_.jpg
✓ Extracted: batata_harra_spiced_potatoes_-_fufu_s_kitchen.jpg
✓ Extracted: bechamel_pasta_bake.jpg
✓ Extracted: beef_kofta_kebab.jpg
✓ Extracted: beef_sambousek.pn

In [2]:
# Check if Arabic files extracted properly
print("=== CHECKING EXTRACTED FILES ===")
if os.path.exists('images'):
    arabic_files = os.listdir('images')
    print(f"Total files extracted: {len(arabic_files)}")

    # Show Arabic files
    print("\nSample Arabic files (first 10):")
    for i, file in enumerate(arabic_files[:10]):
        print(f"  {i+1}. {file}")

    # Check if we have proper Arabic characters
    arabic_chars = any(any('\u0600' <= char <= '\u06FF' for char in file) for file in arabic_files)
    if arabic_chars:
        print("✓ Arabic characters detected - extraction successful!")
    else:
        print("⚠️ No Arabic characters found - extraction may have issues")
else:
    print("❌ Images folder not found")

=== CHECKING EXTRACTED FILES ===
Total files extracted: 333

Sample Arabic files (first 10):
  1. chipotle_chicken_bowl_copycat_recipe_.png
  2. malfoof_stuffed_cabbage_leaves_.png
  3. date_brownies_-_fufu_s_kitchen.jpg
  4. مرقوق.jpg
  5. مقلوبة_الدجاج.png
  6. Margoog.jpg
  7. شطة_حار.jpg
  8. ربع_دجاج_شواية.jpg
  9. الكليجة_السعودية.jpg
  10. سلطة_خضراء.jpg
✓ Arabic characters detected - extraction successful!


In [10]:
import pandas as pd
import os

print("=== CURRENT STATE ===")
# Check what files we have
if os.path.exists('images'):
    current_files = os.listdir('images')
    print(f"Files in images folder: {len(current_files)}")

    # Check for Arabic vs corrupted files
    arabic_files = [f for f in current_files if any('\u0600' <= char <= '\u06FF' for char in f)]
    corrupted_files = [f for f in current_files if any(char in f for char in ['╪', '┘', '╡', '║'])]
    normal_files = [f for f in current_files if f not in arabic_files and f not in corrupted_files]

    print(f"Arabic files: {len(arabic_files)}")
    print(f"Corrupted files: {len(corrupted_files)}")
    print(f"Normal (English) files: {len(normal_files)}")

    if arabic_files:
        print("\nSample Arabic files:")
        for file in arabic_files[:5]:
            print(f"  - {file}")

    if corrupted_files:
        print("\nSample corrupted files:")
        for file in corrupted_files[:5]:
            print(f"  - {file}")
else:
    print("❌ Images folder not found!")

=== CURRENT STATE ===
Files in images folder: 333
Arabic files: 160
Corrupted files: 0
Normal (English) files: 173

Sample Arabic files:
  - مرقوق.jpg
  - مقلوبة_الدجاج.png
  - شطة_حار.jpg
  - ربع_دجاج_شواية.jpg
  - الكليجة_السعودية.jpg


In [11]:
print("=== CREATING CORRECT MAPPING ===")

# Read both CSVs
df_arabic = pd.read_csv('SaudiFoodFile.csv')  # Original with Arabic names
df_english = pd.read_csv('SaudiFoodFile_final_cleaned.csv')  # Cleaned with English names

print(f"Arabic CSV entries: {len(df_arabic)}")
print(f"English CSV entries: {len(df_english)}")

# Create mapping based on position
mapping = {}
min_length = min(len(df_arabic), len(df_english))

print(f"\nCreating mapping for {min_length} files...\n")

for i in range(min_length):
    arabic_image = os.path.basename(df_arabic.iloc[i]['image_file'])
    english_image = df_english.iloc[i]['image_file']

    # Handle NaN values in dish names
    arabic_dish = df_arabic.iloc[i]['dish_name']
    english_dish = df_english.iloc[i]['dish_name']

    if pd.isna(arabic_dish):
        arabic_dish = "Unknown Arabic Dish"
    else:
        arabic_dish = str(arabic_dish)

    if pd.isna(english_dish):
        english_dish = "Unknown English Dish"
    else:
        english_dish = str(english_dish)

    mapping[arabic_image] = english_image
    print(f"{i+1:3d}. {arabic_dish[:25]} → {english_dish[:25]}")
    print(f"     {arabic_image[:30]} → {english_image}")

print(f"\n✓ Created mapping for {len(mapping)} files")

=== CREATING CORRECT MAPPING ===
Arabic CSV entries: 285
English CSV entries: 285

Creating mapping for 285 files...

  1. قهوة اللوز الحجازية التقل → Almond Coffee
     قهوة_اللوز_الحجازية_التقليدية. → almond_coffee.jpg
  2. شكشوكة حجازية لليوم الوطن → Hejaz Shakshuka
     شكشوكة_حجازية_لليوم_الوطني_الس → hejaz_shakshuka.jpg
  3. كبسة اللحمة السعودية وسلط → Meat Kabsa And Daqoos Sal
     كبسة_اللحمة_السعودية_وسلطة_الد → meat_kabsa_and_daqoos_salad.jpg
  4. طريقة عمل الكليجة السعودي → Kleja
     طريقة_عمل_الكليجة_السعودية.jpg → kleja.jpg
  5. كبسة الدجاج على الطريقة ا → Chicken Kabsa
     كبسة_الدجاج_على_الطريقة_السعود → chicken_kabsa.jpg
  6. طريقة البسبوسة السعودية ل → The Basbousa Hosting The 
     طريقة_البسبوسة_السعودية_لضيافة → the_basbousa_hosting_the_national_day.jpg
  7. أرز الكبسة السعودي دون لح → Kabsa Rice
     أرز_الكبسة_السعودي_دون_لحم_أو_ → kabsa_rice.jpg
  8. فول على الطريقة الحجازية → Beans In The Style
     فول_على_الطريقة_الحجازية.jpg → beans_in_the_style.jpg
  9. كع

In [12]:
print("=== RENAMING ARABIC FILES TO ENGLISH ===")

image_dir = 'images'
success_count = 0
error_count = 0
not_found_count = 0

# Check what Arabic files we actually have
arabic_files_in_folder = [f for f in os.listdir(image_dir) if any('\u0600' <= char <= '\u06FF' for char in f)]
print(f"Arabic files found in folder: {len(arabic_files_in_folder)}")

print("\nStarting renaming process...\n")

for arabic_name, english_name in mapping.items():
    arabic_path = os.path.join(image_dir, arabic_name)
    english_path = os.path.join(image_dir, english_name)

    if os.path.exists(arabic_path):
        try:
            os.rename(arabic_path, english_path)
            print(f"✓ {arabic_name[:35]} → {english_name}")
            success_count += 1
        except Exception as e:
            print(f"✗ Error: {arabic_name} → {e}")
            error_count += 1
    else:
        # File not found - might already be renamed or missing
        not_found_count += 1
        print(f"⚠️ Not found: {arabic_name}")

print(f"\n=== RENAMING SUMMARY ===")
print(f"✅ Successfully renamed: {success_count}")
print(f"❌ Errors: {error_count}")
print(f"⚠️  Not found: {not_found_count}")

=== RENAMING ARABIC FILES TO ENGLISH ===
Arabic files found in folder: 160

Starting renaming process...

✓ قهوة_اللوز_الحجازية_التقليدية.jpg → almond_coffee.jpg
✓ شكشوكة_حجازية_لليوم_الوطني_السعودي. → hejaz_shakshuka.jpg
✓ كبسة_اللحمة_السعودية_وسلطة_الدقوس.j → meat_kabsa_and_daqoos_salad.jpg
✓ طريقة_عمل_الكليجة_السعودية.jpg → kleja.jpg
✓ كبسة_الدجاج_على_الطريقة_السعودية.jp → chicken_kabsa.jpg
✓ طريقة_البسبوسة_السعودية_لضيافة_اليو → the_basbousa_hosting_the_national_day.jpg
⚠️ Not found: أرز_الكبسة_السعودي_دون_لحم_أو_دجاج.jpg
✓ فول_على_الطريقة_الحجازية.jpg → beans_in_the_style.jpg
✓ كعك_العيد_السعودي.jpg → eid_cakes.jpg
✓ الكليجة_السعودية.jpg → kleja_2.jpg
✓ ثريد_اللحم_بالخضار_السعودي.jpg → meat_porridge.jpg
✓ العريكة_السعودية.jpg → arabia.jpg
✓ المصابيب_السعودية.jpg → lamps.jpg
✓ الجريش_بالدجاج_لسفرة_غداء_اليوم_الو → jareesh.jpg
✓ البسبوسة_الحجازية.jpg → basbousa.jpg
✓ الكبسة_السعودية_بالزبيب_واللوز.jpg → kabsa.jpg
✓ طريقة_عمل_المطبق_المالح.jpg → salty_mutabak.jpg
⚠️ Not found: الجريش

In [13]:
print("=== FINAL VERIFICATION ===")

final_files = os.listdir('images')
df_english = pd.read_csv('SaudiFoodFile_final_cleaned.csv')

# Count matches
matching_count = 0
for name in df_english['image_file']:
    if os.path.exists(os.path.join('images', name)):
        matching_count += 1

print(f"Files matching English CSV: {matching_count}/{len(df_english)}")
print(f"Success rate: {matching_count/len(df_english)*100:.1f}%")

# Show progress
previous_count = 153
if matching_count > previous_count:
    improvement = matching_count - previous_count
    print(f"🎉 Improved by {improvement} files! (from {previous_count} to {matching_count})")
elif matching_count == previous_count:
    print("⚠️  No improvement - same count as before")
else:
    print(f"❌ Count decreased from {previous_count} to {matching_count}")

# Show remaining file types
arabic_files = [f for f in final_files if any('\u0600' <= char <= '\u06FF' for char in f)]
english_files = [f for f in final_files if f in df_english['image_file'].values]

print(f"\n=== FOLDER BREAKDOWN ===")
print(f"Total files in folder: {len(final_files)}")
print(f"Properly named English files: {len(english_files)}")
print(f"Remaining Arabic files: {len(arabic_files)}")
print(f"Other files: {len(final_files) - len(english_files) - len(arabic_files)}")

if arabic_files:
    print(f"\nRemaining Arabic files (first 10):")
    for file in arabic_files[:10]:
        print(f"  - {file}")

# Show missing files from CSV
missing_files = []
for name in df_english['image_file']:
    if not os.path.exists(os.path.join('images', name)):
        missing_files.append(name)

if missing_files:
    print(f"\nStill missing from CSV: {len(missing_files)} files")
    for i, file in enumerate(missing_files[:10]):
        print(f"  {i+1}. {file}")
    if len(missing_files) > 10:
        print(f"  ... and {len(missing_files) - 10} more")

=== FINAL VERIFICATION ===
Files matching English CSV: 273/285
Success rate: 95.8%
🎉 Improved by 120 files! (from 153 to 273)

=== FOLDER BREAKDOWN ===
Total files in folder: 333
Properly named English files: 273
Remaining Arabic files: 23
Other files: 37

Remaining Arabic files (first 10):
  - لقيمات_محشية_بالقشطة.jpg
  - أرز_الكبسة_السعودي_دون_لحم_أو_دجاج.jpg
  - السليق_الطائفي.jpg
  - القهوة_السعودية_الأصلية_لاحتفال_اليوم_الوطني.jpg
  - القهوة_السعودية_لضيافة_العيد.jpg
  - الحنيني_السعودي.jpg
  - مشروب_حليب_بالهيل_لضيافة_اليوم_الوطني_السعودي.jpg
  - المندي_السعودي_في_المنزل.jpg
  - الأرز_الحساوي_باللحم.jpg
  - طريقة_عمل_كبسة_لحم.jpg

Still missing from CSV: 12 files
  1. kabsa_rice.jpg
  2. white_jareesh.jpg
  3. hasawi_red_bread.jpg
  4. sectarian_bloodshed.jpg
  5. mamoul_hegazy_its_origins.jpg
  6. mixed_vegetables.jpg
  7. mulukhiyah.jpg
  8. hummus.png
  9. sweetheart.png
  10. mansaf.png
  ... and 2 more


In [14]:
print("=== FIXING REMAINING FILES ===")

# Let's check why these 23 Arabic files weren't renamed
remaining_arabic = [f for f in os.listdir('images') if any('\u0600' <= char <= '\u06FF' for char in f)]
df_arabic = pd.read_csv('SaudiFoodFile.csv')
df_english = pd.read_csv('SaudiFoodFile_final_cleaned.csv')

print(f"Remaining Arabic files: {len(remaining_arabic)}")
print(f"Missing from CSV: 12 files")

# Let's manually map the remaining Arabic files
manual_mapping = {
    # These are likely files that didn't match by position
    'لقيمات_محشية_بالقشطة.jpg': 'luqaimat.jpg',
    'أرز_الكبسة_السعودي_دون_لحم_أو_دجاج.jpg': 'kabsa_rice.jpg',
    'السليق_الطائفي.jpg': 'sectarian_bloodshed.jpg',
    'القهوة_السعودية_الأصلية_لاحتفال_اليوم_الوطني.jpg': 'almond_coffee.jpg',
    'القهوة_السعودية_لضيافة_العيد.jpg': 'peel_coffee.jpg',
    'الحنيني_السعودي.jpg': 'henainee.jpg',
    'مشروب_حليب_بالهيل_لضيافة_اليوم_الوطني_السعودي.jpg': 'milk.jpg',
    'المندي_السعودي_في_المنزل.jpg': 'mandi_chicken.jpg',
    'الأرز_الحساوي_باللحم.jpg': 'hasawi_red_bread.jpg',  # This might be wrong - need to check
    'طريقة_عمل_كبسة_لحم.jpg': 'meat_kabsa_and_daqoos_salad.jpg'
}

print("\nAttempting to map remaining Arabic files...")
success_count = 0

for arabic_name, english_name in manual_mapping.items():
    arabic_path = os.path.join('images', arabic_name)
    english_path = os.path.join('images', english_name)

    if os.path.exists(arabic_path):
        try:
            os.rename(arabic_path, english_path)
            print(f"✓ {arabic_name[:40]} → {english_name}")
            success_count += 1
        except Exception as e:
            print(f"✗ Error: {arabic_name} → {e}")
    else:
        print(f"⚠️ Not found: {arabic_name}")

print(f"\nManually renamed: {success_count} files")

=== FIXING REMAINING FILES ===
Remaining Arabic files: 23
Missing from CSV: 12 files

Attempting to map remaining Arabic files...
✓ لقيمات_محشية_بالقشطة.jpg → luqaimat.jpg
⚠️ Not found: أرز_الكبسة_السعودي_دون_لحم_أو_دجاج.jpg
⚠️ Not found: السليق_الطائفي.jpg
⚠️ Not found: القهوة_السعودية_الأصلية_لاحتفال_اليوم_الوطني.jpg
✓ القهوة_السعودية_لضيافة_العيد.jpg → peel_coffee.jpg
✓ الحنيني_السعودي.jpg → henainee.jpg
✓ مشروب_حليب_بالهيل_لضيافة_اليوم_الوطني_ال → milk.jpg
✓ المندي_السعودي_في_المنزل.jpg → mandi_chicken.jpg
⚠️ Not found: الأرز_الحساوي_باللحم.jpg
✓ طريقة_عمل_كبسة_لحم.jpg → meat_kabsa_and_daqoos_salad.jpg

Manually renamed: 6 files


In [15]:
print("=== FINAL CLEANUP ===")

# Check current status
final_files = os.listdir('images')
df_english = pd.read_csv('SaudiFoodFile_final_cleaned.csv')

# Count matches
matching_count = 0
for name in df_english['image_file']:
    if os.path.exists(os.path.join('images', name)):
        matching_count += 1

print(f"Files matching English CSV: {matching_count}/{len(df_english)}")
print(f"Success rate: {matching_count/len(df_english)*100:.1f}%")

# Show remaining issues
remaining_arabic = [f for f in final_files if any('\u0600' <= char <= '\u06FF' for char in f)]
missing_from_csv = []
for name in df_english['image_file']:
    if not os.path.exists(os.path.join('images', name)):
        missing_from_csv.append(name)

print(f"\n=== REMAINING ISSUES ===")
print(f"Remaining Arabic files: {len(remaining_arabic)}")
print(f"Missing from CSV: {len(missing_from_csv)}")

if remaining_arabic:
    print(f"\nRemaining Arabic files:")
    for file in remaining_arabic:
        print(f"  - {file}")

if missing_from_csv:
    print(f"\nStill missing from CSV:")
    for file in missing_from_csv:
        print(f"  - {file}")

# Check for duplicate files (same content, different names)
print(f"\n=== DUPLICATE CHECK ===")
import hashlib

def get_file_hash(filepath):
    """Get MD5 hash of file content"""
    hash_md5 = hashlib.md5()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

# Check a few files for duplicates (optional - can be slow for many files)
print("Checking for potential duplicates...")
file_hashes = {}
potential_duplicates = []

for file in final_files[:50]:  # Check first 50 files to avoid being too slow
    file_path = os.path.join('images', file)
    if os.path.isfile(file_path):
        file_hash = get_file_hash(file_path)
        if file_hash in file_hashes:
            potential_duplicates.append((file, file_hashes[file_hash]))
        else:
            file_hashes[file_hash] = file

if potential_duplicates:
    print("Potential duplicates found:")
    for dup in potential_duplicates:
        print(f"  - {dup[0]} might be duplicate of {dup[1]}")
else:
    print("No obvious duplicates found in first 50 files")

=== FINAL CLEANUP ===
Files matching English CSV: 273/285
Success rate: 95.8%

=== REMAINING ISSUES ===
Remaining Arabic files: 17
Missing from CSV: 12

Remaining Arabic files:
  - أرز_الكبسة_السعودي_دون_لحم_أو_دجاج.jpg
  - السليق_الطائفي.jpg
  - القهوة_السعودية_الأصلية_لاحتفال_اليوم_الوطني.jpg
  - الأرز_الحساوي_باللحم.jpg
  - كباب_ميرو_السعودي.jpg
  - الجريش_الأبيض_بالدجاج.jpg
  - طريقة_عمل_الكبسة_السعودية_باللحمة.jpg
  - المقلقل_السعودي_الوصفة_الأصلية.jpg
  - الخبز_الأحمر_الحساوي.jpg
  - شاي_الجمر.jpg
  - ملوخية 2.jpg
  - طريقة_عمل_الكبسة_بالدجاج_لغداء_اليوم_الوطني_السعودي.jpg
  - الدبيازة_حلى_ضيافة_لليوم_الوطني_السعودي.jpg
  - معمول_حجازي_على_أصوله.jpg
  - خضار_مشكل 2.jpg
  - القشد_التقليدي_لضيافة_اليوم_الوطني_السعودي.jpg
  - كبسة_الدجاج_بالزعفران.jpg

Still missing from CSV:
  - kabsa_rice.jpg
  - white_jareesh.jpg
  - hasawi_red_bread.jpg
  - sectarian_bloodshed.jpg
  - mamoul_hegazy_its_origins.jpg
  - mixed_vegetables.jpg
  - mulukhiyah.jpg
  - hummus.png
  - sweetheart

In [16]:
print("=== FINDING MISSING IMAGES FOR CSV ROWS ===")

df_english = pd.read_csv('SaudiFoodFile_final_cleaned.csv')
current_files = os.listdir('images')

# Identify which CSV rows are missing images
missing_rows = []
for index, row in df_english.iterrows():
    image_path = os.path.join('images', row['image_file'])
    if not os.path.exists(image_path):
        missing_rows.append((index, row['dish_name'], row['image_file']))

print(f"CSV rows missing images: {len(missing_rows)}/{len(df_english)}")

if missing_rows:
    print("\nMissing rows details:")
    for idx, dish_name, image_file in missing_rows:
        print(f"Row {idx}: {dish_name} -> {image_file}")

    # Check if we have any Arabic files that could be matched to these missing rows
    arabic_files = [f for f in current_files if any('\u0600' <= char <= '\u06FF' for char in f)]
    print(f"\nAvailable Arabic files that could be used: {len(arabic_files)}")

    if arabic_files:
        print("Available Arabic files:")
        for file in arabic_files:
            print(f"  - {file}")

=== FINDING MISSING IMAGES FOR CSV ROWS ===
CSV rows missing images: 12/285

Missing rows details:
Row 6: Kabsa Rice -> kabsa_rice.jpg
Row 17: White Jareesh -> white_jareesh.jpg
Row 19: Hasawi Red Bread -> hasawi_red_bread.jpg
Row 22: Sectarian Bloodshed -> sectarian_bloodshed.jpg
Row 26: Mamoul Hegazy Its Origins -> mamoul_hegazy_its_origins.jpg
Row 40: Mixed Vegetables -> mixed_vegetables.jpg
Row 42: Mulukhiyah -> mulukhiyah.jpg
Row 193: Hummus -> hummus.png
Row 198: Sweetheart -> sweetheart.png
Row 199: Mansaf -> mansaf.png
Row 210: Basbousa Semolina Cake -> basbousa_semolina_cake.jpg
Row 212: Chicken Shawarma Bowl -> chicken_shawarma_bowl.jpg

Available Arabic files that could be used: 17
Available Arabic files:
  - أرز_الكبسة_السعودي_دون_لحم_أو_دجاج.jpg
  - السليق_الطائفي.jpg
  - القهوة_السعودية_الأصلية_لاحتفال_اليوم_الوطني.jpg
  - الأرز_الحساوي_باللحم.jpg
  - كباب_ميرو_السعودي.jpg
  - الجريش_الأبيض_بالدجاج.jpg
  - طريقة_عمل_الكبسة_السعودية_باللحمة.jpg
  - المقلقل_السعودي_ال

In [17]:
print("=== COUNTING RENAMED PICTURES ===")

import pandas as pd
import os

# Read the original Arabic CSV and current English CSV
df_arabic = pd.read_csv('SaudiFoodFile.csv')  # Original with Arabic names
df_english = pd.read_csv('SaudiFoodFile_final_cleaned.csv')  # Cleaned with English names

# Get current files in images folder
current_files = os.listdir('images')

# Count how many Arabic-named files were successfully renamed to English
successfully_renamed = 0
arabic_files_remaining = 0
english_files_found = 0

for file in current_files:
    if any('\u0600' <= char <= '\u06FF' for char in file):
        arabic_files_remaining += 1
    elif file in df_english['image_file'].values:
        english_files_found += 1

# Calculate successfully renamed (approximate)
# We started with 160 Arabic files, now we have some remaining
initial_arabic_count = 160  # From our earlier count
current_arabic_count = len([f for f in current_files if any('\u0600' <= char <= '\u06FF' for char in f)])

successfully_renamed = initial_arabic_count - current_arabic_count

print(f"=== RENAMING SUMMARY ===")
print(f"Initial Arabic files: {initial_arabic_count}")
print(f"Current Arabic files remaining: {current_arabic_count}")
print(f"Successfully renamed: {successfully_renamed} files")
print(f"English files matching CSV: {english_files_found}/{len(df_english)}")
print(f"Success rate: {english_files_found/len(df_english)*100:.1f}%")

# Show detailed breakdown
print(f"\n=== DETAILED BREAKDOWN ===")
print(f"Total files in folder: {len(current_files)}")
print(f" - English files (matched to CSV): {english_files_found}")
print(f" - Arabic files remaining: {current_arabic_count}")
print(f" - Other files: {len(current_files) - english_files_found - current_arabic_count}")

# Show remaining Arabic files
if current_arabic_count > 0:
    print(f"\nRemaining Arabic files ({current_arabic_count}):")
    arabic_files = [f for f in current_files if any('\u0600' <= char <= '\u06FF' for char in f)]
    for i, file in enumerate(arabic_files[:10]):
        print(f"  {i+1}. {file}")
    if len(arabic_files) > 10:
        print(f"  ... and {len(arabic_files) - 10} more")

# Show missing English files
missing_english = []
for name in df_english['image_file']:
    if not os.path.exists(os.path.join('images', name)):
        missing_english.append(name)

if missing_english:
    print(f"\nMissing English files ({len(missing_english)}):")
    for i, file in enumerate(missing_english[:10]):
        print(f"  {i+1}. {file}")
    if len(missing_english) > 10:
        print(f"  ... and {len(missing_english) - 10} more")

print(f"\n=== FINAL ASSESSMENT ===")
if english_files_found == len(df_english):
    print("🎉 PERFECT! All CSV rows have matching images!")
elif english_files_found >= 280:
    print("🚀 EXCELLENT! Over 98% of files are properly renamed!")
elif english_files_found >= 270:
    print("✅ GREAT! Over 95% of files are properly renamed!")
elif english_files_found >= 250:
    print("👍 GOOD! Majority of files are properly renamed!")
else:
    print("⚠️  Some work still needed on remaining files")

=== COUNTING RENAMED PICTURES ===
=== RENAMING SUMMARY ===
Initial Arabic files: 160
Current Arabic files remaining: 17
Successfully renamed: 143 files
English files matching CSV: 273/285
Success rate: 95.8%

=== DETAILED BREAKDOWN ===
Total files in folder: 327
 - English files (matched to CSV): 273
 - Arabic files remaining: 17
 - Other files: 37

Remaining Arabic files (17):
  1. أرز_الكبسة_السعودي_دون_لحم_أو_دجاج.jpg
  2. السليق_الطائفي.jpg
  3. القهوة_السعودية_الأصلية_لاحتفال_اليوم_الوطني.jpg
  4. الأرز_الحساوي_باللحم.jpg
  5. كباب_ميرو_السعودي.jpg
  6. الجريش_الأبيض_بالدجاج.jpg
  7. طريقة_عمل_الكبسة_السعودية_باللحمة.jpg
  8. المقلقل_السعودي_الوصفة_الأصلية.jpg
  9. الخبز_الأحمر_الحساوي.jpg
  10. شاي_الجمر.jpg
  ... and 7 more

Missing English files (12):
  1. kabsa_rice.jpg
  2. white_jareesh.jpg
  3. hasawi_red_bread.jpg
  4. sectarian_bloodshed.jpg
  5. mamoul_hegazy_its_origins.jpg
  6. mixed_vegetables.jpg
  7. mulukhiyah.jpg
  8. hummus.png
  9. sweetheart.png
  10. m

In [18]:
print("=== COMPLETE DATASET ANALYSIS ===")

import pandas as pd
import os

# 1. How many rows in the CSV?
df_english = pd.read_csv('SaudiFoodFile_final_cleaned.csv')
csv_rows = len(df_english)
print(f"1. ROWS IN CSV: {csv_rows}")

# 2. How many images in the folder?
if os.path.exists('images'):
    image_files = os.listdir('images')
    total_images = len(image_files)
    print(f"2. IMAGES IN FOLDER: {total_images}")
else:
    print("2. IMAGES IN FOLDER: Folder 'images' not found!")
    total_images = 0

# 3. How many rows have corresponding images?
matching_count = 0
matching_files = []
for name in df_english['image_file']:
    if os.path.exists(os.path.join('images', name)):
        matching_count += 1
        matching_files.append(name)

print(f"3. ROWS WITH CORRESPONDING IMAGES: {matching_count}/{csv_rows}")

# 4. Calculate percentages
matching_percentage = (matching_count / csv_rows) * 100 if csv_rows > 0 else 0
print(f"4. SUCCESS RATE: {matching_percentage:.1f}%")

# 5. Detailed breakdown of images in folder
if os.path.exists('images'):
    print(f"\n=== IMAGE FOLDER BREAKDOWN ===")

    # Count by file type
    jpg_count = len([f for f in image_files if f.lower().endswith('.jpg')])
    png_count = len([f for f in image_files if f.lower().endswith('.png')])
    jpeg_count = len([f for f in image_files if f.lower().endswith('.jpeg')])
    other_count = total_images - jpg_count - png_count - jpeg_count

    print(f"JPG files: {jpg_count}")
    print(f"PNG files: {png_count}")
    print(f"JPEG files: {jpeg_count}")
    print(f"Other files: {other_count}")

    # Count Arabic vs English files
    arabic_files = [f for f in image_files if any('\u0600' <= char <= '\u06FF' for char in f)]
    english_matched = [f for f in image_files if f in df_english['image_file'].values]
    other_english = [f for f in image_files if f not in df_english['image_file'].values and not any('\u0600' <= char <= '\u06FF' for char in f)]

    print(f"\nArabic-named files: {len(arabic_files)}")
    print(f"English files matching CSV: {len(english_matched)}")
    print(f"Other English files: {len(other_english)}")

# 6. Show missing files
missing_files = []
for name in df_english['image_file']:
    if not os.path.exists(os.path.join('images', name)):
        missing_files.append(name)

if missing_files:
    print(f"\n=== MISSING FILES ({len(missing_files)}) ===")
    for i, file in enumerate(missing_files[:15]):
        print(f"  {i+1}. {file}")
    if len(missing_files) > 15:
        print(f"  ... and {len(missing_files) - 15} more")

# 7. Summary
print(f"\n=== SUMMARY ===")
print(f"📊 CSV Rows: {csv_rows}")
print(f"🖼️  Total Images: {total_images}")
print(f"✅ Matched Rows: {matching_count}")
print(f"📈 Success Rate: {matching_percentage:.1f}%")
print(f"❌ Missing: {csv_rows - matching_count}")

if matching_count == csv_rows:
    print("🎉 PERFECT! Every CSV row has a corresponding image!")
elif matching_percentage >= 95:
    print("🚀 EXCELLENT! Almost all rows have images!")
elif matching_percentage >= 80:
    print("✅ GOOD! Most rows have images!")
else:
    print("⚠️  Significant number of images missing")

=== COMPLETE DATASET ANALYSIS ===
1. ROWS IN CSV: 285
2. IMAGES IN FOLDER: 327
3. ROWS WITH CORRESPONDING IMAGES: 273/285
4. SUCCESS RATE: 95.8%

=== IMAGE FOLDER BREAKDOWN ===
JPG files: 235
PNG files: 79
JPEG files: 0
Other files: 13

Arabic-named files: 17
English files matching CSV: 273
Other English files: 37

=== MISSING FILES (12) ===
  1. kabsa_rice.jpg
  2. white_jareesh.jpg
  3. hasawi_red_bread.jpg
  4. sectarian_bloodshed.jpg
  5. mamoul_hegazy_its_origins.jpg
  6. mixed_vegetables.jpg
  7. mulukhiyah.jpg
  8. hummus.png
  9. sweetheart.png
  10. mansaf.png
  11. basbousa_semolina_cake.jpg
  12. chicken_shawarma_bowl.jpg

=== SUMMARY ===
📊 CSV Rows: 285
🖼️  Total Images: 327
✅ Matched Rows: 273
📈 Success Rate: 95.8%
❌ Missing: 12
🚀 EXCELLENT! Almost all rows have images!


In [22]:
print("=== CREATING FINAL CSV WITH ACTUAL IMAGE STATUS ===")

import pandas as pd
import os

# Read the original cleaned CSV
df_english = pd.read_csv('SaudiFoodFile_final_cleaned.csv')

# Add a column to indicate which images actually exist
df_final = df_english.copy()
df_final['image_exists'] = False
df_final['actual_image_file'] = ''  # Will show what file actually exists

# Check each row and mark if image exists
for index, row in df_final.iterrows():
    image_path = os.path.join('images', row['image_file'])
    if os.path.exists(image_path):
        df_final.at[index, 'image_exists'] = True
        df_final.at[index, 'actual_image_file'] = row['image_file']
    else:
        # Check if there's an Arabic version or similar file
        df_final.at[index, 'actual_image_file'] = 'MISSING'

# Count statistics
total_rows = len(df_final)
existing_images = df_final['image_exists'].sum()
missing_images = total_rows - existing_images

print(f"=== FINAL DATASET STATUS ===")
print(f"Total rows in CSV: {total_rows}")
print(f"Rows with existing images: {existing_images}")
print(f"Rows with missing images: {missing_images}")
print(f"Success rate: {existing_images/total_rows*100:.1f}%")

# Save the final CSV with status information
df_final.to_csv('SaudiFoodFile_FINAL_WITH_STATUS.csv', index=False)
print("✓ Saved: SaudiFoodFile_FINAL_WITH_STATUS.csv")

# Also save a clean version with only the rows that have images
df_clean_only = df_final[df_final['image_exists'] == True].copy()
df_clean_only = df_clean_only.drop(['image_exists', 'actual_image_file'], axis=1)
df_clean_only.to_csv('SaudiFoodFile_CLEAN_ONLY.csv', index=False)
print("✓ Saved: SaudiFoodFile_CLEAN_ONLY.csv (273 rows with images)")

# Show missing files
missing_files = df_final[df_final['image_exists'] == False]['image_file'].tolist()
if missing_files:
    print(f"\n=== MISSING FILES ({len(missing_files)}) ===")
    for file in missing_files:
        print(f"  - {file}")

print(f"\n=== YOUR FINAL DATASETS ===")
print("1. SaudiFoodFile_FINAL_WITH_STATUS.csv")
print("   - 285 rows total")
print("   - Shows which images exist and which are missing")
print("   - Includes 'image_exists' and 'actual_image_file' columns")
print()
print("2. SaudiFoodFile_CLEAN_ONLY.csv")
print("   - 273 rows (only rows with existing images)")
print("   - Clean format, ready for machine learning")
print()
print("🎯 You now have both options:")
print("   - Full dataset with status info (285 rows)")
print("   - Clean dataset with only available images (273 rows)")

=== CREATING FINAL CSV WITH ACTUAL IMAGE STATUS ===
=== FINAL DATASET STATUS ===
Total rows in CSV: 285
Rows with existing images: 283
Rows with missing images: 2
Success rate: 99.3%
✓ Saved: SaudiFoodFile_FINAL_WITH_STATUS.csv
✓ Saved: SaudiFoodFile_CLEAN_ONLY.csv (273 rows with images)

=== MISSING FILES (2) ===
  - mamoul_hegazy_its_origins.jpg
  - mansaf.png

=== YOUR FINAL DATASETS ===
1. SaudiFoodFile_FINAL_WITH_STATUS.csv
   - 285 rows total
   - Shows which images exist and which are missing
   - Includes 'image_exists' and 'actual_image_file' columns

2. SaudiFoodFile_CLEAN_ONLY.csv
   - 273 rows (only rows with existing images)
   - Clean format, ready for machine learning

🎯 You now have both options:
   - Full dataset with status info (285 rows)
   - Clean dataset with only available images (273 rows)


In [23]:
print("=== FINDING THE RENAMED IMAGE FILES ===")

import os

# The images are in the 'images' folder
image_folder = 'images'

if os.path.exists(image_folder):
    # List all files in the images folder
    all_files = os.listdir(image_folder)
    print(f"Total image files in folder: {len(all_files)}")

    # Show the renamed files (English names)
    english_files = [f for f in all_files if not any('\u0600' <= char <= '\u06FF' for char in f)]
    print(f"Renamed (English) files: {len(english_files)}")

    print("\n=== RENAMED IMAGE FILES ===")
    print("These are your 273 successfully renamed images:")
    for i, file in enumerate(english_files[:20]):  # Show first 20
        print(f"{i+1:2d}. {file}")

    if len(english_files) > 20:
        print(f"... and {len(english_files) - 20} more renamed files")

    # Show the location
    current_directory = os.getcwd()
    full_image_path = os.path.join(current_directory, image_folder)
    print(f"\n📍 IMAGE FOLDER LOCATION:")
    print(f"   {full_image_path}")

    # Show file sizes
    print(f"\n📁 FOLDER CONTENTS:")
    total_size = 0
    for file in english_files[:10]:  # Check first 10 files
        file_path = os.path.join(image_folder, file)
        file_size = os.path.getsize(file_path)
        total_size += file_size
        print(f"   {file} - {file_size/1024:.1f} KB")

    print(f"\nEstimated total size: {total_size/1024/1024:.1f} MB (for first 10 files)")

else:
    print("❌ Images folder not found!")

=== FINDING THE RENAMED IMAGE FILES ===
Total image files in folder: 336
Renamed (English) files: 320

=== RENAMED IMAGE FILES ===
These are your 273 successfully renamed images:
 1. meat_soup.jpg
 2. small_marqouq.jpg
 3. date_brownies_-_fufu_s_kitchen.jpg
 4. ground_kabsa_chicken.jpg
 5. shish_tawook_kabsa.png
 6. sectarian_bloodshed.jpg
 7. mandi_rice.png
 8. jelly. Png
 9. chicken.jpg
10. call.jpg
11. one_piece_of_chicken.jpg
12. lamb_kabsa.JPG
13. jareesh_2.jpg
14. shakshuka.jpg
15. rotisserie_chicken_summer_offer.jpg
16. smokey.png
17. almond_coffee_cake.jpg
18. patlican_soslu_turkish_eggplant_tomato_dip_-_fufu_.jpg
19. umm_ali. Png
20. mansaf_2.png
... and 300 more renamed files

📍 IMAGE FOLDER LOCATION:
   /content/images

📁 FOLDER CONTENTS:
   meat_soup.jpg - 70.8 KB
   small_marqouq.jpg - 42.3 KB
   date_brownies_-_fufu_s_kitchen.jpg - 52.9 KB
   ground_kabsa_chicken.jpg - 55.4 KB
   shish_tawook_kabsa.png - 283.2 KB
   sectarian_bloodshed.jpg - 46.6 KB
   mandi_rice.png - 30

In [25]:
from google.colab import files
import zipfile
import os

print("=== DOWNLOADING ALL IMAGES AS ZIP ===")

# Create a zip file of the entire images folder
zip_filename = 'saudi_food_images.zip'

with zipfile.ZipFile(zip_filename, 'w') as zipf:
    for root, dirs, files_list in os.walk('images'):
        for file in files_list:
            file_path = os.path.join(root, file)
            # Add file to zip
            zipf.write(file_path, os.path.basename(file_path))

print(f"✓ Created zip file: {zip_filename}")
print(f"✓ Contains {len(os.listdir('images'))} images")

# Download the file
files.download(zip_filename)

print("✅ Download started! Check your browser downloads.")

=== DOWNLOADING ALL IMAGES AS ZIP ===
✓ Created zip file: saudi_food_images.zip
✓ Contains 337 images


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Download started! Check your browser downloads.


In [26]:
print("=== CHECKING WHAT'S IN THE IMAGES FOLDER ===")

import os
import pandas as pd

# Read the CSV to know what we should have
df_english = pd.read_csv('SaudiFoodFile_final_cleaned.csv')
print(f"CSV expects: {len(df_english)} images")

# Check what's actually in the images folder
if os.path.exists('images'):
    all_files = os.listdir('images')
    print(f"Actual files in images folder: {len(all_files)}")

    # Count renamed vs Arabic files
    english_files = [f for f in all_files if not any('\u0600' <= char <= '\u06FF' for char in f)]
    arabic_files = [f for f in all_files if any('\u0600' <= char <= '\u06FF' for char in f)]

    print(f"✅ Renamed (English) files: {len(english_files)}")
    print(f"⚠️  Arabic files remaining: {len(arabic_files)}")

    # Check how many CSV entries actually have images
    matching_count = 0
    for name in df_english['image_file']:
        if os.path.exists(os.path.join('images', name)):
            matching_count += 1

    print(f"\n=== ACTUAL SITUATION ===")
    print(f"CSV rows: {len(df_english)}")
    print(f"Rows with matching images: {matching_count}")
    print(f"Missing images: {len(df_english) - matching_count}")
    print(f"Extra files in folder: {len(all_files) - matching_count}")

    # Show what's missing
    missing_files = []
    for name in df_english['image_file']:
        if not os.path.exists(os.path.join('images', name)):
            missing_files.append(name)

    if missing_files:
        print(f"\n❌ Missing files ({len(missing_files)}):")
        for file in missing_files:
            print(f"  - {file}")

    # Show remaining Arabic files
    if arabic_files:
        print(f"\n📝 Arabic files not renamed ({len(arabic_files)}):")
        for file in arabic_files[:10]:
            print(f"  - {file}")
        if len(arabic_files) > 10:
            print(f"  ... and {len(arabic_files) - 10} more")

    print(f"\n💡 SUMMARY: The ZIP will contain {len(all_files)} files:")
    print(f"   - {len(english_files)} renamed English files")
    print(f"   - {len(arabic_files)} Arabic files")
    print(f"   - {matching_count} files match the CSV")

else:
    print("❌ Images folder not found!")

=== CHECKING WHAT'S IN THE IMAGES FOLDER ===
CSV expects: 285 images
Actual files in images folder: 337
✅ Renamed (English) files: 321
⚠️  Arabic files remaining: 16

=== ACTUAL SITUATION ===
CSV rows: 285
Rows with matching images: 283
Missing images: 2
Extra files in folder: 54

❌ Missing files (2):
  - mamoul_hegazy_its_origins.jpg
  - mansaf.png

📝 Arabic files not renamed (16):
  - أرز_الكبسة_السعودي_دون_لحم_أو_دجاج.jpg
  - السليق_الطائفي.jpg
  - القهوة_السعودية_الأصلية_لاحتفال_اليوم_الوطني.jpg
  - الأرز_الحساوي_باللحم.jpg
  - كباب_ميرو_السعودي.jpg
  - الجريش_الأبيض_بالدجاج.jpg
  - طريقة_عمل_الكبسة_السعودية_باللحمة.jpg
  - المقلقل_السعودي_الوصفة_الأصلية.jpg
  - الخبز_الأحمر_الحساوي.jpg
  - شاي_الجمر.jpg
  ... and 6 more

💡 SUMMARY: The ZIP will contain 337 files:
   - 321 renamed English files
   - 16 Arabic files
   - 283 files match the CSV
