## Statistical Analysis of the 'Food Ingredients and Recipes Dataset with Images'

- Obtained from: https://www.kaggle.com/datasets/pes12017000148/food-ingredients-and-recipe-dataset-with-images

In [10]:
# Libraries
import os
from PIL import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [11]:
# Dataset locations: the folder holding the images and the CSV mapping recipes to image names
image_folder_path = "food-ingredients-and-recipe-dataset-with-images/Food Images/Food Images"
csv_path = "food-ingredients-and-recipe-dataset-with-images/Food Ingredients and Recipe Dataset with Image Name Mapping.csv"

# Read the recipe table into a DataFrame
data = pd.read_csv(filepath_or_buffer=csv_path)

In [14]:
# Function to analyze image dimensions
def analyze_image_dimensions(image_folder, image_names, extension=".jpg"):
    """Collect the pixel width and height of each named image in a folder.

    Args:
        image_folder: Directory that contains the image files.
        image_names: Iterable of image file names WITHOUT extension.
        extension: Suffix appended to every name to build the file name.
            Defaults to ".jpg", which was previously hard-coded.

    Returns:
        dict with the keys:
            "widths": widths of the images that could be opened, in input order.
            "heights": heights of the same images, aligned with "widths".
            "missing_images": names whose file does not exist in the folder.
            "unreadable_images": names whose file exists but could not be
                opened as an image.
    """
    widths, heights = [], []
    missing_images, unreadable_images = [], []

    for image_name in image_names:
        image_path = os.path.join(image_folder, f"{image_name}{extension}")

        if not os.path.exists(image_path):
            missing_images.append(image_name)
            continue

        try:
            with Image.open(image_path) as img:
                # Read both values before appending so the two lists can
                # never get out of step if one attribute access fails.
                width, height = img.width, img.height
        except OSError:
            # A corrupt or non-image file must not abort the whole scan.
            # NOTE(review): relies on Pillow's UnidentifiedImageError being an
            # OSError subclass -- confirm for the installed Pillow version.
            unreadable_images.append(image_name)
            continue

        widths.append(width)
        heights.append(height)

    return {
        "widths": widths,
        "heights": heights,
        "missing_images": missing_images,
        "unreadable_images": unreadable_images
    }

# Scan the image folder for every image name listed in the CSV
image_stats = analyze_image_dimensions(image_folder_path, data["Image_Name"])

# Summarise widths and heights as Min / Mean / Max; fall back to a message
# when not a single image could be measured
if not image_stats["widths"]:
    width_stats = height_stats = "No images found!"
else:
    image_widths = np.array(image_stats["widths"])
    image_heights = np.array(image_stats["heights"])

    width_stats, height_stats = (
        {"Min": dims.min(), "Mean": dims.mean(), "Max": dims.max()}
        for dims in (image_widths, image_heights)
    )

# Function to analyze text statistics (e.g., title length, vocabulary)
def analyze_text_statistics(titles, ingredients, top_n=10, case_sensitive=True):
    """Compute title-length statistics and word frequencies for two text Series.

    Words are produced by whitespace splitting only, so punctuation attached
    to a word (quotes, brackets, commas) stays part of the token.

    Args:
        titles: pandas Series of title strings; missing values are ignored
            by the length statistics and by the word counts.
        ingredients: pandas Series of ingredient strings.
        top_n: How many of the most / least frequent words to report.
            Defaults to 10, which was previously hard-coded.
        case_sensitive: When True (default) 'With' and 'with' are counted as
            different words. Pass False to lowercase the text before counting.

    Returns:
        dict with the keys:
            "title_length_stats": {"Min", "Mean", "Max"} of the title lengths
                in characters.
            "common_words_titles" / "common_words_ingredients": word -> count
                for the top_n most frequent words.
            "rare_words_titles" / "rare_words_ingredients": word -> count for
                the top_n least frequent words (order among ties is whatever
                value_counts yields).
    """
    # Title length statistics
    title_lengths = titles.str.len()
    title_length_stats = {
        "Min": title_lengths.min(),
        "Mean": title_lengths.mean(),
        "Max": title_lengths.max()
    }

    def _word_counts(texts):
        # Frequency table of whitespace-separated tokens, most frequent first.
        if not case_sensitive:
            texts = texts.str.lower()
        return texts.str.split().explode().value_counts()

    # Count each vocabulary once and reuse it for both the head and the tail
    title_word_counts = _word_counts(titles)
    ingredient_word_counts = _word_counts(ingredients)

    return {
        "title_length_stats": title_length_stats,
        "common_words_titles": title_word_counts.head(top_n).to_dict(),
        "common_words_ingredients": ingredient_word_counts.head(top_n).to_dict(),
        "rare_words_titles": title_word_counts.tail(top_n).to_dict(),
        "rare_words_ingredients": ingredient_word_counts.tail(top_n).to_dict()
    }

# Run the text analysis on the recipe titles and the ingredient strings
text_stats = analyze_text_statistics(data["Title"], data["Ingredients"])

# Number of distinct image names and titles in the CSV.
# dropna=False keeps NaN as one distinct value, so a missing title is
# counted as a caption here.
total_images = data["Image_Name"].nunique(dropna=False)
total_captions = data["Title"].nunique(dropna=False)

# Assemble the report; insertion order is the order in which it is displayed
results = {
    "Total Images": total_images,
    "Total Captions": total_captions,
    "Image Width Stats": width_stats,
    "Image Height Stats": height_stats,
}
results.update({
    label: text_stats[stat_key]
    for label, stat_key in (
        ("Title Length Stats", "title_length_stats"),
        ("Common Words in Titles", "common_words_titles"),
        ("Common Words in Ingredients", "common_words_ingredients"),
        ("Rare Words in Titles", "rare_words_titles"),
        ("Rare Words in Ingredients", "rare_words_ingredients"),
    )
})
results["Missing Images Count"] = len(image_stats["missing_images"])

In [15]:
# Display results
# Each entry is printed as "<label>:", its value on the next line, then a blank line
for label, stat in results.items():
    print(f"{label}:\n{stat}", end="\n\n")

Total Images:
13472

Total Captions:
13306

Image Width Stats:
{'Min': 274, 'Mean': 274.0317719545691, 'Max': 702}

Image Height Stats:
{'Min': 169, 'Mean': 169.04105114690816, 'Max': 722}

Title Length Stats:
{'Min': 3.0, 'Mean': 32.7616330764671, 'Max': 112.0}

Common Words in Titles:
{'with': 4747, 'and': 4281, 'Salad': 953, 'Chicken': 821, 'Sauce': 666, 'Grilled': 570, 'With': 524, 'Cake': 442, 'Roasted': 441, 'Chocolate': 410}

Common Words in Ingredients:
{"'1": 38077, 'cup': 28116, "'2": 21758, 'teaspoon': 15987, 'tablespoons': 15543, "'1/2": 13476, 'or': 11490, 'fresh': 10104, 'cups': 10075, "'1/4": 9686}

Rare Words in Titles:
{'Board': 1, 'Thread': 1, 'Sinigang': 1, 'Bihon': 1, 'Eureka': 1, 'Infinite': 1, 'Burns': 1, 'Bobby': 1, 'Guisado)': 1, 'Hazelnut-Butter': 1}

Rare Words in Ingredients:
{"Nougat',": 1, 'cup/95': 1, "Chocolate']": 1, 'Gluten-free': 1, 'sunflower)': 1, "shoots']": 1, 'halved,layers': 1, "oolong',": 1, '(3-4': 1, 'epazote*': 1}

Missing Images Count:
30



### **Investigation into why there are more images than captions**
- 13,472 images vs 13,306 captions

Unmatched images

In [None]:
# Names (extension stripped) of every entry in the image folder
image_files = {
    os.path.splitext(entry)[0] for entry in os.listdir(image_folder_path)
}

# Distinct image names referenced by the CSV
csv_images = set(data["Image_Name"].unique())

# Entries present in the folder that no CSV row refers to
unmatched_images = image_files.difference(csv_images)
print(f"Unmatched Images: {len(unmatched_images)}")
print(unmatched_images)

Unmatched Images: 111
{'-candy-corn-frozen-citrus-cream-pops-368770', 'caramelized-onion-and-portobello-mushroom-soup-with-goat-cheese-croutons-106175', 'roasted-sweet-potatoes-with-honey-glaze-104728', 'herbed-summer-succotash-102026', '-halibut-confit-with-leeks-coriander-and-lemon-51252690', 'vietnamese-style-beef-noodle-soup-108724', '-pumpkin-gruyere-gratin-with-thyme-51252910', '-hazelnut-butter-and-coffee-meringues-51260360', 'chicken-with-tarragon-caper-sauce-with-mixed-greens-108454', 'pepper-crusted-steaks-with-worcestershire-glazed-portobellos-109702', 'panna-cotta-with-strawberry-vin-santo-sauce-103466', 'arugula-and-pear-salad-with-mascarpone-and-toasted-walnuts-107798', '-lentils-with-cucumbers-chard-and-poached-egg-51260640', 'sauteed-chicken-with-tomatoes-olives-and-feta-108456', 'soft-shelled-crabs-meuniere-104885', 'chilled-corn-soup-with-crab-and-chile-oil-105206', '-like-a-caesar-235836', 'baked-shrimp-toasts-108879', 'maine-lobster-with-wild-mushrooms-and-rosemary-

Identification of Duplicate Captions

In [None]:
# Titles that occur on more than one CSV row, together with their row counts
duplicates = data["Title"].value_counts().loc[lambda counts: counts > 1]
print(f"Number of Captions Associated with Multiple Images: {len(duplicates)}")
print(f"Captions with Multiple Associated Images:\n{duplicates}")

Number of Captions Associated with Multiple Images: 162
Captions with Multiple Associated Images:
Title
Potato Latkes                5
French 75                    5
Chopped Salad                4
Pickled Red Onions           4
Sazerac                      4
                            ..
Pear and Almond Tart         2
Pizza Margherita             2
Salted Chocolate Caramels    2
White Chicken Chili          2
Swedish Meatballs            2
Name: count, Length: 162, dtype: int64


Checking for missing captions

In [None]:
# Rows whose Title is NaN, i.e. CSV entries that carry no caption
missing_captions = data.loc[lambda frame: frame["Title"].isna()]
print(f"Number of Images Without Captions: {len(missing_captions)}")
print(f"Rows with Missing Captions:\n{missing_captions}")

Number of Images Without Captions: 5
Rows with Missing Captions:
       Unnamed: 0 Title Ingredients Instructions  \
11221       11221   NaN          []          NaN   
12373       12373   NaN          []          NaN   
12378       12378   NaN          []          NaN   
12818       12818   NaN          []          NaN   
12829       12829   NaN          []          NaN   

                                              Image_Name Cleaned_Ingredients  
11221  roasted-game-hens-with-caramelized-root-vegeta...                ['']  
12373                      chicken-soup-with-rice-232605                ['']  
12378                           double-lemon-bars-232572                ['']  
12818  pear-and-frangipane-crostata-with-raspberry-vi...                ['']  
12829                  hazelnut-shortbread-sticks-231311                ['']  
