# **Data Collection for Food Ingredient Image Classification by Web Scraping from Unsplash**

This notebook collects additional image data for a food ingredient image classification task by **web scraping** **Unsplash**.

The dataset has a rich collection of food ingredient images suitable for training a **Convolutional Neural Network (CNN)**.

### **Importing necessary libraries**

In [None]:
import os
import shutil
import requests
from bs4 import BeautifulSoup
from PIL import Image
import urllib
import tensorflow as tf
from keras.utils import image_dataset_from_directory
import matplotlib.pyplot as plt
from keras.applications.mobilenet_v2 import preprocess_input


### **Set current working directory path**

In [2]:
# Root folder under which the per-split dataset directories are created.
cwd = './Dataset'

### **Creating list of datasets**

In [18]:
# Names of the dataset splits; each one becomes a sub-directory of `cwd`.
datasets = [
    'valid',
    'test',
    'train',
]

### **Organizing Classes into Directories**

In [8]:
# Ingredient class names, in the order they appear as keys of `classes`.
# Keeping each name in exactly one place guarantees that a dictionary key and
# the folder name derived from it can never drift apart.
ingredient_names = [
    'Apple', 'Artichoke', 'Asparagus', 'Avocado', 'Bacon', 'Banana',
    'Beetroot', 'Bitter Gourd', 'Bottle Gourd', 'Bread', 'Brinjal',
    'Broccoli', 'Butter', 'Cabbage', 'Capsicum', 'Carrot', 'Cauliflower',
    'Cheese', 'Chicken', 'Chickpeas', 'Chili Pepper', 'Chili Powder',
    'Chowmein Noodles', 'Cinnamon', 'Coriander', 'Corn', 'Cornflake',
    'Crab Meat', 'Cucumber', 'Egg', 'Fish', 'Garlic', 'Ginger',
    'Green Mint', 'Green Peas', 'Soyabean', 'Ice', 'Jack Fruit', 'Ketchup',
    'Kimchi', 'Lemon', 'Mayonnaise', 'Milk', 'Drumsticks', 'Mushroom',
    'Mutton', 'Okra', 'Olive Oil', 'Onion', 'Spring onion', 'Orange',
    'Spinach', 'Paneer', 'Papaya', 'Pea', 'Pear', 'Pointed Gourd', 'Potato',
    'Pumpkin', 'Radish', 'Red Beans', 'Red Lentils', 'Rice', 'Salt',
    'Sausage', 'Seaweed', 'Snake Gourd', 'Soy Sauce', 'Soya Chunks',
    'Strawberry', 'Sugar', 'Sweet Potato', 'Tomato', 'Turnip', 'Walnut',
    'Watermelon', 'Wheat', 'Yellow Lentils', 'Mango', 'Pomegranate',
    'Pineapple', 'Kiwi',
]

# Map every ingredient to its list of directories, one per entry of
# `datasets` and in the same order: <cwd>/<dataset>/<ingredient>.
classes = {
    name: [os.path.join(cwd, dataset, name) for dataset in datasets]
    for name in ingredient_names
}

### **Create Directories for Datasets**

In [15]:
# Create the dataset root directory that `cwd` points to.
# `exist_ok=True` lets this cell be re-run without raising FileExistsError
# when the directories are already present.
os.makedirs(cwd, exist_ok=True)

# Create one sub-directory per dataset split inside the root directory.
for dataset in datasets:
    os.makedirs(os.path.join(cwd, dataset), exist_ok=True)

### **Store the number of images to collect for each dataset**

In [19]:
# Images to collect per ingredient; the scraping loop reads index 0 for
# 'valid', index 1 for 'test' and index 2 for every other split.
number_of_images = [
    10,
    10,
    30,
]

### **Define a function to scrape images from Unsplash for a given item and save them to the specified directory**

In [20]:
def scrape_images_for_items(item, directory, num_images_per_item, min_width=300, min_height=300):
    """
    Scrape and download images related to the specified item from Unsplash.

    The Unsplash search page for ``item`` is fetched once, every ``<img>``
    tag on it is tried in document order, and an image is kept only when it
    is at least ``min_width`` x ``min_height`` pixels. Kept files are named
    ``<item>_<n>.jpg`` with ``n`` counting up from 0. Progress and per-image
    errors are printed; a failing image never aborts the whole run.

    Args:
        item (str): The item to search for images.
        directory (str): The directory to save the downloaded images. It is
            created if it does not exist.
        num_images_per_item (int): The number of images to download for the
            specified item. Nothing is downloaded when this is zero or
            negative.
        min_width (int, optional): The minimum width of the downloaded images (default is 300).
        min_height (int, optional): The minimum height of the downloaded images (default is 300).

    Returns:
        None
    """
    # `import urllib` on its own does not guarantee that the `request`
    # submodule is loaded, so import the submodules that are actually used.
    import urllib.parse
    import urllib.request

    # Create a directory to save the images
    os.makedirs(directory, mode=0o755, exist_ok=True)

    # Nothing requested: without this guard the loop below would never reach
    # its stop condition and would download every image on the page.
    if num_images_per_item <= 0:
        return

    # Define the URL to scrape; quote the item so names containing spaces or
    # reserved characters still form a valid path segment.
    url = "https://unsplash.com/s/photos/{}".format(urllib.parse.quote(item, safe=''))

    # Send a GET request to the URL. A timeout keeps a stalled connection
    # from hanging the run, and an HTTP error page is reported, not parsed.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print("Error fetching search page for {}: {}".format(item, e))
        return

    # Parse the HTML content and find all image tags
    soup = BeautifulSoup(response.text, 'html.parser')
    img_tags = soup.find_all('img')

    # Download and save images
    downloaded_images = 0
    for img_tag in img_tags:
        img_url = img_tag.get('src')

        # Skip tags without a usable source. Only http(s) is accepted because
        # urlretrieve would otherwise also follow file:// or data: URLs taken
        # from untrusted HTML.
        if not img_url or not img_url.startswith(('http://', 'https://')):
            continue

        img_name = "{}_{}.jpg".format(item, downloaded_images)
        img_path = os.path.join(directory, img_name)

        try:
            urllib.request.urlretrieve(img_url, img_path)

            # Check the dimensions of the downloaded image; the context
            # manager closes the file so it can be deleted afterwards.
            with Image.open(img_path) as img:
                width, height = img.size
        except Exception as e:
            # Best effort: report the problem, drop any partial file and
            # move on to the next image tag.
            print("Error downloading image {} for {}: {}".format(downloaded_images + 1, item, e))
            if os.path.exists(img_path):
                os.remove(img_path)
            continue

        if width < min_width or height < min_height:
            # Delete the image if it does not meet the size requirements
            os.remove(img_path)
            continue

        downloaded_images += 1
        print("Image {}/{} for {} downloaded successfully".format(downloaded_images, num_images_per_item, item))

        # If the required number of images is reached, stop
        if downloaded_images >= num_images_per_item:
            break

### **Iterate over each ingredient and its corresponding dataset paths and scrape respective images**

In [23]:
# Scrape every ingredient into each of its split directories.
#
# NOTE(review): each call below requests the same Unsplash search page for the
# ingredient and keeps the first qualifying images, so the valid/test/train
# folders of one ingredient presumably contain overlapping pictures. Confirm
# and de-duplicate before relying on these splits for evaluation.

# Number of images per split name; any other split falls back to index 2.
images_per_split = {
    'valid': number_of_images[0],
    'test': number_of_images[1],
}

# Iterate over each ingredient and its corresponding dataset paths
for ingredient, split_paths in classes.items():

    # For each path belonging to the ingredient
    for path in split_paths:
        # Paths have the form <root>/<split>/<ingredient>. Taking the parent
        # directory's name works with any path separator and any root depth,
        # unlike splitting on '/' and indexing a fixed position.
        split_name = os.path.basename(os.path.dirname(path))
        print(split_name)  # Print the split name

        # Determine the number of images per item based on the split
        num_images_per_item = images_per_split.get(split_name, number_of_images[2])

        # Call the scrape_images_for_items function for the ingredient, path, and number of images per item
        scrape_images_for_items(ingredient, path, num_images_per_item)

valid
Image 1/10 for Apple downloaded successfully
Image 2/10 for Apple downloaded successfully
Image 3/10 for Apple downloaded successfully
Image 4/10 for Apple downloaded successfully
Image 5/10 for Apple downloaded successfully
Image 6/10 for Apple downloaded successfully
Image 7/10 for Apple downloaded successfully
Image 8/10 for Apple downloaded successfully
Image 9/10 for Apple downloaded successfully
Image 10/10 for Apple downloaded successfully
test
Image 1/10 for Apple downloaded successfully
Image 2/10 for Apple downloaded successfully
Image 3/10 for Apple downloaded successfully
Image 4/10 for Apple downloaded successfully
Image 5/10 for Apple downloaded successfully
Image 6/10 for Apple downloaded successfully
Image 7/10 for Apple downloaded successfully
Image 8/10 for Apple downloaded successfully
Image 9/10 for Apple downloaded successfully
Image 10/10 for Apple downloaded successfully
train
Image 1/30 for Apple downloaded successfully
Image 2/30 for Apple downloaded succ

### **Convert all images to PNG files**

In [24]:
# Re-encode every scraped image as PNG, mirroring the
# <split>/<ingredient>/ layout under the 'converted_img' directory.
for dataset in datasets:

    # Define the directory containing the downloaded images
    input_dir = os.path.join(cwd, dataset)
    print(input_dir)

    # Define the output directory for PNG images
    output_dir = os.path.join('converted_img', dataset)
    print(output_dir)

    try:
        # Ensure the output directory exists
        os.makedirs(output_dir, exist_ok=True)

        # Loop through each folder in the input directory
        for folder in os.listdir(input_dir):
            source_folder = os.path.join(input_dir, folder)

            # Skip stray files next to the class folders; listing them as a
            # directory would raise and abort the rest of this split.
            if not os.path.isdir(source_folder):
                continue

            filenames = os.listdir(source_folder)
            target_folder = os.path.join(output_dir, folder)

            # Create the class folder once, and only when there is something
            # to convert, so empty class folders are not mirrored.
            if filenames:
                os.makedirs(target_folder, exist_ok=True)

            # Loop through each image file in the folder
            for filename in filenames:
                try:
                    # Open the source image
                    with Image.open(os.path.join(source_folder, filename)) as img:
                        png_filename = os.path.splitext(filename)[0] + '.png'

                        # PNG cannot store CMYK data, so convert such
                        # images to RGB before saving.
                        png_ready = img.convert('RGB') if img.mode == 'CMYK' else img

                        # Save the image in PNG format to the output directory
                        png_ready.save(os.path.join(target_folder, png_filename), format='PNG')
                except Exception as e:
                    # Best effort: report the file and continue with the next one.
                    print("Error processing file '{}' in folder '{}': {}".format(filename, folder, e))
    except OSError as e:
        # Directory-level failure (e.g. a missing split folder): report it and
        # continue with the next split.
        print("Error processing dataset '{}': {}".format(dataset, e))


print("Conversion complete.")

./Dataset/valid
converted_img/valid
./Dataset/test
converted_img/test
./Dataset/train
converted_img/train
Conversion complete.
