# Data Acquisition and Preprocessing

In [None]:
# Data manipulation
import pandas as pd

# Libraries for API querying
import json
import requests
from pprint import pprint

# Imports for image scaling and file access
import os
from PIL import Image
from tqdm import tqdm

A helper function that will be useful later, during nutrition data retrieval, returns a list of all the food names present in our dataset.

In [None]:

def get_food_names(data_path='food-128'):
    """Return the dish names found as class folders under ``data_path``.

    Each immediate subdirectory of ``data_path`` is treated as one dish.
    Underscores in the folder name are replaced by spaces so the name can
    be used directly as a search query (``apple_pie`` -> ``apple pie``).

    Args:
        data_path: Directory that contains one subfolder per dish.
            Defaults to ``'food-128'``.

    Returns:
        Alphabetically sorted list of dish names. Sorting makes the order
        stable across runs and machines, because ``os.listdir`` returns
        entries in arbitrary order.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
    """
    # Extract food names from folders, ignoring any plain files
    food_names = [
        entry.replace('_', ' ')
        for entry in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, entry))
    ]
    return sorted(food_names)
# Quick check: show the dish names found on disk
print(get_food_names())

Downscaling the images will help massively in getting a working model without needing to use the full image data. The method to do so is shown below.

In [None]:
# Downscale images (ChatGPT)
# Take our raw images from SRC_DIR and write a resized RGB copy of each one
# to the same class subfolder under DST_DIR.

# Source and target directories
SRC_DIR = "images"        # original images folder with class subfolders
DST_DIR = "food-128"      # output folder

size = (128, 128)  # target resolution

# Create output root directory
os.makedirs(DST_DIR, exist_ok=True)

# Loop through every class folder
for class_name in os.listdir(SRC_DIR):
    src_class_path = os.path.join(SRC_DIR, class_name)
    dst_class_path = os.path.join(DST_DIR, class_name)

    # Skip non-directories (just in case)
    if not os.path.isdir(src_class_path):
        continue

    os.makedirs(dst_class_path, exist_ok=True)

    # Resize each image
    for img_name in tqdm(os.listdir(src_class_path), desc=f"Resizing {class_name[:20]}"):
        src_img_path = os.path.join(src_class_path, img_name)
        dst_img_path = os.path.join(dst_class_path, img_name)

        try:
            # The context manager closes the source file even when decoding
            # fails, so unreadable files do not pile up as open handles.
            with Image.open(src_img_path) as raw_img:
                img = raw_img.convert("RGB")
            img = img.resize(size, Image.Resampling.BILINEAR)
            img.save(dst_img_path, quality=90)
        except Exception as e:
            # Best effort: report the unreadable file and keep going
            print(f"Skipped {src_img_path}: {e}")

print("All images resized to 128x128 and saved in food-128")

## FoodData Central API Loading

For loading the API data, we'll first need to define some constants which will be used throughout data retrieval.

In [None]:
# API Setup
API_BASE_URL = 'https://api.nal.usda.gov/fdc/v1'
# Prefer a key supplied through the FDC_API_KEY environment variable.
# NOTE(review): the literal fallback below is a credential committed to
# source; it is kept only so the notebook keeps running unchanged. It should
# be rotated and removed once the environment variable is in use.
API_KEY = os.environ.get('FDC_API_KEY') or 'Ujn2RHBzVpsD6gjTUl83caAgyjbUhEN2HjJcFmn9'

To search the FoodData Central database, we'll use the following helper functions:

In [None]:
def search_food(query, data_type='Survey (FNDDS)', limit=10, timeout=10):
    """Search FoodData Central for foods matching ``query``.

    Args:
        query: Search text, e.g. a dish name.
        data_type: Which dataset to search. Can be any of 'Branded',
            'Foundation', 'Survey (FNDDS)', 'SR Legacy'.
        limit: Maximum number of results requested (sent as ``pageSize``).
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON body of the search response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    parameters = {
        'api_key': API_KEY,
        'query': query,
        'dataType': data_type,
        'pageSize': limit
    }
    response = requests.get(
        url=f'{API_BASE_URL}/foods/search',
        params=parameters,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()


def get_calories(food):
    """Return the energy value in kilocalories for one food entry.

    Scans ``food['foodNutrients']`` for the first entry whose
    ``nutrientName`` is ``'Energy'`` and whose ``unitName`` is kilocalories
    (compared case-insensitively), and returns its ``'value'``.

    Args:
        food: A dict describing one food; may lack ``'foodNutrients'``.

    Returns:
        The calorie value, or ``None`` when no usable energy entry exists.
    """
    for nutrient in food.get('foodNutrients') or []:
        # TODO: test if {'foodNutrientId': 34350077, 'nutrientName': 'Energy'} is the correct dict
        if nutrient.get('nutrientName') != 'Energy':
            continue
        unit = nutrient.get('unitName') or ''
        if unit.upper() != 'KCAL':
            continue
        # An energy entry without a value is skipped rather than raising
        # KeyError, so a later complete entry can still be found.
        value = nutrient.get('value')
        if value is not None:
            return value
    return None


def simplify_results(data):
    """Reduce a search response to a list of name/calorie pairs.

    Args:
        data: Decoded search response; its ``'foods'`` list is read and
            treated as empty when the key is absent.

    Returns:
        A list of ``{'name': ..., 'calories': ...}`` dicts, one for each
        food for which ``get_calories`` found a value. Foods without a
        calorie value are left out.
    """
    simplified = []
    for entry in data.get('foods', []):
        kcal = get_calories(entry)
        # Foods with no calorie information are not useful downstream
        if kcal is None:
            continue
        simplified.append({'name': entry['description'], 'calories': kcal})
    return simplified

To test the helper functions' ability to query the API, we can search with a given dish name.

In [None]:
# Since the API calls could fail, use a try except block
# Interactive smoke test: prompt for a dish name, query the API, then print
# how many calorie entries came back followed by the entries themselves.
try:
    food = input('Type a food:')
    response = search_food(food)
    simplified = simplify_results(response)
    print(len(simplified))
    pprint(simplified)
except Exception as e:
    # Report the failure instead of raising, so the cell completes either way
    print('Error:', e)

Now, we'll need to refine this data to contain only the relevant caloric information for the foods in our dataset. An important aspect of this to note is that the FoodData Central API is limited to 1000 requests per hour by IP address, and will block an IP address for one hour if this limit is exceeded.

In [None]:
# Query the API once per dish name and collect the simplified results.
# Results are appended in the same order as food_names, so
# all_calorie_data[i] belongs to food_names[i].
try:
    food_names = get_food_names()
    all_calorie_data = []
    for food in food_names:
        response = search_food(food)
        simplified = simplify_results(response)
        # Blank line, then the dish name, ahead of that dish's results
        print('', food, sep='\n')
        pprint(simplified)
        all_calorie_data.append(simplified)
except Exception as e:
    # NOTE(review): any exception ends the loop at the failing dish, which
    # leaves all_calorie_data shorter than food_names. Code that indexes
    # both lists by position should account for that.
    print('Error:', e)


With the previous code block, we now have a list of the results of the keyword search for all 101 dishes. However, some of these contain no entries at all, and others contain many (limited to 10 for now) that may all be viable. In general, the first result seems to be the most relevant, but there are exceptions. For now, we should find the names of food items which have no entry at all.

In [None]:
# Count and record missing dishes and dishes with only 1 result
missing_dishes = []
single_dishes = []

# If the retrieval loop stopped early, there are fewer result lists than
# dish names; say so instead of failing with an IndexError further down.
if len(all_calorie_data) != len(food_names):
    print(f'Warning: results exist for only {len(all_calorie_data)} of {len(food_names)} dishes')

# zip pairs each dish with its results and stops at the shorter list
for dish_name, results in zip(food_names, all_calorie_data):
    if not results:
        missing_dishes.append(dish_name)
    elif len(results) == 1:
        single_dishes.append(dish_name)

print('Missing dish count:', len(missing_dishes))
pprint(missing_dishes)
print('\nSingle result dish count:', len(single_dishes))
pprint(single_dishes)

## Turn Image Data Into Train and Test Sets


In [None]:
import random
import shutil

# Copy every class folder of SRC_DIR into an 80/20 train/test split.
SRC_DIR = 'food-128'

TRAIN_DIR = 'food-128-split/train'
TEST_DIR = 'food-128-split/test'

# A dedicated, seeded generator makes the split identical on every run
# without touching the global random state.
SPLIT_SEED = 42
rng = random.Random(SPLIT_SEED)

# Output directories
# os.makedirs without exist_ok raises FileExistsError if a split is already
# on disk, so a new split is never copied on top of an older one.
os.makedirs(TRAIN_DIR)
os.makedirs(TEST_DIR)

cntr = 0
# Loop through each food subclass; sorted because os.listdir returns entries
# in arbitrary order, which would otherwise change the shuffle results.
for class_name in sorted(os.listdir(SRC_DIR)):
    class_path = os.path.join(SRC_DIR, class_name)

    # Ensure it's a folder
    if not os.path.isdir(class_path):
        continue

    # Only regular files are copied; a nested folder would make shutil.copy fail
    images = sorted(
        name for name in os.listdir(class_path)
        if os.path.isfile(os.path.join(class_path, name))
    )

    # Shuffle images
    rng.shuffle(images)

    # Set splitting point: first 80% go to train, the rest to test
    split_idx = int(len(images) * 0.8)

    train_images = images[:split_idx]
    test_images = images[split_idx:]

    # Create new food folders in train and test
    train_class_dir = os.path.join(TRAIN_DIR, class_name)
    test_class_dir = os.path.join(TEST_DIR, class_name)
    os.makedirs(train_class_dir)
    os.makedirs(test_class_dir)

    for img in train_images:
        shutil.copy(os.path.join(class_path, img), os.path.join(train_class_dir, img))
    for img in test_images:
        shutil.copy(os.path.join(class_path, img), os.path.join(test_class_dir, img))

    cntr += 1
    print(f"{cntr}. Processesed {class_name}: {len(train_images)} train, {len(test_images)} test")

print("Dataset split successfully.")
