# 1. Data Acquisition and Preprocessing

In [None]:
# Data manipulation
import pandas as pd

# Libraries for API querying
import json
import requests
from pprint import pprint

# Imports for image scaling and file access
import os
from PIL import Image
from tqdm import tqdm

# For splitting into train and test
import random
import shutil

# For PyTorch
import torch
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms

# For EDA
import matplotlib.pyplot as plt
from collections import Counter

# For calculations
import math

To load API keys and other secret values, we will use a `.env` file.

In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file (if one is found) into the process
# environment, so the os.getenv(...) lookups in later cells can see the API keys.
load_dotenv()

A helper function that will be useful later, during nutrition data retrieval, returns a list of all the food names present in our dataset.

In [None]:

def get_food_names(data_path='food-128'):
    """Return the human-readable food names found under *data_path*.

    Every immediate subdirectory of *data_path* is treated as one food class.
    Underscores in the folder name are replaced with spaces
    (``apple_pie`` -> ``apple pie``); plain files are ignored.

    Args:
        data_path: Directory holding one subfolder per food class. Defaults to
            the downscaled dataset folder ``'food-128'``.

    Returns:
        Sorted list of food names. Sorting keeps the result stable across
        runs and machines, because ``os.listdir`` returns entries in
        arbitrary order.

    Raises:
        FileNotFoundError: If *data_path* does not exist.
    """
    return sorted(
        entry.replace('_', ' ')
        for entry in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, entry))
    )
# Quick sanity check: show the food names discovered in the dataset folder.
print(get_food_names())

Downscaling the images will help massively in getting a working model without needing to use the full image data. The method to do so is shown below.

In [None]:
# Downscale images (ChatGPT)
# Take our raw images and write a 128x128 RGB copy of each one, keeping the
# same class-subfolder layout in the output folder.

# Source and target directories
SRC_DIR = "images"        # original images folder with class subfolders
DST_DIR = "food-128"      # output folder

size = (128, 128)  # target resolution

# Create output root directory
os.makedirs(DST_DIR, exist_ok=True)

# Loop through every class folder
for class_name in os.listdir(SRC_DIR):
    src_class_path = os.path.join(SRC_DIR, class_name)
    dst_class_path = os.path.join(DST_DIR, class_name)

    # Skip non-directories (just in case)
    if not os.path.isdir(src_class_path):
        continue

    os.makedirs(dst_class_path, exist_ok=True)

    # Resize each image
    for img_name in tqdm(os.listdir(src_class_path), desc=f"Resizing {class_name[:20]}"):
        src_img_path = os.path.join(src_class_path, img_name)
        dst_img_path = os.path.join(dst_class_path, img_name)

        try:
            # The context manager closes the source file even when decoding
            # fails part-way, so corrupt images do not leave handles open.
            with Image.open(src_img_path) as src_img:
                img = src_img.convert("RGB").resize(size, Image.Resampling.BILINEAR)
            img.save(dst_img_path, quality=90)
        except Exception as e:
            # Best effort: report unreadable or unsupported files and keep going.
            print(f"Skipped {src_img_path}: {e}")

print("All images resized to 128x128 and saved in food-128")

### FoodData Central API Loading

For loading the API data, we'll first need to define some constants which will be used throughout data retrieval.

In [None]:
# FoodData Central (FDC) connection settings
FDC_BASE_URL = 'https://api.nal.usda.gov/fdc/v1'
# Read the key from the environment; this is None when FDC_API_KEY is not set.
FDC_API_KEY = os.environ.get('FDC_API_KEY')

To search the FoodData Central database, we'll use the following helper functions:

In [None]:
def search_food(query, data_type='Survey (FNDDS)', limit=10, timeout=10):
    """Search FoodData Central for foods matching *query*.

    Args:
        query: Free-text search term, e.g. a dish name.
        data_type: FDC data set to search; any of 'Branded', 'Foundation',
            'Survey (FNDDS)', 'SR Legacy'.
        limit: Maximum number of results requested (sent as ``pageSize``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    parameters = {
        'api_key': FDC_API_KEY,
        'query': query,
        'dataType': data_type,
        'pageSize': limit,
    }
    response = requests.get(
        url=f'{FDC_BASE_URL}/foods/search',
        params=parameters,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()


def get_calories(food):
    """Return the energy value in kcal of a FoodData Central food record.

    Args:
        food: One food dict from a search response; its ``'foodNutrients'``
            list is scanned for an entry named ``'Energy'`` with unit
            ``'KCAL'``.

    Returns:
        The ``'value'`` of the first matching nutrient that has one, or
        ``None`` when the record carries no usable kcal entry.
    """
    # `or []` also covers a record whose 'foodNutrients' key is present but null.
    for nutrient in food.get('foodNutrients') or []:
        # TODO: test if {'foodNutrientId': 34350077, 'nutrientName': 'Energy'} is the correct dict
        if nutrient.get('nutrientName') == 'Energy' and nutrient.get('unitName') == 'KCAL':
            value = nutrient.get('value')
            # An entry without a value is skipped rather than raising KeyError,
            # so one malformed record cannot abort a whole batch of lookups.
            if value is not None:
                return value
    return None


def simplify_results(data):
    """Reduce a FoodData Central search response to name/calorie pairs.

    Args:
        data: Decoded search response; its ``'foods'`` list is read.

    Returns:
        A list of ``{'name': ..., 'calories': ...}`` dicts, one for each food
        for which ``get_calories`` returned a value. Foods without a calorie
        value are left out.
    """
    simplified = []
    for item in data.get('foods', []):
        kcal = get_calories(item)
        if kcal is None:
            # Nothing to report for this food.
            continue
        simplified.append({'name': item['description'], 'calories': kcal})
    return simplified


def estimate_calories_from_results(results, top_n=5):
    """Average the calorie values of the leading search results.

    Args:
        results: List of dicts that may carry a ``'calories'`` entry.
        top_n: How many of the non-``None`` calorie values, taken in order,
            go into the average.

    Returns:
        The mean rounded to one decimal place, or ``None`` when *results* is
        empty or yields no calorie values.
    """
    if not results:
        return None

    # Drop entries without a calorie value first, then keep the leading ones.
    known = [entry['calories'] for entry in results if entry.get('calories') is not None]
    sample = known[:top_n]
    if len(sample) == 0:
        return None

    mean = sum(sample) / len(sample)
    return round(mean, 1)

To test the helper functions' ability to query the API, we can search with a given dish name.

In [None]:
# Since the API calls could fail, use a try except block
try:
    # Ask for a dish name, query FoodData Central, and show the matches that
    # carry a kcal value (count first, then the entries themselves).
    food = input('Type a food:')
    response = search_food(food)
    simplified = simplify_results(response)
    print(len(simplified))
    pprint(simplified)
except Exception as e:
    # Report the failure instead of stopping the notebook.
    print('Error:', e)

Now, we'll need to refine this data to contain only the relevant caloric information for the foods in our dataset. An important aspect of this to note is that the FoodData Central API is limited to 1000 requests per hour by IP address, and will block an IP address for one hour if this limit is exceeded.

In [None]:
# Look up every dish in FoodData Central and record an estimate for each one.
food_names = get_food_names()
all_calorie_data = []
for food in food_names:
    # Handle failures per dish: one failed request must not abort the loop,
    # otherwise all_calorie_data ends up shorter than food_names and the
    # follow-up cells that pair the two lists break.
    try:
        response = search_food(food)
        simplified = simplify_results(response)
    except Exception as e:
        print(f"Error while looking up {food}:", e)
        simplified = []

    # estimate is None when the search failed or returned no calorie values.
    estimate = estimate_calories_from_results(simplified, top_n=5)
    print(f"\n{food}: ~{estimate} kcal/100g")
    all_calorie_data.append({
        'food': food,
        'estimate_calories_per_100g': estimate,
        'results': simplified
    })


With the previous code block, we now have a list of the results of the keyword search for all 101 dishes along with an estimate for kcal/100g. However, some of these contained no entries at all.

In [None]:
# Count and record missing dishes
# Pair each dish with its lookup result instead of indexing by position, so a
# result list that is shorter than food_names cannot raise IndexError.
missing_dishes = [
    name
    for name, entry in zip(food_names, all_calorie_data)
    if entry.get('estimate_calories_per_100g') is None
]

print('Missing dish count:', len(missing_dishes))
pprint(missing_dishes)

To supplement the data which doesn't have an entry in the FoodData Central API, we'll be using the Nutritionix API.

In [None]:
# Nutritionix connection settings
NUTRITIONIX_URL = 'https://trackapi.nutritionix.com/v2/natural/nutrients'
# Credentials come from the environment; each is None when its variable is unset.
NUTRITIONIX_APP_ID = os.environ.get('NUTRITIONIX_APP_ID')
NUTRITIONIX_APP_KEY = os.environ.get('NUTRITIONIX_APP_KEY')

In [None]:
def get_calories_nutritionix(query, timeout=10):
    """Estimate kcal per 100 g for *query* using the Nutritionix API.

    Args:
        query: Food description sent as the ``query`` field of the request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        Calories per 100 g rounded to one decimal place, computed from the
        first food in the response. ``None`` when the request fails, the
        response holds no foods, or the calorie / serving-weight values
        cannot be used.
    """
    headers = {
        'x-app-id': NUTRITIONIX_APP_ID,
        'x-app-key': NUTRITIONIX_APP_KEY,
        'Content-Type': 'application/json'
    }
    body = {
        "query": query
    }

    try:
        response = requests.post(NUTRITIONIX_URL, headers=headers, json=body, timeout=timeout)
        response.raise_for_status()
        # Decoding sits inside the try so a non-JSON body is reported like any
        # other failed request instead of raising.
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        print('Error:', e)
        return None

    foods = data.get("foods", [])
    if not foods:
        print("No nutrition data found for", query)
        return None

    # Extract calorie info from first result
    f = foods[0]
    cal = f.get("nf_calories", 0)
    weight = f.get("serving_weight_grams", 100)

    # A null calorie value or a null/zero serving weight would raise
    # TypeError / ZeroDivisionError below; treat the dish as unresolved.
    if cal is None or not weight:
        print("Incomplete nutrition data for", query)
        return None

    cal_per_100g = cal * (100 / weight)
    return round(cal_per_100g, 1)

When storing this with the rest of our data which had FoodData Central simplified results included, we'll use `None` as the value for the `'results'` key in the food dictionary.

In [None]:
# Index the existing entries by dish name so that a dish FoodData Central could
# not resolve is updated in place. Appending a second entry would give the dish
# two rows in the saved CSV, one of them without an estimate.
entries_by_food = {entry['food']: entry for entry in all_calorie_data}

for food in missing_dishes:
    estimate = get_calories_nutritionix(food)

    entry = entries_by_food.get(food)
    if entry is None:
        # No earlier entry for this dish: fall back to adding a new one.
        entry = {'food': food}
        all_calorie_data.append(entry)
        entries_by_food[food] = entry

    entry['estimate_calories_per_100g'] = estimate
    # None marks that no FoodData Central results back this estimate.
    entry['results'] = None
    print(f'{food}: ', estimate)

In [None]:
# Pretty-print every collected entry for a visual check.
for food_entry in all_calorie_data:
    pprint(food_entry)

For easier use later on, we'll save the calorie estimate information to a CSV file.

In [None]:
# Create a dataframe then store to CSV
rows = [
    {'food': entry['food'], 'kcal/100g': entry['estimate_calories_per_100g']}
    for entry in all_calorie_data
]

calorie_df = pd.DataFrame(rows, columns=['food', 'kcal/100g'])
# info() writes its summary to stdout itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
calorie_df.info()

# to_csv does not create missing parent folders, so make sure 'data' exists.
os.makedirs('data', exist_ok=True)
calorie_df.to_csv('data/calories.csv', index=False)

Turn Image Data Into Train and Test Sets


In [None]:
SRC_DIR = 'food-128'

TRAIN_DIR = 'food-128-split/train'
TEST_DIR = 'food-128-split/test'

TRAIN_FRACTION = 0.8  # share of each class that goes into the training set
SPLIT_SEED = 42       # fixed seed so every run produces the same split

# Output directories
# No exist_ok here: these calls raise FileExistsError when a split already
# exists, so an earlier split is never silently merged with a new one.
os.makedirs(TRAIN_DIR)
os.makedirs(TEST_DIR)

# Dedicated, seeded generator: the split is reproducible and independent of
# whatever else has used the global random state.
rng = random.Random(SPLIT_SEED)

cntr = 0
# Loop through each food subclass (sorted, because os.listdir order is arbitrary)
for class_name in sorted(os.listdir(SRC_DIR)):
    class_path = os.path.join(SRC_DIR, class_name)

    # Ensure its a folder
    if not os.path.isdir(class_path):
        continue

    # Sort before shuffling so the seeded shuffle always starts from the same order
    images = sorted(os.listdir(class_path))
    rng.shuffle(images)

    # Set splitting point
    split_idx = int(len(images) * TRAIN_FRACTION)

    train_images = images[:split_idx]
    test_images = images[split_idx:]

    # Create new food folders in train and test
    train_class_dir = os.path.join(TRAIN_DIR, class_name)
    test_class_dir = os.path.join(TEST_DIR, class_name)
    os.makedirs(train_class_dir)
    os.makedirs(test_class_dir)

    for img in train_images:
        shutil.copy(os.path.join(class_path, img), os.path.join(train_class_dir, img))
    for img in test_images:
        shutil.copy(os.path.join(class_path, img), os.path.join(test_class_dir, img))

    cntr += 1
    print(f"{cntr}. Processesed {class_name}: {len(train_images)} train, {len(test_images)} test")

print("Dataset split successfully.")


Convert Images to new format using PyTorch

In [None]:
# Preprocessing applied to every image as it is loaded
transform = transforms.Compose([
    transforms.ToTensor(),
    # Per-channel RGB mean and standard deviation taken from ImageNet,
    # a very large collection of natural images
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Wrap the train and test folders as labelled datasets (one class per subfolder)
train_dataset = datasets.ImageFolder(root='food-128-split/train', transform=transform)
test_dataset = datasets.ImageFolder(root='food-128-split/test', transform=transform)

# Batched loaders for later use: 32 images per batch, 4 loader workers each.
# Training batches are reshuffled on every pass so ordering cannot be learned;
# test batches keep a fixed order for consistent benchmarks.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)

# Sanity checks: number of classes, plus a few names to confirm they were read correctly
class_cnt = len(train_dataset.classes)
print(f"Num Classes: {class_cnt}")
print(f"Sample Classes: {train_dataset.classes[:10]}")


# 2. Exploratory Data Analysis


## Class Calorie Estimate EDA

In [None]:
# Calorie data loading and summary statistics
CALORIE_FILE_PATH = 'data/calories.csv'
calorie_df = pd.read_csv(CALORIE_FILE_PATH)

# info() prints its own report and returns None, so it is called directly;
# print(calorie_df.info()) would add a stray "None" line to the output.
calorie_df.info()
print(calorie_df.describe())
print(calorie_df.head(5))

In [None]:
# Class Calorie Visualization
# Begin by sorting df by 'kcal/100g'.
# ignore_index=True renumbers the rows 0..n-1 in the sorted order. The earlier
# bare calorie_df.reindex() call returned a new frame that was thrown away, so
# it never changed the index.
calorie_df = calorie_df.sort_values('kcal/100g', ascending=False, ignore_index=True)

# Create bar chart of the kcal/100g estimates for each food
plt.figure(figsize=(5, 18))
plt.title('Bar Chart of Calorie Estimates')
plt.barh(calorie_df['food'], calorie_df['kcal/100g'])
plt.ylabel('Food Name')
plt.xlabel('Estimated kcal/100g')
plt.margins(y = 0.005)

plt.show()

## Image Data EDA
Generate visuals for the average pixel values of each food class.

In [None]:
# Convert images to tensors (no Normalize step in this pipeline)
transform = transforms.Compose([
    transforms.ToTensor()
])

# Create dataset using this transform
dataset = datasets.ImageFolder(root='food-128-split/train', transform = transform)

# Get names of all foods
food_names = dataset.classes
total_num_foods = len(food_names) # should always be 101 but just in case

# Distinct foods to visualize
selected_classes = [
    'sushi',
    'guacamole',
    'spaghetti_bolognese',
    'macarons',
    'chocolate_cake',
    'caesar_salad',
    'strawberry_shortcake',
    'beef_carpaccio'
]
food_count = len(selected_classes)

# Map class names to integer labels used by ImageFolder
selected_indices = [dataset.class_to_idx[c] for c in selected_classes]

# Filter dataset to only images with those labels
# (set lookup keeps the per-sample membership test constant-time)
selected_label_set = set(selected_indices)
subset_indices = [
    i for i, (_, label) in enumerate(dataset.samples)
    if label in selected_label_set
]
subset_dataset = Subset(dataset, subset_indices) # create subset of just selected indices

# Create data loader so images are read in batches
subset_loader = DataLoader(
    subset_dataset,
    batch_size=32,   # process 32 images at once
    shuffle=False,   # don't randomize order for this
    num_workers=4    # use 4 loader workers
)

# Get tensors ready to find averages
H, W = 128, 128
# Running sum of R, G, B values for each pixel position, one slot per selected class
sums = torch.zeros((food_count, 3, H, W))
# Number of images accumulated so far for each selected class
counts = torch.zeros(food_count)
# Running sum of squared pixel values (needed for the standard deviation)
sum_sq = torch.zeros((food_count, 3, H, W))

# Map original labels (0-100) to new compact label (0..food_count-1)
class_map = {orig_label: i for i, orig_label in enumerate(selected_indices)}

# Go through the batches and add up pixel values.
# images has shape (batch, 3, H, W); labels holds one class label per image.
for images, labels in subset_loader:
    for i in range(images.size(0)):  # iterate through each image in batch
        label = labels[i].item() # get current image label
        if label in class_map:
            idx = class_map[label] # get idx in new smaller array
            sums[idx] += images[i] # add current image pixel values
            counts[idx] += 1 # one more image for this class, used for the division below
            sum_sq[idx] += images[i] ** 2

# Calculate average image (pixel-wise division)
# view reshapes counts into (food_count, 1, 1, 1) so it broadcasts against the 4D sums tensor
avg_images = (sums / counts.view(-1, 1, 1, 1))

# Pixel-wise standard deviation: sqrt(E[x^2] - E[x]^2).
# Rounding error can push the variance slightly below zero where a pixel barely
# varies, and sqrt of a negative number is NaN, so clamp at 0 first.
pixel_variance = (sum_sq / counts.view(-1, 1, 1, 1)) - (avg_images ** 2)
std_images = torch.sqrt(torch.clamp(pixel_variance, min=0))

# Get std between different classes
interclass_std = torch.std(avg_images, dim=0)

# Mean over H and W of each average image -> shape (food_count, 3), one value per colour channel
avg_pixels = avg_images.mean(dim=(2, 3))

# Visualize average images
fig, axes = plt.subplots(math.ceil(food_count / 2), 2, figsize=(5, math.ceil(food_count / 2) * 2.5))
axes = axes.flatten()

for i, orig_label in enumerate(selected_indices):
    avg_img = avg_images[i].permute(1, 2, 0).numpy()  # PyTorch stores images as (C, H, W); matplotlib expects (H, W, C)
    axes[i].imshow(avg_img)
    axes[i].set_title(food_names[orig_label], fontsize=12)
    axes[i].axis("off")

# Hide the leftover cell when the number of classes is odd
for ax in axes[food_count:]:
    ax.axis("off")

plt.tight_layout()
plt.show()

# Visualize average pixel channel values
fig, axes = plt.subplots(math.ceil(food_count / 2), 2, figsize=(6, math.ceil(food_count / 2) * 3))
axes = axes.flatten()

for i, orig_label in enumerate(selected_indices):
    channel_names = ['R', 'G', 'B']
    values = avg_pixels[i].tolist()  # plain floats for matplotlib
    colors = ['red', 'green', 'blue']

    axes[i].bar(channel_names, values, color=colors)
    axes[i].set_title(food_names[orig_label])
    axes[i].set_xlabel('Color Channel')
    axes[i].set_ylabel('Value')
    axes[i].set_xmargin(0.1)
    axes[i].set_ylim(0, 1)

# Hide the leftover cell when the number of classes is odd
for ax in axes[food_count:]:
    ax.axis("off")

plt.tight_layout()
plt.show()

# Compute overall std value for each class
class_std_values = std_images.mean(dim=(1, 2, 3))

# Plot std values for deviation in average color between classes
# Possible future improvement could be using a circular subset of the image to get the average color to avoid background noise
plt.figure(figsize=(8, 4))
plt.barh([food_names[i] for i in selected_indices], class_std_values.numpy())
plt.xlabel("Average Standard Deviation")
plt.title("Overall Color Variability per Food Class")
plt.tight_layout()
plt.show()

# Convert to numpy image and reorder to HWC for matplotlib
interclass_std_img = interclass_std.permute(1, 2, 0).numpy()
# Normalize for better contrast (to highlight areas with most deviation)
interclass_std_img = interclass_std_img / interclass_std_img.max()

plt.figure(figsize=(4, 4))
plt.imshow(interclass_std_img)
plt.title("Standard Deviation Across Classes")
plt.axis("off")
plt.show()

Basic Numerical Summaries for Entire Data

In [None]:
# Number of channels and image dimensions (assume all images have same shape)
sample_img, _ = dataset[0]
num_channels, H, W = sample_img.shape

# Count images per class
# ImageFolder keeps the class index of every sample in .targets. Reading it
# avoids iterating the dataset itself, which would load and decode every image
# just to throw the pixels away.
labels = list(dataset.targets)
counts_per_class = Counter(labels)

# Map label index to class name
class_names = dataset.classes

# Print summary
print(f"Total images: {len(dataset)}")
print(f"Image shape: {num_channels} channels, {H}x{W} pixels")
print("Number of images per class:")
for label, count in counts_per_class.items():
    print(f"  {class_names[label]}: {count}")