# Automatic Labeling
Use the classifier created in the previous notebook to label each picture as food or non-food. We will use these labels to filter the data so that we can eventually predict the cuisine using only food pictures.

In [1]:
import pandas as pd
import json

# Import json file
# JSON is UTF-8 by convention; passing the encoding explicitly avoids relying on
# the platform's default text encoding.
with open('dataset.json', encoding='utf-8') as file:
    data = json.load(file)

# Flatten the nested records and keep only the column holding each record's image entries
df = pd.json_normalize(data)[['more_details.full_images']]

# One row per image entry; the fresh 0..n-1 index lets the column built below line up row by row
df = df.explode('more_details.full_images').reset_index(drop=True)

# Pull the image_id field out of every image entry
image_ids = pd.json_normalize(df['more_details.full_images'])['image_id']

# Drop rows without an image_id, keep only that column, and turn each id into a '<id>.jpg' filename
# NOTE(review): astype(str) assumes the ids are not stored as floats (a float id
# would become e.g. '123.0.jpg') — confirm against dataset.json
df = (
    df.assign(image_id=image_ids)
    .dropna(subset=['image_id'])
    .loc[:, ['image_id']]
    .reset_index(drop=True)
    .assign(image_id=lambda frame: frame['image_id'].astype(str) + '.jpg')
)

Check for corrupted images and remove them

In [2]:
import os
from PIL import Image
import numpy as np

# Collect the filenames of images that cannot be opened or fail PIL's integrity check
corrupted_images = []
for image_id in df['image_id']:
    image_path = os.path.join('images', str(image_id))
    try:
        # The context manager closes the file handle even when verify() raises,
        # so scanning many images does not accumulate open files
        with Image.open(image_path) as img:
            img.verify()
    except Exception:
        # Any failure (including a missing file) marks the image as unusable.
        # Unlike a bare `except:`, this lets a kernel interrupt propagate.
        corrupted_images.append(image_id)
print(corrupted_images) # ['4215425.jpg']

In [2]:
# Hardcoded list of corrupted images, so the slow integrity scan can be skipped
corrupted_images = ['4215425.jpg']

# Keep only the rows whose image is not in the corrupted list
is_corrupted = df['image_id'].isin(corrupted_images)
df = df[~is_corrupted].copy()

In [3]:
# Persist the image filenames (without the index column) so they can be read back in chunks
df.to_csv(path_or_buf='all_image_ids.csv', index=False)

Load the model

In [4]:
from keras.models import load_model

# Class names, indexed by the position of the classifier's largest output
labels = ['food', 'non-food']

# Restore the saved classifier from disk
model = load_model('food-classifier_resnet50.h5')

Define function for labeling

In [5]:
# Classify images and add label to dataframe
from keras.preprocessing import image
from tensorflow.keras.preprocessing import image
import numpy as np

# Function to classify image
def classify_image(image_path):
    """Return the predicted label for a single image file.

    The image is loaded at 224x224, converted to an array, scaled by 1/255 and
    passed to the module-level `model` as a batch of one. The result is the
    entry of the module-level `labels` list at the index of the largest
    model output.

    Args:
        image_path: Path of the image file to classify.

    Returns:
        The element of `labels` selected by the argmax of the prediction.
    """
    img = image.load_img(image_path, target_size=(224, 224))
    # Add a leading batch axis and rescale pixel values by 1/255
    batch = np.expand_dims(image.img_to_array(img), axis=0) / 255
    # verbose=0 stops Keras from printing a progress bar for every single image
    pred = model.predict(batch, verbose=0)
    # NOTE(review): assumes the model emits one score per entry of `labels`, in
    # the same order — confirm against the notebook that trained the model
    return labels[np.argmax(pred)]

In [7]:
chunk_size = 1000

# Classify images in chunks so results are flushed to disk every `chunk_size` rows.
# Rows are written as `image_id,label` without a header line.
for chunk_number, chunk in enumerate(pd.read_csv('all_image_ids.csv', chunksize=chunk_size)):
    chunk['label'] = chunk['image_id'].apply(lambda x: classify_image('images/' + x))
    # Start a fresh file on the first chunk and append afterwards; appending
    # unconditionally would duplicate every row each time this cell is re-run
    write_mode = 'w' if chunk_number == 0 else 'a'
    chunk.to_csv('image_labels.csv', mode=write_mode, header=False, index=False)


