In [1]:
import pandas as pd

# Replace 'your_file.xlsx' with the path to your Excel file
file_path = 'dataframe_input.xlsx'

# Read the Excel file into a DataFrame
df = pd.read_excel(file_path)

# Now 'df' contains your Excel data in a DataFrame format
df.head(10)

Unnamed: 0,dish_id,path_filename,ingr_1_name,ingr_2_name,ingr_3_name,ingr_4_name,ingr_5_name
0,dish_1556572657,dish_1556572657/rgb.png,olives,,,,
1,dish_1556573514,dish_1556573514/rgb.png,mixed greens,,,,
2,dish_1556575014,dish_1556575014/rgb.png,olives,,,,
3,dish_1556575083,dish_1556575083/rgb.png,brussels sprouts,,,,
4,dish_1556575124,dish_1556575124/rgb.png,celery,,,,
5,dish_1561737271,dish_1561737271/rgb.png,sausage,hash browns,pineapple,broccoli,egg whites
6,dish_1561737293,dish_1561737293/rgb.png,broccoli,cantaloupe,sausage,pineapple,egg whites
7,dish_1561737776,dish_1561737776/rgb.png,bagels,cream cheese,,,
8,dish_1561737839,dish_1561737839/rgb.png,berries,,,,
9,dish_1561739160,dish_1561739160/rgb.png,berries,,,,


In [2]:
df.columns

Index(['dish_id', 'path_filename', 'ingr_1_name', 'ingr_2_name', 'ingr_3_name',
       'ingr_4_name', 'ingr_5_name'],
      dtype='object')

In [3]:
from sklearn.preprocessing import MultiLabelBinarizer

# Handle Missing Values
df.fillna('01_erase_me', inplace=True)

# Tokenize Ingredients
ingredients = df[['ingr_1_name', 'ingr_2_name', 'ingr_3_name', 'ingr_4_name', 'ingr_5_name']].values.tolist()

# Flatten list of ingredients
all_ingredients = [ingredient for sublist in ingredients for ingredient in sublist]

# Prepare Labels
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(ingredients)

# Convert labels to boolean
labels = labels.astype(bool)

# Add encoded labels to DataFrame
for i, label in enumerate(mlb.classes_):
    df[label] = labels[:, i]

# Drop original ingredient columns
df.drop(columns=['ingr_1_name', 'ingr_2_name', 'ingr_3_name', 'ingr_4_name', 'ingr_5_name'], inplace=True)

# You can now use df for your deep learning model
df.head(10)

Unnamed: 0,dish_id,path_filename,01_erase_me,bagels,berries,broccoli,brussels sprouts,cantaloupe,celery,cream cheese,egg whites,hash browns,mixed greens,olives,pineapple,sausage
0,dish_1556572657,dish_1556572657/rgb.png,True,False,False,False,False,False,False,False,False,False,False,True,False,False
1,dish_1556573514,dish_1556573514/rgb.png,True,False,False,False,False,False,False,False,False,False,True,False,False,False
2,dish_1556575014,dish_1556575014/rgb.png,True,False,False,False,False,False,False,False,False,False,False,True,False,False
3,dish_1556575083,dish_1556575083/rgb.png,True,False,False,False,True,False,False,False,False,False,False,False,False,False
4,dish_1556575124,dish_1556575124/rgb.png,True,False,False,False,False,False,True,False,False,False,False,False,False,False
5,dish_1561737271,dish_1561737271/rgb.png,False,False,False,True,False,False,False,False,True,True,False,False,True,True
6,dish_1561737293,dish_1561737293/rgb.png,False,False,False,True,False,True,False,False,True,False,False,False,True,True
7,dish_1561737776,dish_1561737776/rgb.png,True,True,False,False,False,False,False,True,False,False,False,False,False,False
8,dish_1561737839,dish_1561737839/rgb.png,True,False,True,False,False,False,False,False,False,False,False,False,False,False
9,dish_1561739160,dish_1561739160/rgb.png,True,False,True,False,False,False,False,False,False,False,False,False,False,False


In [4]:
df.drop(columns=['01_erase_me'], inplace = True)
df.head(10)

Unnamed: 0,dish_id,path_filename,bagels,berries,broccoli,brussels sprouts,cantaloupe,celery,cream cheese,egg whites,hash browns,mixed greens,olives,pineapple,sausage
0,dish_1556572657,dish_1556572657/rgb.png,False,False,False,False,False,False,False,False,False,False,True,False,False
1,dish_1556573514,dish_1556573514/rgb.png,False,False,False,False,False,False,False,False,False,True,False,False,False
2,dish_1556575014,dish_1556575014/rgb.png,False,False,False,False,False,False,False,False,False,False,True,False,False
3,dish_1556575083,dish_1556575083/rgb.png,False,False,False,True,False,False,False,False,False,False,False,False,False
4,dish_1556575124,dish_1556575124/rgb.png,False,False,False,False,False,True,False,False,False,False,False,False,False
5,dish_1561737271,dish_1561737271/rgb.png,False,False,True,False,False,False,False,True,True,False,False,True,True
6,dish_1561737293,dish_1561737293/rgb.png,False,False,True,False,True,False,False,True,False,False,False,True,True
7,dish_1561737776,dish_1561737776/rgb.png,True,False,False,False,False,False,True,False,False,False,False,False,False
8,dish_1561737839,dish_1561737839/rgb.png,False,True,False,False,False,False,False,False,False,False,False,False,False
9,dish_1561739160,dish_1561739160/rgb.png,False,True,False,False,False,False,False,False,False,False,False,False,False


In [5]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [6]:
train_df.drop(columns=['dish_id', 'path_filename'])

Unnamed: 0,bagels,berries,broccoli,brussels sprouts,cantaloupe,celery,cream cheese,egg whites,hash browns,mixed greens,olives,pineapple,sausage
5,False,False,True,False,False,False,False,True,True,False,False,True,True
0,False,False,False,False,False,False,False,False,False,False,True,False,False
7,True,False,False,False,False,False,True,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False,False
9,False,True,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,True,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False,False,False,False
6,False,False,True,False,True,False,False,True,False,False,False,True,True


In [7]:
train_df.drop(columns=['dish_id', 'path_filename']).dtypes

bagels              bool
berries             bool
broccoli            bool
brussels sprouts    bool
cantaloupe          bool
celery              bool
cream cheese        bool
egg whites          bool
hash browns         bool
mixed greens        bool
olives              bool
pineapple           bool
sausage             bool
dtype: object

In [8]:
from IPython.display import Image, display
import os

# Assuming you have the DataFrame 'train_df' containing file paths
# Let's say you want to display the first image in the DataFrame
image_path = train_df.iloc[0]['path_filename']
image_path
full_path = os.path.join(os.getcwd(), image_path)

full_path
if os.path.exists(full_path):
    print("File exists.")
else:
    print("File does not exist.")


File exists.


In [9]:
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers, models

# Create data generators for images
'''train_datagen is used to generate data for training the model by reading images from a directory, performing transformations, and normalization.

ImageDataGenerator is a class in TensorFlow/Keras that generates batches of augmented/normalized data from image data.
It provides a flexible way to preprocess and augment images on-the-fly during training, without needing to pre-process and store all the images in memory.'''
train_datagen = ImageDataGenerator(rescale=1./255) #used below in train_generator
test_datagen = ImageDataGenerator(rescale=1./255)  #used below in test_generator

'''The train_generator generates batches of augmented/normalized data from image data and labels in the form of a DataFrame.
In the provided code, the train_generator is specifically responsible for generating training data for the model.
It generates batches of data from the DataFrame train_df, which contains information about image paths and corresponding labels (ingredients in this case).'''

train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col='path_filename',
    y_col=train_df.drop(columns=['dish_id', 'path_filename']),
    target_size=(640, 480),
    batch_size=10,
    class_mode='raw'
)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    x_col='path_filename',
    y_col=test_df.drop(columns=['dish_id', 'path_filename']),
    target_size=(640, 480),
    batch_size=10,
    class_mode='raw'
)


#Modified for multi-label classification:
'''Each node in the output layer must use the SIGMOID activation.
This will predict a probability of class membership for the label, a value between 0 and 1.
Finally, the model must be fit with the BINARY CROSS-ENTROPY loss function.
https://machinelearningmastery.com/multi-label-classification-with-deep-learning/'''

model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(640, 480, 3), padding='same'), #32 kernels with size 3,3
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),

    layers.Dense(64, activation='relu'),
    layers.Dense(5, activation='sigmoid')           #num_classes is = number of ingredients
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(train_generator, epochs=10, validation_data=test_generator)

2024-03-07 10:13:16.409649: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-07 10:13:16.409782: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-07 10:13:16.492371: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Found 8 validated image filenames.
Found 2 validated image filenames.


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).