# **Food Item Recognition**

We will be using the following 3 models to test which one of them is ideal to our current scenario.
*   **SVM**
*   **Random Forest**
*   **CNN**

We need to classify images into 1 of 9 classes using these models.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import os

# Get the path to the data folder
data_folder = "/content/drive/MyDrive/Food Item Recognition From Images1/data"

# Get a list of all folders in the data folder
folders = pd.Series(os.listdir(data_folder)).sort_values()

# Print the list of folders
print(folders)


0               1
3               2
2               3
4               4
6               5
5               6
7               7
8               8
9               9
1    category.txt
dtype: object


# **I. Support Vector Machines**


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Importing required libraries
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Directory containing the dataset
data_dir = "/content/drive/MyDrive/data"

# List of classes (subdirectories) in the dataset directory
classes = os.listdir(data_dir)

# Total number of classes
num_classes = len(classes)

# Lists to store image data and corresponding labels
X = []  # Image data
y = []  # Labels

# Loop through each class directory
for i, class_name in enumerate(classes):
    # Path to the current class directory
    class_dir = os.path.join(data_dir, class_name)

    # Loop through each image in the class directory
    for img_name in os.listdir(class_dir):
        # Path to the current image
        img_path = os.path.join(class_dir, img_name)

        img = cv2.imread(img_path)

        # Check if the image is successfully loaded and not empty
        if img is not None and img.size != 0:
            # Resize the image to a fixed size (100x100)
            img = cv2.resize(img, (100, 100))
            # Convert the image to grayscale
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            # Flatten the image array to convert it into a 1D array
            X.append(img.flatten())
            # Append the label (class index) to the y list
            y.append(i)

# Convert lists to numpy arrays
X = np.array(X)  # Image data
y = np.array(y)  # Labels

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a Support Vector Machine (SVM) classifier with a linear kernel
svm_model = SVC(kernel='linear', decision_function_shape='ovr')

# Train the SVM model using the training data
svm_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy
print("Accuracy:", accuracy)




Accuracy: 0.40252182347235693


# **II. Random Forest**

In [None]:
# Importing required libraries
import cv2
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.metrics import accuracy_score

def extract_features(image_path, image_size=(100, 100)):
    try:
        image = cv2.imread(image_path)
        if image is None:
            print(f"Error: Unable to load image '{image_path}'")
            return None
        image = cv2.resize(image, image_size)
        # Convert image to grayscale and flatten the array
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        return gray_image.flatten()
    except Exception as e:
        print(f"Error: An exception occurred while processing image '{image_path}': {e}")
        return None

X = []
y = []

with open(os.path.join(data_folder, 'category.txt'), 'r') as category_file:
    next(category_file)  # Skip the header line
    for line in category_file:
        category_id, category_name = line.strip().split('\t')
        category_id = int(category_id)
        category_folder = os.path.join(data_folder, str(category_id))
        # Iterate over images in the category folder
        for image_file in os.listdir(category_folder):
            image_path = os.path.join(category_folder, image_file)
            # Extract features and append to X
            features = extract_features(image_path)
            X.append(features)
            # Append corresponding label to y
            y.append(category_id)

X = np.array(X)
y = np.array(y)
#Split data into training and testing data, 0.8 and 0.2 respectively.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)

#Train Random forest classifier on training data
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=32)
rf_classifier.fit(X_train, y_train)

#Evaluation
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

#Using 10 fold cross validation
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=32)
cv_scores = cross_val_score(rf_classifier, X, y, cv=5)
print(f"CV Accuracy Scores: {cv_scores}")
print(f"Average CV Accuracy: {np.mean(cv_scores)}")

#Using hyperParameter tuning to find the ideal estimator.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_classifier = RandomForestClassifier(random_state=32)

grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validated accuracy: {:.2f}".format(grid_search.best_score_))

#Testing the best model we got from the grid search's estimator
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy: {:.2f}".format(accuracy))

Accuracy: 0.5910418695228822
CV Accuracy Scores: [0.50827653 0.60564752 0.54527751 0.59746589 0.50487329]
Average CV Accuracy: 0.552308147844457
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters found:  {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best cross-validated accuracy: 0.59
Test set accuracy: 0.60


Using only the testing, training data we check the accuracy of the model and it gives a moderate 0.58.
Now we move on and add 10 fold cross validation for a more accurate accuracy score: 0.54.
We now want to improve the model more by using hyperparameter tuning and getting the best parameters for a model.

# **III. Convolutional neural networks**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import necessary libraries
import os
import numpy as np
import shutil
import tensorflow as tf
import pickle as pkl
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Data Preparation and Preprocessing
data_dir = "/content/drive/MyDrive/data"


# Check the number of images in each category
for i in range(1, 10):
    path = os.path.join(data_dir, str(i))
    print(f'Category {i} has {len(os.listdir(path))} images.')

Category 1 has 1650 images.
Category 2 has 71 images.
Category 3 has 675 images.
Category 4 has 514 images.
Category 5 has 27 images.
Category 6 has 107 images.
Category 7 has 675 images.
Category 8 has 772 images.
Category 9 has 660 images.


In [None]:
# Function to identify and remove corrupted images
def verify_images(folder_path):
    for fname in os.listdir(folder_path):
        fpath = os.path.join(folder_path, fname)
        try:
            img = tf.io.read_file(fpath)
            img = tf.io.decode_image(img)
        except:
            print(f'Removing corrupted image: {fpath}')
            os.remove(fpath)

# Apply the cleaning function to each category
for i in range(1, 10):
    verify_images(os.path.join(data_dir, str(i)))

# Create directories for train, validation, and test sets
sets = ['train', 'val', 'test']
for s in sets:
    set_path = os.path.join(data_dir, s)
    if not os.path.exists(set_path):
        os.makedirs(set_path)
    for i in range(1, 10):
        class_path = os.path.join(set_path, str(i))
        if not os.path.exists(class_path):
            os.makedirs(class_path)

# Split the data into train, validation, and test folders
def split_data(source, dest_train, dest_val, dest_test, split_train=0.8, split_val=0.1):
    files = os.listdir(source)
    np.random.shuffle(files)
    train_idx = int(len(files) * split_train)
    val_idx = int(len(files) * (split_train + split_val))
    for file in files[:train_idx]:
        shutil.copy(os.path.join(source, file), os.path.join(dest_train, file))
    for file in files[train_idx:val_idx]:
        shutil.copy(os.path.join(source, file), os.path.join(dest_val, file))
    for file in files[val_idx:]:
        shutil.copy(os.path.join(source, file), os.path.join(dest_test, file))

# Apply the splitting function
for i in range(1, 10):
    src_folder = os.path.join(data_dir, str(i))
    train_folder = os.path.join(data_dir, 'train', str(i))
    val_folder = os.path.join(data_dir, 'val', str(i))
    test_folder = os.path.join(data_dir, 'test', str(i))
    split_data(src_folder, train_folder, val_folder, test_folder)


In [None]:
# Define the architecture of the CNN model
model = Sequential([
    Conv2D(32, (3,3), activation='relu', input_shape=(150, 150, 3)),  # 32 filters of size 3x3, ReLU activation, input shape of (150, 150, 3)
    MaxPooling2D(2, 2),  # Max pooling with pool size of 2x2
    Conv2D(64, (3,3), activation='relu'),  # 64 filters of size 3x3, ReLU activation
    MaxPooling2D(2, 2),  # Max pooling with pool size of 2x2
    Conv2D(128, (3,3), activation='relu'),  # 128 filters of size 3x3, ReLU activation
    MaxPooling2D(2, 2),  # Max pooling with pool size of 2x2
    Flatten(),  # Flatten the output for dense layers
    Dense(512, activation='relu'),  # Fully connected layer with 512 neurons, ReLU activation
    Dropout(0.5),  # Dropout layer with dropout rate of 0.5
    Dense(9, activation='softmax')  # Output layer with 9 neurons for classification, softmax activation
])


model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Set up image data generators for training and validation data
train_datagen = ImageDataGenerator(rescale=1./255)  # Rescale pixel values to the range [0, 1] for training data
val_datagen = ImageDataGenerator(rescale=1./255)  # Rescale pixel values to the range [0, 1] for validation data

# Flow training images in batches of 32 using train_datagen generator
train_generator = train_datagen.flow_from_directory(
    os.path.join(data_dir, 'train'),  # Path to the training data directory
    target_size=(150, 150),  # Resize images to 150x150
    batch_size=32,  # Batch size of 32
    class_mode='categorical'  # Use categorical labels
)

# Flow validation images in batches of 32 using val_datagen generator
val_generator = val_datagen.flow_from_directory(
    os.path.join(data_dir, 'val'),  # Path to the validation data directory
    target_size=(150, 150),  # Resize images to 150x150
    batch_size=32,  # Batch size of 32
    class_mode='categorical'  # Use categorical labels
)


# Set up Early Stopping callback to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10)  # Monitor validation loss and stop training if it doesn't improve for 10 epochs


# Calculate steps per epoch and validation steps
steps_per_epoch = len(train_generator.filenames) // train_generator.batch_size  # Dynamically adjust steps per epoch based on training data size and batch size
validation_steps = len(val_generator.filenames) // val_generator.batch_size  # Dynamically adjust validation steps based on validation data size and batch size


# Train the model
history = model.fit(
    train_generator,  # Training data generator
    steps_per_epoch=steps_per_epoch,  # Dynamically adjusted steps per epoch
    epochs=50,  # Train for 50 epochs
    validation_data=val_generator,  # Validation data generator
    validation_steps=validation_steps,  # Dynamically adjusted validation steps
    callbacks=[early_stopping]  # Early stopping callback
)
model_path = os.path.join('/content/drive/MyDrive', 'best_model.pkl')
pickle.dump(best_model, open(model_path, 'wb'))


Found 4950 images belonging to 9 classes.
Found 984 images belonging to 9 classes.
Epoch 1/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m371s[0m 2s/step - accuracy: 0.3403 - loss: 1.8998 - val_accuracy: 0.5729 - val_loss: 1.2209
Epoch 2/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.3750 - loss: 1.6413 - val_accuracy: 0.7083 - val_loss: 0.9884
Epoch 3/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m369s[0m 2s/step - accuracy: 0.5951 - loss: 1.1131 - val_accuracy: 0.7385 - val_loss: 0.7650
Epoch 4/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7188 - loss: 0.8720 - val_accuracy: 0.7500 - val_loss: 0.7374
Epoch 5/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m381s[0m 2s/step - accuracy: 0.7263 - loss: 0.7609 - val_accuracy: 0.8125 - val_loss: 0.5350
Epoch 6/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7188

In [None]:
# Evaluate the model on the test set
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory(
    os.path.join(data_dir, 'test'),
    target_size=(150, 150),
    batch_size=32,
    class_mode='categorical'
)

test_loss, test_acc = model.evaluate(test_generator)
print(f'Test accuracy: {test_acc}')



Found 972 images belonging to 9 classes.
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 736ms/step - accuracy: 0.9529 - loss: 0.1787
Test accuracy: 0.9578189253807068
