# Image Classification

Start by installing all the necessary dependencies in the virtual environment in Python with pip

In [1]:
pip install -U datasets transformers torchvision accelerate tensorflow-macos tensorflow-metal tf-keras numpy matplotlib torch timm scikit-learn ipykernel notebook jupyterlab gradio ipywidgets --quiet

Note: you may need to restart the kernel to use updated packages.


## 1 CNN


## 1.1 File paths

Specify the filepaths to where the dataset, models etc should be stored.

### Warning!!! Please change the following paths before running the notebooks to ensure there are no issues:
1. cnn_model_dir 
2. vit_model_dir

I changed them to be placed on an External HDD as I have not enough space. You may place them wherever you have space but ideally within the same folder like cnn_model_dir = "./cnn-checkpoints"

In [None]:
# General Paths
amazon_reviews_path = "./amazon2023/All_Beauty.jsonl.gz"
text_model_dir = "./bert_sentiment_best"

# CNN Paths
cnn_model_dir = "./cnn-checkpoints"
# cnn_model_dir = "/Volumes/XuanYi's T7/cnn-checkpoints"
base_dir_cnn = './amazon2023-images/problem_cnn'

# VIT Paths
vit_model_dir = "./vit-checkpoints"
# vit_model_dir = "/Volumes/XuanYi's T7/vit-checkpoints"
base_dir_vit = './amazon2023-images/problem_vit'

# 2 Data Loading

In [3]:
# Import Libraries for Loading the Dataset and Processing the images
import pandas as pd
import numpy as np
import os
import requests
from PIL import Image
from io import BytesIO

In [4]:
# Ingest all the Amazon Beauty Product Reviews as a dataframe
amazon_df = pd.read_json(amazon_reviews_path, lines=True)

# Filter rows with images only
df_with_images = amazon_df[amazon_df['images'].apply(lambda x: isinstance(x, list) and len(x) > 0)]
df_with_images.head(5)

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
5,4,Pretty Color,The polish was quiet thick and did not apply s...,[{'small_image_url': 'https://images-na.ssl-im...,B00R8DXL44,B00R8DXL44,AGMJ3EMDVL6OWBJF7CA5RGJLXN5A,2020-08-27 22:30:08.138,0,True
7,3,Meh,These were lightweight and soft but much too s...,[{'small_image_url': 'https://m.media-amazon.c...,B088SZDGXG,B08BBQ29N5,AEYORY2AVPMCPDV57CE337YU5LXA,2021-10-15 05:20:59.292,0,True
45,5,Nice colors and dries fast,Haven’t worn it very long yet so can’t comment...,[{'small_image_url': 'https://images-na.ssl-im...,B07H281V4V,B07H281V4V,AHW7W34BLHHC4AYM4TPMLA2SWMMA,2020-12-31 17:13:27.770,0,True
56,5,Absolutely beautiful,These diamond are absolutely beautiful and shi...,[{'small_image_url': 'https://m.media-amazon.c...,B095SC4J8T,B095SC4J8T,AH4CGRSYSW5CWLRGQYRZKNJBUPAA,2021-08-17 02:11:43.947,0,True
106,5,Doesn't Sting or Burn My Eyes!,I am in my late 40's and started using anti-ag...,[{'small_image_url': 'https://images-na.ssl-im...,B01CO73OIQ,B01CO73OIQ,AHV6QCNBJNSGLATP56JAWJ3C4G2A,2016-06-28 14:13:38.000,8,True


In [5]:
# Map the Amazon ratings provided by users to a flag for images
def map_rating_to_label(rating):
    if rating in [1, 2]:
        return "problematic"
    elif rating == 3:
        return "uncertain"
    else:  
        # 4 or 5
        return "not_problematic"

# Flatten out image URLs and associate each with its rating
image_rows = []
for idx, row in df_with_images.iterrows():
    rating = int(row['rating'])
    label = map_rating_to_label(rating)

    # Iterate through every row to extract the labels for each image and associate it with the image
    for img_dict in row['images']:
        url = img_dict.get('small_image_url')
        if url:
            image_rows.append({'url': url, 'label': label})

# Convert to a Dataframe and verify the dat is what I expected
amazon_images_df = pd.DataFrame(image_rows)
amazon_images_df.head(n=2)


Unnamed: 0,url,label
0,https://images-na.ssl-images-amazon.com/images...,not_problematic
1,https://m.media-amazon.com/images/I/81FN4c0VHz...,uncertain


In [6]:
"""
Checks if the CNN & VIT folders exist and are non-empty, counts the number of images in each, and downloads images if needed, saving them in two different sizes for CNN and ViT models.
"""
# Check if both folders exist and are non-empty
def folder_has_images(folder):
    if not os.path.exists(folder):
        return False
    for subfolder in os.listdir(folder):
        subfolder_path = os.path.join(folder, subfolder)
        if os.path.isdir(subfolder_path) and len(os.listdir(subfolder_path)) > 0:
            return True
    return False

# Count the number of files in each folder for verification
def count_files_in_folder(folder):
    total = 0
    for subfolder in os.listdir(folder):
        subfolder_path = os.path.join(folder, subfolder)
        if os.path.isdir(subfolder_path):
            total += len([f for f in os.listdir(subfolder_path) if os.path.isfile(os.path.join(subfolder_path, f))])
    return total

# Check if we need to download
if folder_has_images(base_dir_cnn) and folder_has_images(base_dir_vit):
    print("Image folders already exist and contain images. Skipping download.")
    # Print the number of files we have
    print(f"Number of CNN images: {count_files_in_folder(base_dir_cnn)}")
    print(f"Number of ViT images: {count_files_in_folder(base_dir_vit)}")
else:
    # Empty - Download the training and test dataset
    os.makedirs(base_dir_cnn, exist_ok=True)
    os.makedirs(base_dir_vit, exist_ok=True)

    for i, row in amazon_images_df.iterrows():
        label = row['label']
        url = row['url']

        save_dir_cnn = f'{base_dir_cnn}/{label}'
        save_dir_vit = f'{base_dir_vit}/{label}'

        os.makedirs(save_dir_cnn, exist_ok=True)
        os.makedirs(save_dir_vit, exist_ok=True)

        # Check if image already exists before downloading
        cnn_img_path = f'{save_dir_cnn}/{i}.jpg'
        vit_img_path = f'{save_dir_vit}/{i}.jpg'
        if os.path.exists(cnn_img_path) and os.path.exists(vit_img_path):
            continue

        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                img = Image.open(BytesIO(response.content)).convert('RGB')
                img_cnn = img.resize((128, 128))
                img_cnn.save(cnn_img_path)
                img_vit = img.resize((224, 224))
                img_vit.save(vit_img_path)
        except Exception as e:
            print(f"Failed to download {url}: {e}")


Image folders already exist and contain images. Skipping download.
Number of CNN images: 25481
Number of ViT images: 25482


**Load images for CNN Training first**

- DataGenerator will help to load, preprocess and augment images as model trains like rescaling, rotation to increase diversity of dataset without increasing the number of dataset

In [7]:
# CNN Libraries
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix

In [8]:
# Creates a Keras ImageDataGenerator to preprocess and augment image data if necessary for CNN, with 20% as test dataset
cnn_data_generator = ImageDataGenerator(rescale=1./255, validation_split=0.2)

# Loads dataset in batches
# Loads images from a folder structure where each subfolder represents a class label
train_cnn_data_generator = cnn_data_generator.flow_from_directory(
    base_dir_cnn,
    # Resize the image
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    # Use 80-% of dataset for training
    subset='training'
)

# Same as above, but for testing purposes
test_cnn_data_generator = cnn_data_generator.flow_from_directory(
    base_dir_cnn,
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    subset='validation'
)

Found 20386 images belonging to 3 classes.
Found 5095 images belonging to 3 classes.


In [9]:
# CNN Architecture - Using a Pre-Trained CNN ResNet which is used for Image Classification
# Remove the top (i.e. original classification head) to classify the pre-trained classes
base_cnn_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224,224,3))
# Freeze base model for initial training
base_cnn_model.trainable = False  

# Add my own classification head as the last year to predict 1 of 3 labels
cnn_model = Sequential([
    base_cnn_model,
    # Converts output of base model into 1D vector
    Flatten(),
    # For learning patterns
    Dense(128, activation='relu'),
    # Prevent overfitting by setting 50% to 0
    Dropout(0.5),
    # Final layer for classification
    Dense(3, activation='softmax')
])

2025-10-18 16:17:39.140109: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2025-10-18 16:17:39.140321: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-10-18 16:17:39.140338: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-10-18 16:17:39.140680: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-10-18 16:17:39.140702: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [10]:
"""
Computes class weights in case dataset is imbalanced since Amazon usually is biased, a lot of bad or good reviews
"""
# Maps classnames to integer labels
class_indices = train_cnn_data_generator.class_indices
classes = list(class_indices.values())

# Get all int labels
labels = train_cnn_data_generator.classes

# Maps class weight -> label
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weights_dict = dict(zip(np.unique(labels), class_weights))

In [11]:
# Ensure the best model trained is saved and not the most recent one trained
checkpoint = ModelCheckpoint(
    filepath=f"{cnn_model_dir}/cnn_best_trained_model.keras",
    # Checks validation loss metric, which is usaed to determine beest model
    monitor='val_loss', 
    # Save model iif val loss improves
    save_best_only=True, 
    # Save entire model
    save_weights_only=False,
    # lower val loss is better ofc
    mode='min',
    verbose=1
)

# Use AdamW optimiser for ML, and tracks accuracy for training and validation
cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model with generators for 20 epochs, i.e. see data 20 times
cnn_model_training_history = cnn_model.fit(
    train_cnn_data_generator,
    validation_data=test_cnn_data_generator,
    epochs=20,
    # Adjust loss function for imbalanced classes
    class_weight=class_weights_dict,
    # Save best model only
    callbacks=[checkpoint],
)

Epoch 1/20


  self._warn_if_super_not_called()
2025-10-18 16:17:42.717016: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m638/638[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step - accuracy: 0.3474 - loss: 48.9339
Epoch 1: val_loss improved from None to 2.97391, saving model to /Volumes/XuanYi's T7/cnn-checkpoints/cnn_best_trained_model.keras
[1m638/638[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 216ms/step - accuracy: 0.3361 - loss: 37.6445 - val_accuracy: 0.1666 - val_loss: 2.9739
Epoch 2/20
[1m638/638[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step - accuracy: 0.3332 - loss: 7.0114
Epoch 2: val_loss did not improve from 2.97391
[1m638/638[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 202ms/step - accuracy: 0.3274 - loss: 5.0060 - val_accuracy: 0.0795 - val_loss: 4.2597
Epoch 3/20
[1m638/638[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step - accuracy: 0.3285 - loss: 2.7062
Epoch 3: val_loss improved from 2.97391 to 0.81911, saving model to /Volumes/XuanYi's T7/cnn-checkpoints/cnn_best_trained_model.keras
[1m638/638[0m [32m

In [12]:
# Reset test datagenerator so we can test from first batch
test_cnn_data_generator.reset()

# Get the predicted probs for each class
cnn_preds = cnn_model.predict(test_cnn_data_generator)
# Cast the prob into their labels
cnn_pred_labels = np.argmax(cnn_preds, axis=1)
cnn_true_labels = test_cnn_data_generator.classes

# Prints metrics and scores of our test so we can evaluate
print(classification_report(cnn_true_labels, cnn_pred_labels, zero_division=0))
print(confusion_matrix(cnn_true_labels, cnn_pred_labels))


[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 161ms/step
              precision    recall  f1-score   support

           0       0.78      1.00      0.88      3988
           1       0.00      0.00      0.00       701
           2       0.00      0.00      0.00       406

    accuracy                           0.78      5095
   macro avg       0.26      0.33      0.29      5095
weighted avg       0.61      0.78      0.69      5095

[[3988    0    0]
 [ 701    0    0]
 [ 406    0    0]]


# VIT Model

In [13]:
# VIT Libraries
from datasets import load_dataset
from transformers import AutoImageProcessor
from transformers import AutoModelForImageClassification
from transformers import DefaultDataCollator
from transformers import TrainingArguments
from transformers import Trainer

In [14]:
# Uses HF load_dataset and imagefolder builder to load images from vit images where each subfolder is a label
vit_dataset = load_dataset(
    "imagefolder",
    data_dir=base_dir_vit,
)

# Split into train and test
vit_dataset = vit_dataset["train"].train_test_split(test_size=0.2)
train_vit_dataset = vit_dataset["train"]
test_vit_dataset = vit_dataset["test"]

Resolving data files:   0%|          | 0/25481 [00:00<?, ?it/s]

In [23]:
# Loads a pre-trained processor who can resize, normalize, and patchify images for VIT training
vit_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

# Preprocess a batch of images into Tensors for both train and test for VIT
def preprocess_vit_tensors(batch):
    inputs = vit_processor(batch["image"], return_tensors="pt")
    batch["pixel_values"] = inputs["pixel_values"]
    return batch

# Apply to both train and test
train_vit_dataset = train_vit_dataset.map(preprocess_vit_tensors, batched=True)
test_vit_dataset = test_vit_dataset.map(preprocess_vit_tensors, batched=True)

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


Map:   0%|          | 0/20384 [00:00<?, ? examples/s]

Map:   0%|          | 0/5097 [00:00<?, ? examples/s]

In [24]:
# Maps class labels for beauty products to ints for training
label_to_id = {
    "problematic": 0,
    "uncertain": 1,
    "not_problematic": 2,
}
# Reverse mapping so we can get back our mappings
id_to_label = {v: k for k, v in label_to_id.items()}

# Loads a pre-trained VIT model to reduce the training necessary
vit_model = AutoModelForImageClassification.from_pretrained(
    # Takes pre-trained-model
    "google/vit-base-patch16-224",
    # Only problem, no problem and uncertain
    num_labels=3,
    # Ensure model outputs are mapped to my labels for prediction
    label2id=label_to_id,
    id2label=id_to_label,
    ignore_mismatched_sizes=True
)

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Batches and formatting of data during training
vit_collator = DefaultDataCollator()

# Arguments for Training
vit_training_args = TrainingArguments(
    # Save to specified location, my external ssd as laptop no space
    output_dir=vit_model_dir,
    # Batch size for gpu and cpu
    per_device_train_batch_size=32,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    # Log progress every 50 steps
    logging_steps=50,
    report_to="none",
    # Reload the model with lowest val loss after train
    load_best_model_at_end=True,
    # Use val loss to determine best model
    metric_for_best_model="eval_loss",
)

In [None]:
# Create a trainer based on training params and actually train
trainer = Trainer(
    model=vit_model,
    args=vit_training_args,
    train_dataset=train_vit_dataset,
    eval_dataset=test_vit_dataset,
    data_collator=vit_collator,
    processing_class=vit_processor
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,0.6262,0.645599
2,0.4348,0.665912
3,0.1345,0.946515
4,0.0134,1.39063
5,0.0004,1.520901
6,0.0002,1.623971
7,0.0001,1.692511
8,0.0001,1.727897
9,0.0,1.75314
10,0.0,1.763966




TrainOutput(global_step=6370, training_loss=0.11906840492442347, metrics={'train_runtime': 28877.1377, 'train_samples_per_second': 7.059, 'train_steps_per_second': 0.221, 'total_flos': 1.579610873729581e+19, 'train_loss': 0.11906840492442347, 'epoch': 10.0})

# Model Deployment

In [27]:
# Deploying model libraries
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForImageClassification, AutoImageProcessor
from PIL import Image
import torch

In [28]:
"""
Helper Function to flag problematic orders based on textual review and product images
"""
def flag_order(text, img):
    # Sentiment prediction
    sentiment_res = sentiment_clf(text)[0]
    sentiment_label = sentiment_res['label'].lower()
    sentiment_output = f"{sentiment_label.capitalize()} ({sentiment_res['score']:.3f})"
    
    # Image prediction - optional
    if img is not None:
        inputs = image_processor(img, return_tensors="pt")
        with torch.no_grad():
            outputs = image_model(**inputs)
            image_pred = outputs.logits.argmax(-1).item()
        image_label = id_to_label.get(image_pred, f"Class {image_pred}")
    else:
        image_label = "No image provided"
    
    # Determine if order is problematic with new logic
    if sentiment_label == "negative":
        overall_flag = "problematic"
    elif sentiment_label == "neutral" and image_label == "problematic":
        overall_flag = "problematic"
    else:
        overall_flag = "not_problematic"
    
    return sentiment_output, image_label, overall_flag

In [29]:
# Load the Best VIT and Text Models
image_checkpoint_dir = f"{vit_model_dir}/checkpoint-6370"

# Load sentiment analysis pipeline to predict sentiment of review
sentiment_tokenizer = AutoTokenizer.from_pretrained(text_model_dir)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(text_model_dir)
sentiment_clf = pipeline("text-classification", model=sentiment_model, tokenizer=sentiment_tokenizer)

def predict_sentiment(text: str):
    res = sentiment_clf(text)[0]
    return f"{res['label']} ({res['score']:.3f})"

# Load image classification model
image_model = AutoModelForImageClassification.from_pretrained(image_checkpoint_dir)
image_processor = AutoImageProcessor.from_pretrained(image_checkpoint_dir)

def predict_image(img):
    inputs = image_processor(img, return_tensors="pt")
    with torch.no_grad():
        outputs = image_model(**inputs)
        pred = outputs.logits.argmax(-1).item()
    return f"Predicted class: {pred}"

demo = gr.Interface(
    fn=flag_order,
    inputs=[
        gr.Textbox(lines=3, placeholder="Enter a review..."),
        gr.Image(type="pil", label="Order Image (optional)")
    ],
    outputs=[
        gr.Textbox(label="Sentiment"),
        gr.Textbox(label="Image Classification"),
        gr.Textbox(label="Overall Flag")
    ],
    title="Order Review & Image Analysis",
    description="Enter a review and optionally upload an order image to get combined insights."
)

if __name__ == "__main__":
    demo.launch()

Device set to use mps:0
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


Conclusions:
- Not Problematic images are diverse, while problematic and uncertain images are usually the same, i.e. we follow text review more. If text is positive, but picture is damaged box, review overall should be positive
- Similarly, images are useful when text is uncertain, then we follow