<a href="https://colab.research.google.com/github/RowanCK/Garbage-Classification-Model---Group-16/blob/main/multimodal_garbage_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Garbage Classification Using Multimodal Deep Learning

##### Team Members: Rowan (Yi-Kai) Chen, Das (Shih Ting) Tai, Ryan Lau, Zain Jelani
##### Group Number: 16

In [None]:
import os
import re
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import glob
from PIL import Image
from torchvision.datasets import ImageFolder
import torchvision
from torchvision import transforms, models
from torchvision.models import resnet18
from transformers import DistilBertModel, DistilBertTokenizer
# from torchvision.models import ResNet50_Weights
# from transformers import BertTokenizer, BertModel

from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix

In [None]:
# Mount Google Drive in the Colab runtime at /content/drive; the dataset
# directories configured below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Dataset locations on the mounted Google Drive.
# Point DATASET_ROOT at the folder that contains your copy of the three splits.
DATASET_ROOT = '/content/drive/MyDrive/ENSF 617 - Introduction of Machine Learning /Assignment 2/garbage_sampled/garbage_sampled'

TRAINSET_DIR = f'{DATASET_ROOT}/CVPR_2024_dataset_Train'  # training split
VALSET_DIR = f'{DATASET_ROOT}/CVPR_2024_dataset_Val'      # validation split
TESTSET_DIR = f'{DATASET_ROOT}/CVPR_2024_dataset_Test'    # held-out test split

# Class Definitions

## Dataset class
- Data Preprocessing
  - Image Loading and Transformation:
    - Resize images to 224x224 pixels to fit the model input
    - Set Data Augmentation (Flips) to help the model generalize better
    - Convert images to Tensors (Encoding images to numerical vectors)
    - Normalize pixel values using ImageNet standards for stable training
  - Text Preprocessing
- Load Datasets

In [None]:
# Define the dataset class
# 'ImageFolder' - automatically maps subfolder names to class labels e.g., Green/xxx.png => Green; inherits from torch.utils.data.Dataset
class MultiModalGarbageDataset(ImageFolder):
    """Image + text dataset layered on top of ``ImageFolder``.

    Each sample pairs the image loaded from disk with a short text description
    derived from the image's file name (underscores become spaces, digits are
    removed), tokenized with the supplied tokenizer.

    Args:
        root: Root directory passed to ``ImageFolder`` (one subfolder per class).
        tokenizer: Tokenizer used to encode the file-name text; it is called
            with ``return_tensors='pt'``.
        max_len: Length every token sequence is padded / truncated to.
        transforms: Optional image transform pipeline (original keyword name,
            kept for backward compatibility).
        transform: Optional image transform pipeline; alias of ``transforms``
            using ``ImageFolder``'s own keyword name.

    Raises:
        ValueError: If both ``transform`` and ``transforms`` are supplied.
    """

    def __init__(self, root, tokenizer, max_len, transforms=None, transform=None):
        # Accept either spelling of the keyword so that call sites written as
        # `transform=...` (ImageFolder's name) and `transforms=...` both work.
        if transforms is not None and transform is not None:
            raise ValueError(
                "Pass the image pipeline as either 'transform' or 'transforms', not both."
            )
        image_transform = transform if transform is not None else transforms
        super().__init__(root, transform=image_transform)
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _text_from_path(path):
        """Turn an image path into its text description.

        Drops the directory and extension, replaces underscores with spaces,
        strips digits, and collapses the whitespace left behind.
        """
        stem, _ = os.path.splitext(os.path.basename(path))
        text = re.sub(r'\d+', '', stem.replace('_', ' '))
        # Removing digits/underscores leaves doubled or trailing spaces; tidy them up.
        return ' '.join(text.split())

    def __getitem__(self, idx):
        """Return one sample as a dict.

        Keys:
            'image': the loaded image after the optional transform.
            'input_ids': 1-D tensor of token ids (flattened tokenizer output).
            'attention_mask': 1-D attention mask matching 'input_ids'.
            'label': class index as a ``torch.long`` tensor.
            'text': the cleaned file-name text that was tokenized.
        """
        # 1. ImageFolder keeps (path, class_index) pairs in `samples`
        path, label = self.samples[idx]

        # 2. Image
        image = self.loader(path)
        if self.transform is not None:
            image = self.transform(image)

        # 3. Text derived from the file name
        text = self._text_from_path(path)

        # 4. Tokenize; calling the tokenizer directly is the documented entry
        #    point (``encode_plus`` is the legacy spelling of the same call).
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'image': image,  # Image Tensor
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
            'text': text,
        }

## Model Class

Our model integrates two branches:
1. **Image Model**: Extracts features from garbage images using a pre-trained convolutional neural network (ResNet18).
2. **Text Model**: Encodes textual descriptions using DistilBERT embeddings.

Both branches' outputs are combined in a fusion layer to produce the final classification prediction.

### Components:
- **Image Branch**: Extracts visual features from garbage images.
- **Text Branch**: Encodes descriptions into meaningful embeddings.
- **Fusion Layer**: Combines image and text features to generate a multimodal representation.

# Function Definitions

### Training Function

### Evaluation Function

### Prediction Function

# Main Functions

## Pre-process data, define criterion, and initialize the multimodal model


In [None]:
# Define image transformations.
# Both pipelines resize to 224x224, convert to a tensor and normalize with the
# ImageNet channel statistics; only the train/val pipeline adds random flips.
def _make_image_transform(augment):
    """Build the preprocessing pipeline; `augment=True` inserts random horizontal/vertical flips."""
    steps = [transforms.Resize((224,224))]
    if augment:
        steps.append(transforms.RandomHorizontalFlip())  # randomly flip images left/right
        steps.append(transforms.RandomVerticalFlip())    # randomly flip images up/down
    steps.append(transforms.ToTensor())
    steps.append(transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))
    return transforms.Compose(steps)


# Train/Val transformations (with augmentation)
torchvision_transform = _make_image_transform(augment=True)

# Test transformations (no augmentation)
torchvision_transform_test = _make_image_transform(augment=False)

In [None]:
# Initialize tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Load datasets.
# The directory constants are the ones defined in the configuration cell
# (TRAINSET_DIR / VALSET_DIR / TESTSET_DIR), and the image pipeline is passed
# via the `transforms` keyword declared by MultiModalGarbageDataset.__init__.
# NOTE(review): the validation set uses the augmenting pipeline, so validation
# metrics vary slightly between runs; switch it to torchvision_transform_test
# if deterministic validation is wanted.
train_dataset = MultiModalGarbageDataset(TRAINSET_DIR, tokenizer, max_len=24, transforms=torchvision_transform)
val_dataset   = MultiModalGarbageDataset(VALSET_DIR,   tokenizer, max_len=24, transforms=torchvision_transform)
test_dataset  = MultiModalGarbageDataset(TESTSET_DIR,  tokenizer, max_len=24, transforms=torchvision_transform_test)

# Create DataLoaders.
# NOTE: BATCH_SIZE is assigned in the hyperparameters cell further down the
# notebook; that cell must be run (or moved) before this one, otherwise a
# fresh-kernel "Run All" stops here with a NameError.
# Only the training loader shuffles; val/test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Class names exposed by the training dataset's `classes` attribute
# (derived by ImageFolder from the class subfolder names); printed as a sanity check.
class_names = train_dataset.classes
print(class_names)

## Transfer learning

## Set the main hyperparameters
- batch size
- learning rate
- number of epochs



In [8]:
# Training hyperparameters.
# BATCH_SIZE is consumed by the DataLoader cell, so this cell has to run before it.
BATCH_SIZE = 2        # samples per mini-batch
LEARNING_RATE = 1e-4  # optimizer step size

## Train the model

## Load best model for testing

## Test the model
- Run prediction on your test set
- Extract relevant metrics
- Measure inference time