<a href="https://colab.research.google.com/github/NoraHK3/DataSciProject/blob/main/Model_Attempt5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
uploaded = files.upload()

Saving all_images.zip to all_images.zip
Saving final_dataset_paths.csv to final_dataset_paths.csv


Setup & Install Dependencies

In [3]:
!pip install iterative-stratification

import os
import zipfile
import re
import pandas as pd
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from torch.optim.lr_scheduler import ReduceLROnPlateau
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.9


Unzip Images

In [5]:
zip_path = 'all_images.zip'
extract_dir = 'images/'

if not os.path.exists(extract_dir):
    os.makedirs(extract_dir)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)


Load CSV & Prepare Labels

In [7]:
csv_path = 'final_dataset_paths.csv'
df = pd.read_csv(csv_path)

# Example: CSV columns: image,labels (comma-separated multi-labels)
all_labels = sorted({label for labels in df['labels'] for label in labels.split(',')})
label_to_idx = {label: idx for idx, label in enumerate(all_labels)}

def encode_labels(label_string):
    vec = [0]*len(all_labels)
    for lbl in label_string.split(','):
        vec[label_to_idx[lbl]] = 1
    return vec

df['multi_hot'] = df['labels'].apply(encode_labels)

# Extract base image name to group augmented variants
def get_base_image_name(filename):
    match = re.match(r"(img\(\d+\))", filename)
    return match.group(1) if match else filename

df['group'] = df['image_file'].apply(get_base_image_name)


Stratified Split

In [9]:
X = df['image_file'].values
y = np.vstack(df['multi_hot'].values)

msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_idx, val_idx in msss.split(X, y):
    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]


Aggressive Data Augmentation

In [10]:
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(30),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1),
    transforms.RandomAffine(degrees=0, translate=(0.1,0.1), scale=(0.8,1.2)),
    transforms.ToTensor(),
])

val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])


Custom Dataset

In [16]:
class MultiLabelDataset(Dataset):
    def __init__(self, df, img_dir, transform=None):
        self.df = df
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.df.iloc[idx]['image_file'])
        image = Image.open(img_path).convert('RGB')
        label = torch.tensor(self.df.iloc[idx]['multi_hot'], dtype=torch.float32)
        if self.transform:
            image = self.transform(image)
        return image, label

train_dataset = MultiLabelDataset(train_df, extract_dir, transform=train_transform)
val_dataset = MultiLabelDataset(val_df, extract_dir, transform=val_transform)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


Model Setup (MobileNetV3-small)

In [17]:
num_classes = len(all_labels)

model = models.mobilenet_v3_small(pretrained=True)
for param in model.features.parameters():
    param.requires_grad = False

model.classifier = nn.Sequential(
    nn.Linear(model.classifier[0].in_features, 256),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(256, num_classes),
    nn.Sigmoid()  # multi-label
)
model = model.to(device)




Loss, Optimizer, Scheduler

In [18]:
criterion = nn.BCELoss()  # multi-label
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)


Training Loop with Early Stopping

In [19]:
num_epochs = 30
best_val_loss = float('inf')
patience = 5
trigger_times = 0

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * images.size(0)
    train_loss /= len(train_loader.dataset)

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * images.size(0)
    val_loss /= len(val_loader.dataset)

    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
    scheduler.step(val_loss)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pth')
        trigger_times = 0
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print("Early stopping triggered!")
            break


Epoch 1, Train Loss: 0.1574, Val Loss: 0.0794
Epoch 2, Train Loss: 0.0792, Val Loss: 0.0776
Epoch 3, Train Loss: 0.0699, Val Loss: 0.0660
Epoch 4, Train Loss: 0.0663, Val Loss: 0.0616
Epoch 5, Train Loss: 0.0643, Val Loss: 0.0603
Epoch 6, Train Loss: 0.0623, Val Loss: 0.0571
Epoch 7, Train Loss: 0.0604, Val Loss: 0.0558
Epoch 8, Train Loss: 0.0594, Val Loss: 0.0544
Epoch 9, Train Loss: 0.0575, Val Loss: 0.0519
Epoch 10, Train Loss: 0.0561, Val Loss: 0.0508
Epoch 11, Train Loss: 0.0552, Val Loss: 0.0491
Epoch 12, Train Loss: 0.0536, Val Loss: 0.0506
Epoch 13, Train Loss: 0.0527, Val Loss: 0.0476
Epoch 14, Train Loss: 0.0517, Val Loss: 0.0461
Epoch 15, Train Loss: 0.0525, Val Loss: 0.0450
Epoch 16, Train Loss: 0.0518, Val Loss: 0.0458
Epoch 17, Train Loss: 0.0504, Val Loss: 0.0449
Epoch 18, Train Loss: 0.0501, Val Loss: 0.0442
Epoch 19, Train Loss: 0.0488, Val Loss: 0.0442
Epoch 20, Train Loss: 0.0488, Val Loss: 0.0430
Epoch 21, Train Loss: 0.0479, Val Loss: 0.0416
Epoch 22, Train Loss: 

In [20]:
from sklearn.metrics import classification_report
import torch

In [21]:
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for images, labels in val_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        preds = (outputs > 0.5).float()  # threshold for multi-label
        all_preds.append(preds.cpu())
        all_labels.append(labels.cpu())

all_preds = torch.cat(all_preds, dim=0).numpy()
all_labels = torch.cat(all_labels, dim=0).numpy()


In [25]:
# This should be the sorted list of dish names you built when preparing labels
class_names = list(label_to_idx.keys())  # or your original all_labels list


In [26]:
from sklearn.metrics import classification_report

print(classification_report(all_labels, all_preds, target_names=class_names, zero_division=0))


                   precision    recall  f1-score   support

         'Rice'']       0.00      0.00      0.00         1
      'Bishamel']       0.00      0.00      0.00         1
          'Bread'       0.00      0.00      0.00         2
         'Bread']       0.00      0.00      0.00         2
        'Chicken'       0.00      0.00      0.00         2
       'Chicken']       0.70      0.37      0.48        19
          'Fish']       0.00      0.00      0.00         1
       'Jareesh']       0.00      0.00      0.00         1
     'Meat Soup']       0.00      0.00      0.00         1
           'Meat'       0.00      0.00      0.00         6
          'Meat']       0.00      0.00      0.00        10
    'Mohalabiya']       1.00      1.00      1.00         2
         'Pasta']       0.00      0.00      0.00         2
        'Potato']       0.00      0.00      0.00         2
         'Qursan'       0.00      0.00      0.00         2
        'Qursan']       0.00      0.00      0.00       

In [28]:
num_samples = 40
for i in range(num_samples):
    true_idx = np.where(all_labels[i]==1)[0]
    pred_idx = np.where(all_preds[i]==1)[0]

    true_names = [class_names[j] for j in true_idx]
    pred_names = [class_names[j] for j in pred_idx]

    print(f"Sample {i+1}:")
    print(f"  True Labels: {true_names}")
    print(f"  Pred Labels: {pred_names}")
    print('-'*50)


Sample 1:
  True Labels: ["['Shakshuka']"]
  Pred Labels: []
--------------------------------------------------
Sample 2:
  True Labels: [" 'Rice']", "['Chicken'"]
  Pred Labels: []
--------------------------------------------------
Sample 3:
  True Labels: [" 'Rice']", "['Chicken'"]
  Pred Labels: []
--------------------------------------------------
Sample 4:
  True Labels: ["['Soup']"]
  Pred Labels: []
--------------------------------------------------
Sample 5:
  True Labels: ["['Saleeg']"]
  Pred Labels: []
--------------------------------------------------
Sample 6:
  True Labels: ["['Luqaimat']"]
  Pred Labels: ["['Luqaimat']"]
--------------------------------------------------
Sample 7:
  True Labels: [" 'Soup']", "['Meat'"]
  Pred Labels: []
--------------------------------------------------
Sample 8:
  True Labels: ["['Foul']"]
  Pred Labels: []
--------------------------------------------------
Sample 9:
  True Labels: ["['Lahm Bi Ajeen']"]
  Pred Labels: []
---------------