In [1]:
# Cell 1: Import necessary libraries
import os
import time
import numpy as np
from datetime import timedelta
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models.segmentation import deeplabv3_mobilenet_v3_large
from torch.optim import Adam

# Pick the fastest available backend: CUDA GPU first, then Apple-Silicon MPS,
# then plain CPU. The getattr guard keeps this working on PyTorch builds that
# predate the torch.backends.mps module.
_mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: mps


In [2]:
class DefectDataset(Dataset):
    """Segmentation dataset pairing each image with one combined multi-class mask.

    Layout read from ``<root_dir>/<split>/``:
      * ``Img.after.melting/`` holds the input images.
      * ``Defect_class{0,5,8,9,10,11}/`` hold optional per-class masks; a mask
        belongs to an image when it has the same file name.

    Label encoding of the returned mask: 0 is background and the k-th entry of
    ``class_dirs`` becomes label ``k + 1`` (Defect_class0 -> 1, ...,
    Defect_class11 -> 6). Where masks overlap, later class directories
    overwrite earlier ones.

    Args:
        root_dir: Directory that contains the split sub-directories.
        transform: Optional callable applied to the RGB PIL image.
        mask_transform: Optional callable applied to the PIL label mask. It
            must return a tensor. Float outputs are assumed to come from
            ``ToTensor`` (values divided by 255) and are scaled back to
            integer labels.
        split: Name of the split sub-directory, e.g. ``'train'`` or ``'val'``.
    """

    def __init__(self, root_dir, transform=None, mask_transform=None, split='train'):
        self.root_dir = os.path.join(root_dir, split)
        self.transform = transform
        # Masks carry class indices, so the default pipeline must neither blend
        # neighbouring labels (nearest-neighbour resize) nor rescale them
        # (PILToTensor keeps the raw uint8 values, unlike ToTensor).
        self.mask_transform = mask_transform or transforms.Compose([
            transforms.Resize((384, 384), interpolation=transforms.InterpolationMode.NEAREST),
            transforms.PILToTensor()
        ])
        self.image_dir = os.path.join(self.root_dir, 'Img.after.melting')
        self.class_dirs = [f'Defect_class{cls}' for cls in [0, 5, 8, 9, 10, 11]]
        # Skip hidden entries (e.g. macOS '.DS_Store'), which are not images
        # and would make Image.open fail in __getitem__.
        self.image_names = sorted(
            name for name in os.listdir(self.image_dir) if not name.startswith('.')
        )

    def __len__(self):
        """Return the number of images in this split."""
        return len(self.image_names)

    @staticmethod
    def _to_label_tensor(mask_tensor):
        """Convert a transformed mask tensor into an integer label map.

        Drops a leading singleton channel dimension. A floating-point tensor is
        treated as ``ToTensor`` output (labels divided by 255) and is scaled
        back before the cast; casting it directly would truncate every label
        below 255 to 0.
        """
        mask_tensor = mask_tensor.squeeze(0)
        if mask_tensor.is_floating_point():
            mask_tensor = (mask_tensor * 255).round()
        return mask_tensor.long()

    def __getitem__(self, idx):
        """Return ``(image, mask)`` for the image at position ``idx``."""
        img_name = self.image_names[idx]
        img_path = os.path.join(self.image_dir, img_name)
        image = Image.open(img_path).convert('RGB')

        # Start from an all-background mask; PIL reports size as (width, height).
        mask = np.zeros((image.size[1], image.size[0]), dtype=np.uint8)

        # Paint every available per-class mask into the combined label map.
        for i, class_dir in enumerate(self.class_dirs):
            mask_path = os.path.join(self.root_dir, class_dir, img_name)
            if os.path.exists(mask_path):
                class_mask = np.array(Image.open(mask_path))
                mask[class_mask > 0] = i + 1  # class 0 becomes 1, etc.

        # Apply transforms
        if self.transform:
            image = self.transform(image)
        mask = self._to_label_tensor(self.mask_transform(Image.fromarray(mask)))

        return image, mask

In [3]:
import torch.nn as nn
from torchvision.models.segmentation import deeplabv3_mobilenet_v3_large

def create_model(num_classes, backbone='mobilenetv3'):
    """Build a pretrained DeepLabV3 model whose final layer predicts ``num_classes``.

    The pretrained ``deeplabv3_mobilenet_v3_large`` network is loaded and the
    last ``nn.Conv2d`` found directly inside ``model.classifier`` is swapped for
    a freshly initialised convolution with ``num_classes`` output channels
    (same input channels, kernel size, stride and padding).

    Args:
        num_classes: Number of output channels for the new final convolution.
        backbone: Backbone identifier; only ``'mobilenetv3'`` is supported.

    Returns:
        The modified model, moved to the notebook-level ``device``.

    Raises:
        ValueError: If ``backbone`` is not ``'mobilenetv3'``.
        RuntimeError: If the classifier contains no ``nn.Conv2d`` to replace.
    """
    if backbone != 'mobilenetv3':
        raise ValueError(f"Invalid backbone: {backbone}. Only 'mobilenetv3' is supported in this version.")

    # Load the pre-trained model.
    # NOTE(review): newer torchvision releases deprecate `pretrained=` in favour
    # of `weights=`; switch once the minimum torchvision version is confirmed.
    model = deeplabv3_mobilenet_v3_large(pretrained=True)
    classifier = model.classifier

    # Locate the last top-level Conv2d of the classifier head.
    conv_indices = [i for i, layer in enumerate(classifier) if isinstance(layer, nn.Conv2d)]
    if not conv_indices:
        # Returning the untouched head would silently keep the pretrained
        # channel count instead of num_classes, so fail loudly.
        raise RuntimeError("Could not find a Conv2d layer in the classifier.")

    last_conv_index = conv_indices[-1]
    final_conv = classifier[last_conv_index]
    classifier[last_conv_index] = nn.Conv2d(
        final_conv.in_channels,
        num_classes,
        kernel_size=final_conv.kernel_size,
        stride=final_conv.stride,
        padding=final_conv.padding,
    )
    return model.to(device)

In [6]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, accumulation_steps=1):
    """Train ``model`` with gradient accumulation and validate after every epoch.

    The model is expected to return a mapping whose ``'out'`` entry holds the
    logits passed to ``criterion``. Whenever the validation loss improves, the
    model's ``state_dict`` is written to ``'best_deeplabv3_defect.pth'`` in the
    current working directory.

    Args:
        model: Network to train; batches are moved to the device of its parameters.
        train_loader: Loader yielding ``(images, masks)`` training batches.
        val_loader: Loader yielding ``(images, masks)`` validation batches.
        criterion: Loss callable invoked as ``criterion(outputs, masks)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        accumulation_steps: Number of batches whose gradients are summed before
            each optimizer step.

    Returns:
        The same ``model`` object, holding the weights of the final epoch
        (not necessarily those of the best checkpoint).

    Raises:
        ValueError: If ``accumulation_steps`` is smaller than 1.
    """
    if accumulation_steps < 1:
        raise ValueError("accumulation_steps must be >= 1")

    # Send batches to wherever the model already lives rather than relying on
    # a notebook-level global.
    run_device = next(model.parameters()).device
    num_batches = len(train_loader)
    best_val_loss = float('inf')
    start_time = time.time()

    for epoch in range(num_epochs):
        epoch_start = time.time()
        model.train()
        train_loss = 0.0
        optimizer.zero_grad()

        for batch_idx, (images, masks) in enumerate(train_loader):
            batch_start = time.time()

            images = images.to(run_device)
            masks = masks.to(run_device)

            # Forward pass; scale the loss so accumulated gradients average
            # over the group instead of summing.
            outputs = model(images)['out']
            loss = criterion(outputs, masks)
            loss = loss / accumulation_steps
            loss.backward()

            # Step after every full group AND after the last batch. Without
            # the last-batch case, gradients of a trailing partial group are
            # never applied and get wiped by the next epoch's zero_grad().
            # The partial group is still divided by accumulation_steps, so its
            # update is proportionally smaller.
            is_last_batch = (batch_idx + 1) == num_batches
            if (batch_idx + 1) % accumulation_steps == 0 or is_last_batch:
                optimizer.step()
                optimizer.zero_grad()

            # Undo the accumulation scaling and weight by batch size so the
            # epoch total can be turned into a per-sample mean below.
            train_loss += loss.item() * images.size(0) * accumulation_steps

            # Print progress (ETA extrapolates from the duration of this batch)
            if batch_idx % 10 == 0:
                batch_time = time.time() - batch_start
                remaining = (num_batches - batch_idx) * batch_time
                print(f'\rEpoch {epoch+1}/{num_epochs} | Batch {batch_idx}/{len(train_loader)} | '
                      f'ETA: {timedelta(seconds=int(remaining))}', end='')

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for images, masks in val_loader:
                images = images.to(run_device)
                masks = masks.to(run_device)
                outputs = model(images)['out']
                val_loss += criterion(outputs, masks).item() * images.size(0)

        # Calculate metrics (per-sample mean losses)
        train_loss /= len(train_loader.dataset)
        val_loss /= len(val_loader.dataset)
        epoch_time = time.time() - epoch_start
        total_remaining = (num_epochs - epoch - 1) * epoch_time

        print(f'\rEpoch {epoch+1}/{num_epochs} | '
              f'Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | '
              f'Time: {timedelta(seconds=int(epoch_time))} | '
              f'Total ETA: {timedelta(seconds=int(total_remaining))}')

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_deeplabv3_defect.pth')

    print(f'\nTraining completed in {timedelta(seconds=int(time.time()-start_time))}')
    return model

In [4]:
# Cell 5: Set hyperparameters and create datasets and dataloaders
batch_size = 4
learning_rate = 0.001
num_epochs = 20
num_classes = 7  # 6 defect classes + background
accumulation_steps = 2  # Experiment with this value

# Image transforms: resize, convert to a float tensor, then normalise with the
# mean/std constants below.
transform = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Mask transforms: masks hold class indices, not intensities. Nearest-neighbour
# resizing avoids inventing in-between labels at defect borders, and
# PILToTensor keeps the raw uint8 values (ToTensor would divide by 255, after
# which the dataset's integer cast turns every label into 0).
mask_transform = transforms.Compose([
    transforms.Resize((384, 384), interpolation=transforms.InterpolationMode.NEAREST),
    transforms.PILToTensor()
])

# The dataset root can be overridden with the DEFECT_DATA_ROOT environment
# variable; the original machine-specific path remains the default.
root_dir = os.environ.get(
    'DEFECT_DATA_ROOT',
    '/Users/sanjanahaldar/Library/CloudStorage/GoogleDrive-sanukadam721@gmail.com/My Drive/Info_Project/Defect_Detection/DataSets/Data.Splitting/After_Melting_Defect_Detection'
)
train_dataset = DefectDataset(root_dir=root_dir,
                              transform=transform,
                              mask_transform=mask_transform,
                              split='train')
val_dataset = DefectDataset(root_dir=root_dir,
                            transform=transform,
                            mask_transform=mask_transform,
                            split='val')

# Keep num_workers=0: DefectDataset is defined inside the notebook, and spawned
# worker processes cannot unpickle a class that lives in the notebook's
# __main__ ("Can't get attribute 'DefectDataset'"). Move the class into an
# importable .py module before raising this.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)


Using 4 workers for data loading.
Using gradient accumulation with 4 steps.


In [5]:
# Cell 6: Build the model, show its classifier head, and set up loss/optimizer
num_classes = 7
print(f"Creating model with num_classes: {num_classes}")
model = create_model(num_classes, backbone='mobilenetv3')

# Only walk the head when it is an nn.Sequential whose layers can be listed.
if not hasattr(model, 'classifier') or not isinstance(model.classifier, nn.Sequential):
    print("Model does not have the expected 'classifier' as an nn.Sequential.")
else:
    print("\nClassifier Structure:")
    for i, layer in enumerate(model.classifier):
        print(f"  [{i}]: {layer}")

# Per-pixel multi-class loss and an Adam optimizer over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = Adam(params=model.parameters(), lr=learning_rate)

# This cell only builds and prints the model; training is launched from a
# separate cell.
 

Creating model with num_classes: 7




In [7]:
# Launch training with the model, loaders, loss and optimizer configured earlier.
trained_model = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=num_epochs,
    accumulation_steps=accumulation_steps,
)

Traceback (most recent call last):
  File "<string>", line 1, in <module>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/envs/deep_learning/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
  File "/opt/anaconda3/envs/deep_learning/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/anaconda3/envs/deep_learning/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/anaconda3/envs/deep_learning/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'DefectDataset' on <module '__main__' (built-in)>
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'DefectDataset' on <module '__main__' (built-in)>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt

RuntimeError: DataLoader worker (pid(s) 59336, 59338) exited unexpectedly