In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.9.11-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.1-cp312-none-win_amd64.whl.metadata (6.9 kB)
Downloading transformers-4.45.2-py3-none-any.whl (9.9 MB)
   ---------------------------------------- 0.0/9.9 MB ? eta -:--:--
   - -------------------------------------- 0.3/9.9 MB ? eta -:--:--
   -- ------------------------------------- 0.5/9.9 MB 882.6 kB/s eta 0:00:11
   -- ------------------------------------- 0.5/9.9 MB 882.6 kB/s eta 0:00:11
   ---- ----------------------------------- 1.0/9.9 MB 1.2 MB/s eta 0:00:08
   ------- -------------------------------- 1.8/9.9 MB 1.7 MB/s eta 0:00:05
   --------- ------------------------------ 2.4/9.9 MB 1.8 MB/s eta 0:00:05
   ---------- ----------------------------- 2.6/9.9 MB 1.8 MB/s eta 0:00:04
 

In [4]:
from transformers import SegformerForSemanticSegmentation, SegformerFeatureExtractor
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

# Define number of classes for your segmentation task (example: 3 classes)
num_classes = 40

# Load a pretrained SegFormer model (B0 version is a good starting point)
model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512", num_labels=num_classes, ignore_mismatched_sizes=True)

# Load feature extractor to process input images
feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")

Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/segformer-b0-finetuned-ade-512-512 and are newly initialized because the shapes did not match:
- decode_head.classifier.bias: found shape torch.Size([150]) in the checkpoint and torch.Size([40]) in the model instantiated
- decode_head.classifier.weight: found shape torch.Size([150, 256, 1, 1]) in the checkpoint and torch.Size([40, 256, 1, 1]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return func(*args, **kwargs)


In [9]:
from torch.utils.data import Dataset
import torchvision.transforms as transforms
import os

class SegmentationDataset(Dataset):
    def __init__(self, image_folder, label_folder, transform=None, feature_extractor=None):
        self.image_folder = image_folder
        self.label_folder = label_folder
        self.image_files = []
        self.feature_extractor = feature_extractor  # Add feature extractor
        
        for batch_folder in os.listdir(image_folder):
            batch_path = os.path.join(image_folder, batch_folder)
            if os.path.isdir(batch_path):
                self.image_files += [os.path.join(batch_path, f) for f in os.listdir(batch_path) if f.endswith('_leftImg8bit.jpg')]
        
        if not self.image_files:
            raise ValueError("No images found in the image folder!")

        self.transform = transform

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_path = self.image_files[idx]
        img_name = os.path.basename(img_path)
        batch_name = os.path.basename(os.path.dirname(img_path))

        label_path = os.path.join(self.label_folder, batch_name, f"{img_name.split('_leftImg8bit')[0]}_gtFine_labelColors.png")
        image = Image.open(img_path).convert("RGB")
        label = Image.open(label_path).convert("L")  # Convert label to grayscale

        if self.transform:
            image = self.transform(image)
            label = self.transform(label)

        # Apply feature extractor
        inputs = self.feature_extractor(images=image, return_tensors="pt")
        
        return inputs['pixel_values'].squeeze(), torch.tensor(label)

# Update transform to match model input requirements
transform = transforms.Compose([
    transforms.Resize((512, 512)),  # SegFormer expects 512x512 input
    transforms.ToTensor()
])

train_folder = 'D:/New folder/Inter_Bootcamp/dataset/train'
labels_folder = 'D:/New folder/Inter_Bootcamp/dataset/labels'

# Create dataset and dataloader
dataset = SegmentationDataset(train_folder, labels_folder, transform=transform, feature_extractor=feature_extractor)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)


In [16]:
!pip install tqdm



In [17]:
import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import os
import json
import segmentation_models_pytorch as smp
import torchmetrics.functional as tmf
from torchmetrics import JaccardIndex 
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split


# Optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss()  # Suitable for multi-class segmentation tasks

# Fine-tuning loop
from tqdm import tqdm
import torch.nn.functional as F

# Training loop with tqdm for progress bar
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    # Wrap the dataloader with tqdm for the progress bar
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
    
    for images, labels in progress_bar:
        images, labels = images.to(device), labels.to(device)

        # Remove channel dimension from labels (squeeze to make shape [batch_size, height, width])
        labels = labels.squeeze(1).long()

        # Forward pass
        outputs = model(images).logits  # Outputs shape: (batch_size, num_classes, 128, 128)

        # Upsample the outputs to match target size (batch_size, num_classes, 512, 512)
        outputs = F.interpolate(outputs, size=labels.shape[1:], mode='bilinear', align_corners=False)

        # Compute loss
        loss = criterion(outputs, labels)

        total_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update progress bar with current loss
        progress_bar.set_postfix(loss=total_loss / len(progress_bar))

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader)}")


  return inputs['pixel_values'].squeeze(), torch.tensor(label)
Epoch 1/10:   2%|▏         | 34/1759 [00:42<36:20,  1.26s/batch, loss=0.0423]  


KeyboardInterrupt: 