In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PatchEmbedding(nn.Module):
    """Turn an image batch into a sequence of patch embeddings.

    A single convolution whose kernel size and stride both equal ``patch_size``
    projects every non-overlapping patch to an ``embed_dim``-dimensional vector.
    The spatial grid of the result is then flattened into one token axis.

    Args:
        img_size: Stored on the instance; not used by this module's computation.
        patch_size: Kernel size and stride of the projecting convolution.
        in_channels: Number of channels of the input images.
        embed_dim: Number of output channels, i.e. the size of each patch embedding.
    """

    def __init__(self, img_size, patch_size, in_channels, embed_dim):
        super().__init__()
        # Keep the constructor arguments available as attributes.
        self.img_size, self.patch_size = img_size, patch_size
        self.in_channels, self.embed_dim = in_channels, embed_dim

        # kernel_size == stride, so the kernel windows tile the input without overlap.
        self.patch_embed = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        """Project ``x`` and move the flattened patch axis in front of the channel axis.

        For a batched ``(B, C, H, W)`` input the result has shape
        ``(B, num_patches, embed_dim)``.
        """
        projected = self.patch_embed(x)
        tokens = projected.flatten(start_dim=2)
        return tokens.permute(0, 2, 1)


class TransformerEncoder(nn.Module):
    """One post-norm Transformer encoder layer operating on batch-first sequences.

    The layer applies multi-head self-attention followed by a two-layer ReLU
    feed-forward network. Each sub-layer is wrapped in dropout, a residual
    connection and a LayerNorm (applied after the residual sum).

    Args:
        embed_dim: Size of each token embedding (last input dimension).
        num_heads: Number of attention heads.
        hidden_dim: Width of the hidden layer of the feed-forward network.
        dropout_rate: Dropout probability used inside attention and on both
            sub-layer outputs.
    """

    def __init__(self, embed_dim, num_heads, hidden_dim, dropout_rate):
        super(TransformerEncoder, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.hidden_dim = hidden_dim

        # batch_first=True: the sequences handed to this layer are laid out as
        # (batch, tokens, embed_dim). With the default (batch_first=False) the
        # module would treat dim 0 as the sequence axis and attend across the
        # batch, mixing information between unrelated samples.
        self.attention = nn.MultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=self.num_heads,
            dropout=dropout_rate,
            batch_first=True,
        )
        self.feed_forward = nn.Sequential(
            nn.Linear(self.embed_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.embed_dim)
        )
        self.norm1 = nn.LayerNorm(self.embed_dim)
        self.norm2 = nn.LayerNorm(self.embed_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Apply self-attention and the feed-forward network to ``x``.

        Args:
            x: Tensor of shape ``(batch, tokens, embed_dim)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # The attention weights are discarded, so skip computing them.
        attention_out = self.attention(x, x, x, need_weights=False)[0]
        x = x + self.dropout(attention_out)
        x = self.norm1(x)

        feed_forward_out = self.feed_forward(x)
        x = x + self.dropout(feed_forward_out)
        x = self.norm2(x)
        return x


class VisionTransformer(nn.Module):
    """Vision Transformer encoder with a 1x1-convolution segmentation head.

    The image is split into patches, a learnable class token is prepended,
    learnable positional embeddings are added, and the sequence is passed
    through ``num_layers`` encoder layers. The patch tokens are then arranged
    back into their 2-D grid, mapped to ``num_classes`` channels by a 1x1
    convolution, and bilinearly upsampled to the input resolution.

    Args:
        img_size: Side length of the (square) input images; fixes the number
            of positional embeddings.
        patch_size: Side length of each square patch.
        in_channels: Number of channels of the input images.
        embed_dim: Token embedding size.
        num_heads: Attention heads per encoder layer.
        hidden_dim: Hidden width of each encoder layer's feed-forward network.
        num_layers: Number of encoder layers.
        num_classes: Number of output channels (one logit map per class).
        dropout_rate: Dropout probability used in the encoder layers and on
            the encoded patch tokens.
    """

    def __init__(self, img_size, patch_size, in_channels, embed_dim, num_heads, hidden_dim, num_layers, num_classes, dropout_rate):
        super(VisionTransformer, self).__init__()
        # Needed in forward() to rebuild the 2-D patch grid from the token sequence.
        self.patch_size = patch_size

        self.patch_embedding = PatchEmbedding(img_size=img_size, patch_size=patch_size, in_channels=in_channels,
                                              embed_dim=embed_dim)

        self.transformer_encoder = nn.ModuleList([
            TransformerEncoder(embed_dim=embed_dim, num_heads=num_heads, hidden_dim=hidden_dim,
                               dropout_rate=dropout_rate) for _ in range(num_layers)
        ])

        self.class_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # One embedding per patch of an img_size x img_size image, plus one for the class token.
        self.position_embedding = nn.Parameter(torch.zeros(1, (img_size // patch_size) ** 2 + 1, embed_dim))
        self.dropout = nn.Dropout(dropout_rate)
        self.segmentation_head = nn.Conv2d(embed_dim, num_classes, kernel_size=1)

    def forward(self, x):
        """Compute per-pixel class logits.

        Args:
            x: Image batch of shape ``(batch, in_channels, H, W)``; ``H`` and
                ``W`` must yield the patch count the positional embedding was
                built for.

        Returns:
            Tensor of shape ``(batch, num_classes, H, W)``.

        Raises:
            ValueError: If the number of patches does not match the positional
                embedding.
        """
        height, width = x.shape[-2:]

        tokens = self.patch_embedding(x)
        batch_size, num_patches, embed_dim = tokens.shape
        expected_patches = self.position_embedding.shape[1] - 1
        if num_patches != expected_patches:
            raise ValueError(
                f"Input of size {height}x{width} produces {num_patches} patches, "
                f"but the positional embedding was built for {expected_patches}."
            )

        class_token = self.class_token.expand(batch_size, -1, -1)
        tokens = torch.cat((class_token, tokens), dim=1)
        tokens = tokens + self.position_embedding

        for encoder in self.transformer_encoder:
            tokens = encoder(tokens)

        # Segmentation needs one feature vector per patch, so drop the class
        # token (index 0) and keep the patch tokens.
        patch_tokens = self.dropout(tokens[:, 1:])

        # (batch, patches, dim) -> (batch, dim, grid_h, grid_w): undo the
        # flatten/transpose performed by the patch embedding.
        grid_h = height // self.patch_size
        grid_w = width // self.patch_size
        feature_map = patch_tokens.transpose(1, 2).reshape(batch_size, embed_dim, grid_h, grid_w)

        logits = self.segmentation_head(feature_map)
        # The head produces one prediction per patch; upsample to one per input pixel.
        logits = F.interpolate(logits, size=(height, width), mode="bilinear", align_corners=False)

        return logits


In [41]:
import os

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor

from Data import CityscapesDataset


# Set device: use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch's global RNG so repeated runs start from the same random state.
seed = 42
torch.manual_seed(seed)

# Define hyperparameters
num_classes = 20
batch_size = 16
num_epochs = 10
learning_rate = 0.001

# Dataset and DataLoader
# The dataset location can be overridden through the CITYSCAPES_ROOT environment
# variable so the notebook is not tied to one machine's directory layout; the
# previous hard-coded location remains the default.
dataset_root = os.environ.get("CITYSCAPES_ROOT", "/home/mys/ENPM_673/FInal_Project/Dataset")
if not os.path.isdir(dataset_root):
    # Fail early with an actionable message instead of deep inside the dataset class.
    raise FileNotFoundError(
        f"Dataset root '{dataset_root}' does not exist. Set the CITYSCAPES_ROOT "
        "environment variable to the dataset directory (it is expected to contain 'train/images')."
    )
train_dataset = CityscapesDataset(root_dir=dataset_root, split="train", transform=ToTensor())
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Model
model = VisionTransformer(
    img_size=512,
    patch_size=16,
    in_channels=3,
    embed_dim=768,
    num_heads=12,
    hidden_dim=3072,
    num_layers=12,
    num_classes=num_classes,
    dropout_rate=0.1
).to(device)

# Loss function and optimizer
# NOTE(review): nn.CrossEntropyLoss expects class-index targets of dtype long with
# shape (batch, H, W) for (batch, classes, H, W) logits; confirm that
# CityscapesDataset returns labels in that form.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
total_step = len(train_loader)
for epoch in range(num_epochs):
    model.train()
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the loss of the current batch every 100 steps.
        if (i + 1) % 100 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{total_step}], Loss: {loss.item():.4f}")

# Save the trained model (parameters only, via state_dict)
torch.save(model.state_dict(), "vision_transformer_segmentation.pth")


FileNotFoundError: [Errno 2] No such file or directory: '/home/mys/ENPM_673/FInal_Project/Dataset/train/images'