In [6]:

import torch
import torchvision
from torchvision import models, datasets, transforms
from torchvision.models.segmentation.deeplabv3 import DeepLabHead
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import numpy as np
import time
import json
# import deeplabv3_resnet50
from torchvision.models.segmentation import deeplabv3_resnet50


with open('config.json') as config_file:
    config = json.load(config_file)

# Use the values from the configuration file
dataset_path = config['data_path']
num_epochs = config['num_epochs']
save_dir = config['save_dir']
continue_training = config['continue_training']

num_classes = 20
model = deeplabv3_resnet50(weights=None, num_classes=20, aux_loss=True)
# Number of effective classes after mapping (19 classes + 1 background)

# Replace the classifier of the model

# Mapping for reducing classes to 20 including background
mapping_20 = {
    0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 1, 8: 2, 9: 0,
    10: 0, 11: 3, 12: 4, 13: 5, 14: 0, 15: 0, 16: 0, 17: 6, 18: 0,
    19: 7, 20: 8, 21: 9, 22: 10, 23: 11, 24: 12, 25: 13, 26: 14,
    27: 15, 28: 16, 29: 0, 30: 0, 31: 17, 32: 18, 33: 19, -1: 0
}

def encode_labels(mask):
    label_mask = np.zeros_like(mask)
    for k in mapping_20:
        label_mask[mask == k] = mapping_20[k]
    return label_mask

def transform_target(target):
    target = np.array(target)  # Convert PIL Image to numpy array
    target = encode_labels(target)  # Remap labels
    return torch.as_tensor(target, dtype=torch.int64)  # Convert numpy array to tensor

target_transform = transforms.Compose([
    transform_target
])

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Define the dataset with appropriate transforms for both images and targets
train_dataset = datasets.Cityscapes(root=dataset_path, split='train', mode='fine', target_type='semantic',
                                    transform=transform, target_transform=target_transform)
val_dataset = datasets.Cityscapes(root=dataset_path, split='val', mode='fine', target_type='semantic',
                                  transform=transform, target_transform=target_transform)
# test_dataset = datasets.Cityscapes(root=dataset_path, split='test', mode='fine', target_type='semantic', transform=transform, target_transform=target_transform)

# batch size should be set to 4 or more on GPU for training
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, drop_last=True)
# test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("training on", device)
model.to(device)
criterion = torch.nn.CrossEntropyLoss()

max_iter = num_epochs * len(train_loader)
learning_rate = 0.001  # Initial learning rate
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = StepLR(optimizer, step_size=7, gamma=0.1) # learning rate decay，reduce the learning rate by a factor of 0.1 every 7 epochs

if continue_training:
    model.load_state_dict(torch.load(save_dir +'/best_model_weights.pth'))

# Training loop
train_loss_list = [] # total loss
val_loss_list = [] # total # loss
best_val_loss = float('inf')

training on cpu


In [8]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )
print_trainable_parameters(model)

trainable params: 42003560 || all params: 42003560 || trainable%: 100.00


In [11]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["classifier"],
)
lora_model = get_peft_model(model, config)
print_trainable_parameters(lora_model)

ValueError: Target modules {'query', 'value'} not found in the base model. Please check the target modules and try again.

In [13]:

model_checkpoint = "google/vit-base-patch16-224-in21k"  # pre-trained model from which to fine-tune

from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

model = AutoModelForImageClassification.from_pretrained(
    model_checkpoint,
    ignore_mismatched_sizes=True,  # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
)
print_trainable_parameters(model)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 85800194 || all params: 85800194 || trainable%: 100.00


In [14]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["classifier"],
)
lora_model = get_peft_model(model, config)
print_trainable_parameters(lora_model)

trainable params: 591362 || all params: 86391556 || trainable%: 0.68


In [15]:
for name, module in model.named_modules():
    print(name)


vit
vit.embeddings
vit.embeddings.patch_embeddings
vit.embeddings.patch_embeddings.projection
vit.embeddings.dropout
vit.encoder
vit.encoder.layer
vit.encoder.layer.0
vit.encoder.layer.0.attention
vit.encoder.layer.0.attention.attention
vit.encoder.layer.0.attention.attention.query
vit.encoder.layer.0.attention.attention.query.base_layer
vit.encoder.layer.0.attention.attention.query.lora_dropout
vit.encoder.layer.0.attention.attention.query.lora_dropout.default
vit.encoder.layer.0.attention.attention.query.lora_A
vit.encoder.layer.0.attention.attention.query.lora_A.default
vit.encoder.layer.0.attention.attention.query.lora_B
vit.encoder.layer.0.attention.attention.query.lora_B.default
vit.encoder.layer.0.attention.attention.query.lora_embedding_A
vit.encoder.layer.0.attention.attention.query.lora_embedding_B
vit.encoder.layer.0.attention.attention.key
vit.encoder.layer.0.attention.attention.value
vit.encoder.layer.0.attention.attention.value.base_layer
vit.encoder.layer.0.attention.at

In [16]:
import torch
import torchvision
from torchvision import models, datasets, transforms
from torchvision.models.segmentation.deeplabv3 import DeepLabHead
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import numpy as np
import time
import json
# import deeplabv3_resnet50
from torchvision.models.segmentation import deeplabv3_resnet50
from transformers import ViTModel, ViTConfig
from peft import LoraConfig, get_peft_model

# Load the pretrained Vision Transformer
model_checkpoint = "google/vit-base-patch16-224-in21k"
config = ViTConfig.from_pretrained(model_checkpoint)
vit_model = ViTModel.from_pretrained(model_checkpoint, add_pooling_layer=False)  # No pooling to maintain spatial dimensions

with open('config.json') as config_file:
    config = json.load(config_file)

# Use the values from the configuration file
dataset_path = config['data_path']
num_epochs = config['num_epochs']
save_dir = config['save_dir']
continue_training = config['continue_training']

# Configure LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["classifier"],
)
lora_model = get_peft_model(vit_model, lora_config)


class ViTBackbone(torch.nn.Module):
    def __init__(self, vit_model):
        super().__init__()
        self.vit = vit_model

    def forward(self, x):
        outputs = self.vit(x)
        # Assume the output is a tuple (last_hidden_state, ...)
        return outputs.last_hidden_state  # Returning the feature map


# Replace the backbone in DeepLabV3
model = deeplabv3_resnet50(weights=None, num_classes=20, aux_loss=True)
model.backbone = ViTBackbone(lora_model)
# Assuming ViT outputs 768-dimensional features
model.classifier = DeepLabHead(768, 20)

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTModel: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
print_trainable_parameters(lora_model)
print_trainable_parameters(model)

trainable params: 589824 || all params: 86388480 || trainable%: 0.68
trainable params: 9582632 || all params: 95381288 || trainable%: 10.05
