In [1]:

import os
import torch
import imageio.v3 as iio
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

def adjust_video_length(video, desired_frames):
    """Pad or truncate ``video`` along its first dimension.

    Args:
        video: Tensor whose first dimension indexes frames.
        desired_frames: Required size of the first dimension.

    Returns:
        ``video`` itself when it already has ``desired_frames`` frames, the
        first ``desired_frames`` frames when it is longer, or ``video`` with
        all-zero frames appended at the end when it is shorter.
    """
    current_frames = video.shape[0]

    if current_frames < desired_frames:
        pad_frames = desired_frames - current_frames
        # new_zeros inherits dtype *and* device from `video`, so the
        # concatenation also works for tensors that do not live on the CPU.
        padding = video.new_zeros((pad_frames, *video.shape[1:]))
        return torch.cat((video, padding), dim=0)

    if current_frames > desired_frames:
        return video[:desired_frames]

    return video

from PIL import Image

def is_image_valid(image_path):
    """Report whether the file at ``image_path`` opens and verifies with PIL.

    Args:
        image_path: Path of the image file to check.

    Returns:
        True when ``Image.open`` and ``verify()`` both complete without
        raising, False when either of them raises.
    """
    try:
        # The context manager closes the file handle even if verify() raises.
        with Image.open(image_path) as img:
            img.verify()
    except Exception:
        # Best-effort check: any failure means "not usable". Unlike a bare
        # `except:`, KeyboardInterrupt and SystemExit still propagate.
        return False
    return True


import os
import numpy as np
from torch.utils.data import Dataset
from torchvision.transforms import ToTensor
from PIL import Image

class VideoDataset(Dataset):
    """Dataset of videos stored as one sub-directory of PNG frames each.

    Every sub-directory of ``directory`` is one sample. Frames are read from
    files named ``image_<i>.png`` for ``i`` in ``range(num_frames)``.

    Args:
        directory: Folder whose sub-directories each hold one video.
        num_frames: Number of frame files to look for and the length every
            returned video is padded or truncated to. Defaults to 22.
    """

    def __init__(self, directory, num_frames=22):
        self.directory = directory
        self.num_frames = num_frames
        # Sorted so that an index always maps to the same video directory.
        self.video_dirs = sorted(
            d for d in os.listdir(directory)
            if os.path.isdir(os.path.join(directory, d))
        )

    def __len__(self):
        return len(self.video_dirs)

    def __getitem__(self, idx):
        """Return the video at ``idx`` as a stacked tensor of frames.

        Raises:
            RuntimeError: If none of the expected frame files is readable.
        """
        video_dir = os.path.join(self.directory, self.video_dirs[idx])
        to_tensor = ToTensor()  # built once per sample instead of once per frame
        frames = []
        for i in range(self.num_frames):
            image_path = os.path.join(video_dir, f"image_{i}.png")
            if not is_image_valid(image_path):
                continue
            # Close the file handle as soon as the pixels have been converted.
            with Image.open(image_path) as image:
                frames.append(to_tensor(image.convert("RGB")))

        if not frames:
            raise RuntimeError(f"No readable frames found in {video_dir}")

        video = torch.stack(frames)
        # NOTE(review): unreadable frames are skipped and the zero padding is
        # appended at the end, so later frames shift forward in time.
        return adjust_video_length(video, self.num_frames)

class SegmentationDataset(Dataset):
    """Dataset of videos with a segmentation mask stored next to the frames.

    Every sub-directory of ``directory`` is one sample. Frames are read from
    ``image_<i>.png`` for ``i`` in ``range(num_frames)`` and the mask from
    ``mask.npy`` in the same sub-directory.

    Args:
        directory: Folder whose sub-directories each hold one video.
        num_frames: Number of frame files to look for, the length every
            returned video is padded or truncated to, and the first dimension
            the mask is reshaped to. Defaults to 22.
        mask_size: Trailing two dimensions the mask is reshaped to.
            Defaults to ``(160, 240)``.
    """

    def __init__(self, directory, num_frames=22, mask_size=(160, 240)):
        self.directory = directory
        self.num_frames = num_frames
        self.mask_size = tuple(mask_size)
        # Sorted so that an index always maps to the same video directory.
        self.video_dirs = sorted(
            d for d in os.listdir(directory)
            if os.path.isdir(os.path.join(directory, d))
        )

    def __len__(self):
        return len(self.video_dirs)

    def __getitem__(self, idx):
        """Return ``(video, mask)`` for the sample at ``idx``.

        Raises:
            RuntimeError: If none of the expected frame files is readable.
        """
        video_dir = os.path.join(self.directory, self.video_dirs[idx])
        to_tensor = ToTensor()  # built once per sample instead of once per frame
        frames = []
        for i in range(self.num_frames):
            image_path = os.path.join(video_dir, f"image_{i}.png")
            if not is_image_valid(image_path):
                continue
            # Close the file handle as soon as the pixels have been converted.
            with Image.open(image_path) as image:
                frames.append(to_tensor(image.convert("RGB")))

        mask_path = os.path.join(video_dir, "mask.npy")
        mask_data = np.load(mask_path)
        # Reshape the mask data to (num_frames, height, width).
        mask = torch.tensor(
            mask_data.reshape(self.num_frames, *self.mask_size),
            dtype=torch.long,
        )

        if not frames:
            raise RuntimeError(f"No readable frames found in {video_dir}")

        video = torch.stack(frames)
        # NOTE(review): unreadable frames are skipped and the zero padding is
        # appended at the end, so the remaining frames no longer line up with
        # the per-frame mask.
        video = adjust_video_length(video, self.num_frames)
        return video, mask




import os
import numpy as np

def save_generated_masks(generated_masks, output_directory):
    """Save every mask as ``mask_<index>.npy`` inside ``output_directory``.

    Args:
        generated_masks: Iterable of array-like masks; the position of each
            mask in the iterable becomes the index in its file name.
        output_directory: Target folder; created (including parents) when it
            does not exist yet.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_directory, exist_ok=True)

    for i, mask in enumerate(generated_masks):
        output_path = os.path.join(output_directory, f"mask_{i}.npy")
        np.save(output_path, mask)


def calculate_iou_batch(predictions, masks):
    """Foreground intersection-over-union for one batch, as a Python float.

    ``predictions`` is reduced with ``argmax`` over dim 1, then both the
    predicted labels and ``masks`` are binarised with ``> 0.5``. Every
    label other than 0 therefore counts as foreground: this is a
    foreground/background IoU, not a per-class one.

    Args:
        predictions: Score tensor whose dim 1 is reduced by ``argmax``.
        masks: Label tensor compared element-wise with the reduced predictions.

    Returns:
        ``(intersection + 1e-6) / (union + 1e-6)`` summed over the whole batch.
    """
    eps = 1e-6
    predicted_fg = predictions.argmax(dim=1) > 0.5
    target_fg = masks > 0.5

    overlap = torch.logical_and(predicted_fg, target_fg).float().sum()
    combined = torch.logical_or(predicted_fg, target_fg).float().sum()
    return ((overlap + eps) / (combined + eps)).item()




from tqdm.notebook import tqdm



In [2]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
# NOTE(review): this unconditionally rebinds `device` to the CPU, replacing any
# device chosen earlier. Running the cells top to bottom therefore leaves every
# later cell on the CPU.
device = torch.device("cpu")

In [3]:
# Number of epochs; shared by every training loop in this notebook, each of
# which iterates over trange(num_epochs) or range(num_epochs).
num_epochs = 30

In [4]:
class PretextModel(nn.Module):
    """3-D convolutional encoder/decoder used for the pretext task.

    The first convolution takes 22 input channels and the last transposed
    convolution emits 22 output channels. The two max-pool stages halve the
    last two dimensions and the two transposed convolutions double them again,
    so the output has the same shape as the input when those two dimensions
    are divisible by 4.
    """

    def __init__(self):
        super().__init__()
        conv_args = dict(kernel_size=(3, 3, 3), padding=(1, 1, 1))
        resample_args = dict(kernel_size=(1, 2, 2), stride=(1, 2, 2))

        # Layers are created in the same order as they are applied so that
        # parameter names (encoder.0, encoder.3, ...) stay stable.
        encoder_layers = [
            nn.Conv3d(22, 32, **conv_args),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(**resample_args),
            nn.Conv3d(32, 64, **conv_args),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(**resample_args),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Conv3d(64, 32, **conv_args),
            nn.ReLU(inplace=True),
            nn.ConvTranspose3d(32, 32, **resample_args),
            nn.ReLU(inplace=True),
            nn.ConvTranspose3d(32, 22, **resample_args),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` and decode it again."""
        return self.decoder(self.encoder(x))


In [5]:
# Single place to point the notebook at the data. Set the DATASET_ROOT
# environment variable to use a copy stored somewhere else; the default is the
# location this notebook was originally run against.
DATA_ROOT = os.environ.get("DATASET_ROOT", "/scratch/dks7920/Dataset_Student")

unlabeled_dataset = VideoDataset(os.path.join(DATA_ROOT, "unlabeled"))
labeled_train_dataset = SegmentationDataset(os.path.join(DATA_ROOT, "train"))
labeled_val_dataset = SegmentationDataset(os.path.join(DATA_ROOT, "val"))

In [None]:
import time
from tqdm import trange

# Self-supervised learning: train PretextModel to reproduce its own input on
# the complete unlabeled dataset.
pretext_model = PretextModel().to(device)
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(pretext_model.parameters(), lr=0.001)

unlabeled_dataloader = DataLoader(unlabeled_dataset, batch_size=32, shuffle=True)
t = trange(num_epochs)
for epoch in t:
    running_loss = 0.0
    num_batches = 0
    start_epoch = time.time()

    pretext_model.train()
    for batch_idx, videos in enumerate(unlabeled_dataloader):
        start_batch = time.time()
        videos = videos.to(device)
        optimizer.zero_grad()

        predictions = pretext_model(videos)
        loss = criterion(predictions, videos)  # Modify this line based on the pretext task
        loss.backward()
        optimizer.step()

        batch_time = time.time() - start_batch
        running_loss += loss.item()
        num_batches += 1
        t.set_postfix(batch_time=batch_time, batch_idx=batch_idx + 1)

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    avg_train_loss = running_loss / max(num_batches, 1)

    epoch_time = time.time() - start_epoch
    t.set_postfix(
        epoch_time=epoch_time,
        epoch=epoch + 1,
        training_loss=avg_train_loss,
        refresh=True,
    )




In [None]:
import time

from torch.utils.data import Subset
from tqdm import trange

num_samples = 500  # Adjust this to the number of samples you want to use
# Clamp to the dataset size so the Subset never holds out-of-range indices.
small_unlabeled_dataset = Subset(
    unlabeled_dataset, range(min(num_samples, len(unlabeled_dataset)))
)
small_unlabeled_dataloader = DataLoader(small_unlabeled_dataset, batch_size=32, shuffle=True)

pretext_model = PretextModel().to(device)
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(pretext_model.parameters(), lr=0.001)

t = trange(num_epochs)
for epoch in t:
    running_loss = 0.0
    num_batches = 0
    start_epoch = time.time()

    pretext_model.train()
    for batch_idx, videos in enumerate(small_unlabeled_dataloader):
        start_batch = time.time()
        videos = videos.to(device)
        optimizer.zero_grad()

        predictions = pretext_model(videos)
        loss = criterion(predictions, videos)  # Modify this line based on the pretext task
        loss.backward()
        optimizer.step()

        batch_time = time.time() - start_batch
        running_loss += loss.item()
        num_batches += 1
        t.set_postfix(batch_time=batch_time, batch_idx=batch_idx + 1, refresh=False)

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    avg_train_loss = running_loss / max(num_batches, 1)

    epoch_time = time.time() - start_epoch
    t.set_description(f"Epoch {epoch + 1}")
    t.set_postfix(epoch_time=epoch_time, training_loss=avg_train_loss, refresh=True)




Epoch 2:   7%|▋         | 2/30 [04:35<54:46, 117.39s/it, epoch_time=22, training_loss=0.051]   

In [None]:
class SegmentationModel(nn.Module):
    """Segmentation head stacked on the encoder of a pretrained model.

    Args:
        pretrained_model: Object exposing an ``encoder`` module that produces
            64 channels. The encoder is reused by reference (not copied), so
            training this model also updates ``pretrained_model.encoder``.
        num_classes: Number of output channels of the final convolution, one
            score map per class. Defaults to 52.
        output_size: Size passed to ``nn.Upsample`` for the last three
            dimensions of the output. Defaults to ``(22, 160, 240)``.
    """

    def __init__(self, pretrained_model, num_classes=52, output_size=(22, 160, 240)):
        super().__init__()
        self.base_model = pretrained_model.encoder
        self.segmentation_head = nn.Sequential(
            nn.Conv3d(64, 32, kernel_size=(3, 3, 3), padding=(1, 1, 1)),
            nn.ReLU(inplace=True),
            # Resize the encoder's feature volume to the size of the target mask.
            nn.Upsample(size=tuple(output_size), mode="nearest"),
            nn.Conv3d(32, 16, kernel_size=(3, 3, 3), padding=(1, 1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv3d(16, num_classes, kernel_size=(3, 3, 3), padding=(1, 1, 1)),
        )

    def forward(self, x):
        """Return per-class scores of shape ``(batch, num_classes, *output_size)``."""
        x = self.base_model(x)
        x = self.segmentation_head(x)
        return x


In [None]:
num_samples = 200
# Clamp to each dataset's size so the Subsets never hold out-of-range indices
# when a split contains fewer than num_samples videos.
small_train_dataset = Subset(
    labeled_train_dataset, range(min(num_samples, len(labeled_train_dataset)))
)
small_val_dataset = Subset(
    labeled_val_dataset, range(min(num_samples, len(labeled_val_dataset)))
)
train_dataloader = DataLoader(small_train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(small_val_dataset, batch_size=16, shuffle=False)

In [None]:
# Loaders over the complete labeled train / val datasets. Running this cell
# rebinds train_dataloader and val_dataloader, replacing any loaders previously
# stored under those names. Only the training loader is shuffled.
train_dataloader = DataLoader(labeled_train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(labeled_val_dataset, batch_size=16, shuffle=False)

In [None]:
# Supervised fine-tuning: train the segmentation head (and the shared encoder)
# on the labeled data and checkpoint whenever validation IoU improves.
segmentation_model = SegmentationModel(pretrained_model=pretext_model).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(segmentation_model.parameters(), lr=0.001)
t = trange(num_epochs, position=0, leave=True)
# NOTE(review): hard-coded starting threshold. No checkpoint is written until
# an epoch beats this value; presumably carried over from an earlier run.
# Confirm, or start from 0.0 for a fresh run.
best_iou = 0.1492
for epoch in t:
    running_loss = 0.0
    num_batches = 0

    # Training loop
    segmentation_model.train()
    for videos, masks in train_dataloader:
        videos = videos.to(device)
        masks = masks.to(device)
        optimizer.zero_grad()
        predictions = segmentation_model(videos)
        loss = criterion(predictions, masks)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_batches += 1
    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    avg_train_loss = running_loss / max(num_batches, 1)

    # Validation loop
    segmentation_model.eval()
    val_loss = 0
    iou_sum = 0
    num_batches = 0
    with torch.no_grad():
        for videos, masks in val_dataloader:
            videos = videos.to(device)
            masks = masks.to(device)
            predictions = segmentation_model(videos)
            loss = criterion(predictions, masks)
            val_loss += loss.item()
            iou_sum += calculate_iou_batch(predictions, masks)
            num_batches += 1

    # Calculate average validation loss and IoU
    avg_val_loss = val_loss / max(num_batches, 1)
    avg_iou = iou_sum / max(num_batches, 1)

    # Update the progress bar with training and validation information
    t.set_description(f"Epoch {epoch + 1}")
    t.set_postfix(
        train_loss=f"{avg_train_loss:.4f}",
        val_loss=f"{avg_val_loss:.4f}",
        iou=f"{avg_iou:.4f}",
        refresh=True,
    )
    if avg_iou > best_iou:
        best_iou = avg_iou
        torch.save(segmentation_model.state_dict(), "best_model.pth")
        t.write(f"Best model saved at Epoch {epoch + 1} with IOU: {avg_iou:.4f}")


In [15]:
# Show which device the notebook is currently configured to use.
print(device)

cuda:0


In [None]:
import numpy as np

def count_classes(dataset):
    """Count the distinct label values found in the masks of ``dataset``.

    Args:
        dataset: Iterable of ``(video, mask)`` pairs. Only ``mask`` is read;
            it is iterated frame by frame and each frame must offer ``.numpy()``.

    Returns:
        Number of different values seen across all mask frames.
    """
    labels_seen = set()
    for _video, mask in dataset:
        # Collect the unique labels of each frame of the mask in turn.
        for frame in mask:
            labels_seen |= set(np.unique(frame.numpy()).tolist())
    return len(labels_seen)

# count_classes iterates over the whole training dataset, so this cell reads
# every labeled training sample once before it prints.
num_classes = count_classes(labeled_train_dataset)
print(f"Number of classes: {num_classes}")


In [None]:
# Load one mask file straight from disk and print the array's shape as stored,
# i.e. before any reshaping is applied to it.
mask_data2 = np.load("/scratch/dks7920/Dataset_Student/train/video_301/mask.npy")
print(f"Mask data shape: {mask_data2.shape}")

In [None]:
# 1. Implementing a custom pretext task
class NextFramePredictionModel(nn.Module):
    """3-D convolutional encoder/decoder for a next-frame prediction pretext task.

    The layers are ``Conv3d``, so the input is 5-D with dim 1 as the
    convolution channel axis. The first convolution expects ``in_frames``
    entries on that axis and the output carries ``out_channels`` entries on it;
    the remaining dimensions keep their size when the last two are divisible
    by 4.

    Args:
        in_frames: Number of input channels of the first convolution.
            Defaults to 22, matching the encoder of the reconstruction model.
        out_channels: Number of output channels of the last transposed
            convolution. Defaults to 3.
    """

    def __init__(self, in_frames=22, out_channels=3):
        super().__init__()
        # Use the same encoder architecture as in the original code
        self.encoder = nn.Sequential(
            nn.Conv3d(in_frames, 32, kernel_size=(3, 3, 3), padding=(1, 1, 1)),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)),
            nn.Conv3d(32, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1)),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)),
        )

        # Use a different decoder architecture for the pretext task
        self.decoder = nn.Sequential(
            nn.Conv3d(64, 32, kernel_size=(3, 3, 3), padding=(1, 1, 1)),
            nn.ReLU(inplace=True),
            nn.ConvTranspose3d(32, 32, kernel_size=(1, 2, 2), stride=(1, 2, 2)),
            nn.ReLU(inplace=True),
            nn.ConvTranspose3d(32, out_channels, kernel_size=(1, 2, 2), stride=(1, 2, 2)),
        )

    def forward(self, x):
        """Encode ``x`` and decode it into ``out_channels`` channels."""
        x = self.encoder(x)
        x = self.decoder(x)
        return x


In [None]:


# Replace the pretext model with the new model
import time

from torch.utils.data import ConcatDataset, Dataset
from tqdm import trange

pretext_model = NextFramePredictionModel().to(device)
criterion = nn.MSELoss().to(device)
# The optimizer has to be rebuilt for the new model. An `optimizer` left over
# from an earlier cell would keep stepping another model's parameters and leave
# this one untrained.
optimizer = optim.Adam(pretext_model.parameters(), lr=0.001)

t = trange(num_epochs)
for epoch in t:
    running_loss = 0.0
    num_batches = 0
    start_epoch = time.time()
    pretext_model.train()
    for batch_idx, videos in enumerate(small_unlabeled_dataloader):
        start_batch = time.time()
        videos = videos.to(device)
        optimizer.zero_grad()

        target_videos = videos[:, -1]
        # The encoder's first Conv3d takes 22 input channels (one per frame),
        # so keep all 22 slots and blank out the frame to be predicted instead
        # of slicing it off.
        input_videos = videos.clone()
        input_videos[:, -1] = 0

        predictions = pretext_model(input_videos)
        # The decoder output has an extra channel axis at dim 1 compared with a
        # single target frame; average it away so both tensors share one shape.
        predictions = predictions.mean(dim=1)

        loss = criterion(predictions, target_videos)
        loss.backward()
        optimizer.step()

        batch_time = time.time() - start_batch
        running_loss += loss.item()
        num_batches += 1
        t.set_postfix(batch_time=batch_time, batch_idx=batch_idx + 1, refresh=False)

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    avg_train_loss = running_loss / max(num_batches, 1)

    epoch_time = time.time() - start_epoch
    t.set_description(f"Epoch {epoch + 1}")
    t.set_postfix(epoch_time=epoch_time, training_loss=avg_train_loss, refresh=True)


# 3. Semi-supervised learning - pseudo-labeling
class PseudoLabeledDataset(Dataset):
    """Pairs each video of ``videos`` with the mask stored at the same index."""

    def __init__(self, videos, masks):
        if len(videos) != len(masks):
            raise ValueError(
                f"Got {len(videos)} videos but {len(masks)} pseudo-label masks"
            )
        self.videos = videos
        self.masks = masks

    def __len__(self):
        return len(self.masks)

    def __getitem__(self, idx):
        return self.videos[idx], self.masks[idx]


# Generate pseudo-labels for the unlabeled data.
# NOTE(review): `segmentation_model` is the model built in an earlier cell; it
# is not rebuilt on top of the next-frame encoder trained above.
segmentation_model.eval()
# shuffle=False keeps prediction i aligned with small_unlabeled_dataset[i].
pseudo_label_loader = DataLoader(small_unlabeled_dataset, batch_size=16, shuffle=False)
pseudo_label_batches = []
with torch.no_grad():
    for videos in pseudo_label_loader:
        videos = videos.to(device)
        # Feed complete videos: the encoder expects all 22 frames as channels.
        predictions = segmentation_model(videos)
        pseudo_label_batches.append(predictions.argmax(dim=1).cpu())
pseudo_labels = torch.cat(pseudo_label_batches)

combined_dataset = ConcatDataset(
    [
        labeled_train_dataset,
        PseudoLabeledDataset(small_unlabeled_dataset, pseudo_labels),
    ]
)
combined_dataloader = DataLoader(combined_dataset, batch_size=16, shuffle=True)

# Segmentation training needs its own loss and optimizer: the MSE criterion and
# the optimizer created above belong to the next-frame model.
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(segmentation_model.parameters(), lr=0.001)

t = trange(num_epochs)
for epoch in t:
    running_loss = 0.0
    num_batches = 0

    # Training loop over labeled plus pseudo-labeled samples
    segmentation_model.train()
    for videos, masks in combined_dataloader:
        videos = videos.to(device)
        masks = masks.to(device)
        optimizer.zero_grad()
        predictions = segmentation_model(videos)
        loss = criterion(predictions, masks)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_batches += 1
    avg_train_loss = running_loss / max(num_batches, 1)

    # Validation loop
    segmentation_model.eval()
    val_loss = 0
    iou_sum = 0
    num_batches = 0
    with torch.no_grad():
        for videos, masks in val_dataloader:
            videos = videos.to(device)
            masks = masks.to(device)
            predictions = segmentation_model(videos)
            loss = criterion(predictions, masks)
            val_loss += loss.item()
            iou_sum += calculate_iou_batch(predictions, masks)
            num_batches += 1

    # Calculate average validation loss and IoU
    avg_val_loss = val_loss / max(num_batches, 1)
    avg_iou = iou_sum / max(num_batches, 1)

    # Update the progress bar with training and validation information
    t.set_description(f"Epoch {epoch + 1}")
    t.set_postfix(
        train_loss=f"{avg_train_loss:.4f}",
        val_loss=f"{avg_val_loss:.4f}",
        iou=f"{avg_iou:.4f}",
        refresh=True,
    )
    if avg_iou > best_iou:
        best_iou = avg_iou
        torch.save(segmentation_model.state_dict(), "best_model.pth")
        t.write(f"Best model saved at Epoch {epoch + 1} with IOU: {avg_iou:.4f}")
