In [1]:
%pip install torch torchvision transformers diffusers datasets accelerate
#!unzip dataset.zip



In [2]:
from transformers import AutoImageProcessor, Dinov2ForImageClassification
from diffusers import DiffusionPipeline
import torch
from diffusers import DPMSolverMultistepScheduler
import requests
import torchvision.transforms as transforms
from PIL import Image
from datasets import load_dataset
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
from torch import nn
from torchsummary import summary

# discriminator_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-small")
# discriminator_model = Dinov2ForImageClassification.from_pretrained("facebook/dinov2-small")

# discriminator_model.classifier = torch.nn.Sequential(
#     torch.nn.Flatten(),  # Flatten the output
#     torch.nn.Linear(768, 256),  # Adjust the input features to match flattened output
#     torch.nn.ReLU(),
#     #torch.nn.functional.Normalize(),  # Batch normalization layer
#     #torch.nn.BatchNorm1d(256),
#     torch.nn.LayerNorm(256),
#     torch.nn.Linear(256, 1),
#     # torch.nn.Hardtanh()
#     torch.nn.Sigmoid()
# )

# custom weights initialization per il generatore e discriminatore
# dall'articolo oroginale DGGAN "all model weights shall be randomly initialized from a Normal distribution with mean=0, stdev=0.02"
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0)

class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        self.main = nn.Sequential(
            # input is (nc) x 28 x 28
            nn.Conv2d(3, 768, 2, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf) x 15 x 15
            nn.Conv2d(768, 768 * 2, 2, 2, 1, bias=False),
            nn.BatchNorm2d(768 * 2),
            nn.LeakyReLU(0.2, inplace=True),

            # state size. (ndf*2) x 8 x 8
            nn.Conv2d(768 * 2, 128 * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(128 * 4),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*4) x 4 x 4
            nn.Conv2d(128 * 4, 1, 96, 1, 0, bias=False),
            nn.Sigmoid()
        )
    def forward(self, input):
        return self.main(input)


discriminator_model = Discriminator().to(device)
discriminator_model.apply(weights_init)

summary(discriminator_model, (3, 768, 768))


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1        [-1, 768, 385, 385]           9,216
         LeakyReLU-2        [-1, 768, 385, 385]               0
            Conv2d-3       [-1, 1536, 193, 193]       4,718,592
       BatchNorm2d-4       [-1, 1536, 193, 193]           3,072
         LeakyReLU-5       [-1, 1536, 193, 193]               0
            Conv2d-6          [-1, 512, 96, 96]      12,582,912
       BatchNorm2d-7          [-1, 512, 96, 96]           1,024
         LeakyReLU-8          [-1, 512, 96, 96]               0
            Conv2d-9              [-1, 1, 1, 1]       4,718,592
          Sigmoid-10              [-1, 1, 1, 1]               0
Total params: 22,033,408
Trainable params: 22,033,408
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 6.75
Forward/backward pass size (MB): 3154.55
Params size (MB): 84.05
E

In [5]:
teacher_model = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", safety_checker=None, torch_dtype=torch.float16)
teacher_model.scheduler = DPMSolverMultistepScheduler.from_config(teacher_model.scheduler.config)
#teacher_model.safety_checker = lambda image : False

student_model = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5",safety_checker=None, torch_dtype=torch.float16)
student_model.scheduler = DPMSolverMultistepScheduler.from_config(student_model.scheduler.config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


In [6]:
# Move models to the chosen device

# discriminator_model = discriminator_model.to(device)
student_model = student_model.to(device)
teacher_model = teacher_model.to(device)
print(discriminator_model)


Discriminator(
  (main): Sequential(
    (0): Conv2d(3, 768, kernel_size=(2, 2), stride=(2, 2), padding=(1, 1), bias=False)
    (1): LeakyReLU(negative_slope=0.2, inplace=True)
    (2): Conv2d(768, 1536, kernel_size=(2, 2), stride=(2, 2), padding=(1, 1), bias=False)
    (3): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): LeakyReLU(negative_slope=0.2, inplace=True)
    (5): Conv2d(1536, 512, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (6): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): LeakyReLU(negative_slope=0.2, inplace=True)
    (8): Conv2d(512, 1, kernel_size=(96, 96), stride=(1, 1), bias=False)
    (9): Sigmoid()
  )
)


In [7]:
# Set up the loss functions for the student model

# def adversarial_loss_function(y_pred, y_true):
#     return torch.nn.functional.relu(1 - y_pred * y_true)

# def student_loss_function(y_pred, y_true):
#     return - y_pred * y_true

# # Hinge loss for adversarial loss
# adversarial_loss_function.requires_gradient = True

# MSE loss for distillation loss

adversarial_loss_function = torch.nn.BCELoss()

distillation_loss_function = torch.nn.BCELoss()
distillation_loss_function.requires_gradient = True

# Set up the optimizer for the student model
student_optimizer = torch.optim.Adam(student_model.unet.parameters(), lr=0.1)

# Assuming you have a discriminator model defined
discriminator_model_optimizer = torch.optim.Adam(discriminator_model.parameters(), lr=0.000001)

In [8]:
# load the train dataset from folder dataset/captions_train2017.json
dataset = load_dataset('json', data_files='dataset/dataset.json')

train_dataset = dataset["train"]

#split the dataset into train and validation
dataset = train_dataset.train_test_split(test_size=0.1)

train_dataset = dataset["train"]
validation_dataset = dataset["test"]

#further split the train dataset into train and validation
dataset = train_dataset.train_test_split(test_size=0.1)

train_dataset = dataset["train"]
test_dataset = dataset["test"]

print(train_dataset)
print(validation_dataset)
print(test_dataset)

torch.set_grad_enabled(True)
torch.autograd.set_detect_anomaly(True)


Dataset({
    features: ['caption', 'image'],
    num_rows: 239659
})
Dataset({
    features: ['caption', 'image'],
    num_rows: 29588
})
Dataset({
    features: ['caption', 'image'],
    num_rows: 26629
})


<torch.autograd.anomaly_mode.set_detect_anomaly at 0x792b25baf6d0>

In [9]:
# Function to convert an image from a URL to a PyTorch Tensor
def url_to_tensor(url):

    img = Image.open(requests.get(url, stream=True).raw)

    # Define a transform to convert the image to tensor
    transform = transforms.Compose([
        transforms.Resize((768, 768)),  # Resize the image
        transforms.ToTensor()           # Convert the image to a PyTorch Tensor
    ])

    # Apply the transform to the image
    img_tensor = transform(img)

    return img_tensor

# Get the first image in the training dataset
img_tensor = url_to_tensor(train_dataset[0]['image'])

# Print the shape of the image tensor
print(img_tensor.shape)

torch.Size([3, 768, 768])


In [10]:
def show_tensor_image(pic):

    # Convert the tensor to a PIL Image
    if (isinstance(pic, torch.Tensor)):
        pil_image = transforms.ToPILImage()(pic)

    else:
        pil_image = pic

    # Display the image
    plt.imshow(pil_image)
    plt.axis('off')  # Turn off axis numbers
    plt.show()

def forward_noise(image, a, s, num_diffusion_steps):

    if (not isinstance(image, torch.Tensor)):
        transform = transforms.ToTensor()
        image = transform(image)

    for i in range(num_diffusion_steps):
        noise = np.random.normal(size=image.shape)  # noise mask
        image = image * a + noise * s

    return image

In [None]:
for epoch in range(100):
    for batch in train_dataset:

        image = url_to_tensor(batch["image"])
        caption = batch["caption"]

        # Creating labels for real and fake images
        valid = torch.ones(image.size(0), 1)
        fake = torch.zeros(image.size(0), 1)
        real_image = image

        # Displaying the base image and its caption
        print(f"Epoch {epoch}, Batch Caption: {caption}")
        #show_tensor_image(real_image)

        # Forward diffusion step
        print("Performing forward diffusion...")
        diffused_image = forward_noise(image, a=0.5, s=0.5, num_diffusion_steps=50)

        # Transforming images for distillation loss calculation
        transform = transforms.ToTensor()

        # Generating student image
        print("Generating student image...")
        student_image = student_model(caption, image=diffused_image, height=768, width=768, num_inference_steps=4).images[0]
        student_image_tensor = transform(student_image)
        #show_tensor_image(student_image_tensor)

        # Forward diffusion on student image
        print("Forward diffusion on student image...")
        new_diffused_image = forward_noise(student_image, a=0.5, s=0.5, num_diffusion_steps=50)

        # Generating teacher image
        print("Generating teacher image...")
        teacher_image = teacher_model(caption, image=new_diffused_image, height=768, width=768, num_inference_steps=100).images[0]
        teacher_image_tensor = transform(teacher_image)
        #show_tensor_image(teacher_image_tensor)

        # Discriminator predictions and loss for real image
        real_image_tensor = real_image.unsqueeze(0).type(torch.FloatTensor).to(device)
        #print(real_image_tensor.type())
        #print(real_image_tensor.shape)
        real_pred = discriminator_model(real_image_tensor)

        print(f"real pred = {real_pred}")
        print(f"real_pred[0][0][0] = {real_pred[0][0][0]}]")


        discriminator_real_loss = adversarial_loss_function(torch.tensor(real_pred[0][0][0][0]).to(device), torch.tensor(1.).to(device))

        print(f"Discriminator real Loss: {discriminator_real_loss}")

        # print(student_image.type())
        print(real_image.type())

        # Discriminator predictions and loss for student image
        student_image_tensor = transform(student_image).unsqueeze(0).type(torch.FloatTensor).to(device)
        #print(student_image_tensor.type())
        #print(student_image_tensor.shape)
        student_pred = discriminator_model(student_image_tensor)

        print(f"student pred = {student_pred}")
        print(f"student_pred[0][0][0] = {student_pred[0][0][0]}")

        discriminator_fake_loss = adversarial_loss_function(torch.tensor(student_pred[0][0][0][0]).to(device), torch.tensor(0.).to(device))

        #student_loss = adversarial_loss_function(student_pred[0][0], torch.tensor(1.))
        student_loss = adversarial_loss_function(torch.tensor(student_pred[0][0][0][0]).to(device), torch.tensor(1.).to(device))
        student_loss.requires_grad = True

        print(f"Discriminator Fake Loss: {discriminator_fake_loss}")
        print(f"Student Loss: {student_loss}")

        # Calculating adversarial loss
        adversarial_loss = (discriminator_real_loss + discriminator_fake_loss)/2
        adversarial_loss.requires_grad = True
        print(f"Adversarial Loss: {adversarial_loss}")

        # Computing distillation loss
        distillation_loss = distillation_loss_function(student_image_tensor.squeeze(0).to(device), teacher_image_tensor.to(device))
        distillation_loss.requires_grad = True
        print(f"Distillation Loss: {distillation_loss}")

        # Backpropagation for student model
        student_optimizer.zero_grad()
        student_loss.backward(retain_graph=True) # Retain graph because student model also needs to backpropagate in this iteration
        distillation_loss.backward()
        student_optimizer.step()

        # Backpropagation for discriminator model
        discriminator_model_optimizer.zero_grad()
        adversarial_loss.backward()
        discriminator_model_optimizer.step()

        print("End of batch processing.\n")


Epoch 0, Batch Caption: The bottom half of a tennis player holding a racket.
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[1.1361e-11]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([1.1361e-11], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 25.200815200805664
torch.FloatTensor
student pred = tensor([[[[6.0254e-21]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)


  discriminator_real_loss = adversarial_loss_function(torch.tensor(real_pred[0][0][0][0]).to(device), torch.tensor(1.).to(device))


student_pred[0][0][0] = tensor([6.0254e-21], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 6.025395627647577e-21
Student Loss: 46.55830383300781
Adversarial Loss: 12.600407600402832
Distillation Loss: 0.8161792159080505
End of batch processing.



  discriminator_fake_loss = adversarial_loss_function(torch.tensor(student_pred[0][0][0][0]).to(device), torch.tensor(0.).to(device))
  student_loss = adversarial_loss_function(torch.tensor(student_pred[0][0][0][0]).to(device), torch.tensor(1.).to(device))


Epoch 0, Batch Caption: A surfer prepares for a wave in the middle of the ocean.  
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[0.0003]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([0.0003], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 8.203312873840332
torch.FloatTensor
student pred = tensor([[[[1.]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([1.], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 100.0
Student Loss: 0.0
Adversarial Loss: 54.10165786743164
Distillation Loss: 0.8884787559509277
End of batch processing.

Epoch 0, Batch Caption: A man is walking across the street with a black dog and several white sheep.
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[1.0000]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([1.0000], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 2.2650021492154337e-05
torch.FloatTensor
student pred = tensor([[[[2.8528e-12]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([2.8528e-12], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 2.8527737897660055e-12
Student Loss: 26.58272933959961
Adversarial Loss: 1.1325012565066572e-05
Distillation Loss: 0.8928778171539307
End of batch processing.

Epoch 0, Batch Caption: Two people sit on the beach with surfboards at their sides.
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[1.]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([1.], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 0.0
torch.FloatTensor
student pred = tensor([[[[8.7251e-16]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([8.7251e-16], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 8.725050279232501e-16
Student Loss: 34.67516326904297
Adversarial Loss: 4.3625251396162503e-16
Distillation Loss: 0.8468051552772522
End of batch processing.

Epoch 0, Batch Caption: man holding an object in his hand that goes to his mouth
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[0.1056]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([0.1056], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 2.2485599517822266
torch.FloatTensor
student pred = tensor([[[[1.7233e-15]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([1.7233e-15], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 1.723342423660143e-15
Student Loss: 33.994510650634766
Adversarial Loss: 1.1242799758911133
Distillation Loss: 0.8101314902305603
End of batch processing.

Epoch 0, Batch Caption: Some people stand near several motorcycles outside a building.
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[6.8339e-05]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([6.8339e-05], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 9.591033935546875
torch.FloatTensor
student pred = tensor([[[[1.]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([1.], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 100.0
Student Loss: 0.0
Adversarial Loss: 54.79551696777344
Distillation Loss: 0.8859550952911377
End of batch processing.

Epoch 0, Batch Caption: The LG cell phone shows a date of January 27, 2010.
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[7.6205e-10]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([7.6205e-10], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 20.99500274658203
torch.FloatTensor
student pred = tensor([[[[5.2922e-09]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([5.2922e-09], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 5.29220089973137e-09
Student Loss: 19.057031631469727
Adversarial Loss: 10.497501373291016
Distillation Loss: 0.80591881275177
End of batch processing.

Epoch 0, Batch Caption: A white refrigerator next to a counter top.
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[5.9031e-29]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([5.9031e-29], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 64.99949645996094
torch.FloatTensor
student pred = tensor([[[[0.8161]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([0.8161], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 1.6935193538665771
Student Loss: 0.20318317413330078
Adversarial Loss: 33.34650802612305
Distillation Loss: 0.7887073159217834
End of batch processing.

Epoch 0, Batch Caption: A kitchen with a center island and light wood cabinets.
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[1.9779e-08]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([1.9779e-08], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 17.738628387451172
torch.FloatTensor
student pred = tensor([[[[3.6885e-25]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([3.6885e-25], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 3.688487288341266e-25
Student Loss: 56.2594108581543
Adversarial Loss: 8.869314193725586
Distillation Loss: 0.7318432331085205
End of batch processing.

Epoch 0, Batch Caption: Two men sitting at desk holding laptops and wires.
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[1.1318e-09]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([1.1318e-09], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 20.599464416503906
torch.FloatTensor
student pred = tensor([[[[1.]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([1.], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 100.0
Student Loss: 0.0
Adversarial Loss: 60.29973220825195
Distillation Loss: 0.8077545166015625
End of batch processing.

Epoch 0, Batch Caption: It looks like the horses are hugging each other.
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[0.0005]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([0.0005], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 7.6888813972473145
torch.FloatTensor
student pred = tensor([[[[6.1243e-26]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([6.1243e-26], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 6.12430623024377e-26
Student Loss: 58.05494689941406
Adversarial Loss: 3.8444406986236572
Distillation Loss: 0.7764391899108887
End of batch processing.

Epoch 0, Batch Caption: a man dressed in riot gear wearing a face mask and holding a red and white umbrella 
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[1.]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([1.], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 0.0
torch.FloatTensor
student pred = tensor([[[[0.4511]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([0.4511], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 0.5998616218566895
Student Loss: 0.7960386872291565
Adversarial Loss: 0.2999308109283447
Distillation Loss: 1.176058053970337
End of batch processing.

Epoch 0, Batch Caption: A man actively batting with the blur of the bat captured.
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[1.]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([1.], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 0.0
torch.FloatTensor
student pred = tensor([[[[0.9590]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([0.9590], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 3.192969799041748
Student Loss: 0.04191611334681511
Adversarial Loss: 1.596484899520874
Distillation Loss: 0.6948183178901672
End of batch processing.

Epoch 0, Batch Caption: A person standing on the curb of a city street at night
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[2.0438e-31]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([2.0438e-31], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 70.66532135009766
torch.FloatTensor
student pred = tensor([[[[1.2943e-25]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([1.2943e-25], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 1.2942919758051663e-25
Student Loss: 57.306663513183594
Adversarial Loss: 35.33266067504883
Distillation Loss: 0.7398384809494019
End of batch processing.

Epoch 0, Batch Caption: A man standing holding a skateboard vertical in one hand.
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[2.5130e-10]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([2.5130e-10], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 22.104389190673828
torch.FloatTensor
student pred = tensor([[[[0.0031]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([0.0031], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 0.003061062889173627
Student Loss: 5.790523052215576
Adversarial Loss: 11.053725242614746
Distillation Loss: 0.7982277870178223
End of batch processing.

Epoch 0, Batch Caption: A person relaxes with a laptop on a balcony overlooking a street.
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[2.9342e-11]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([2.9342e-11], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 24.252002716064453
torch.FloatTensor
student pred = tensor([[[[1.]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([1.], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 100.0
Student Loss: 0.0
Adversarial Loss: 62.125999450683594
Distillation Loss: 0.874968409538269
End of batch processing.

Epoch 0, Batch Caption: A herd of cows grazing in a field with power lines in background.
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[0.9999]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([0.9999], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 0.00012088552466593683
torch.FloatTensor
student pred = tensor([[[[0.9990]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([0.9990], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 6.900759220123291
Student Loss: 0.0010075278114527464
Adversarial Loss: 3.4504401683807373
Distillation Loss: 0.6956738829612732
End of batch processing.

Epoch 0, Batch Caption: someone laying under a sheet on a bed watching television 
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[1.0000]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([1.0000], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 3.290230597485788e-05
torch.FloatTensor
student pred = tensor([[[[0.9971]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([0.9971], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 5.838715553283691
Student Loss: 0.002916830824688077
Adversarial Loss: 2.9193742275238037
Distillation Loss: 0.9276941418647766
End of batch processing.

Epoch 0, Batch Caption: A camera is sitting on a park bench.
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[5.0941e-15]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([5.0941e-15], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 32.91068649291992
torch.FloatTensor
student pred = tensor([[[[2.4351e-17]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([2.4351e-17], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 2.435079268088102e-17
Student Loss: 38.25396728515625
Adversarial Loss: 16.45534324645996
Distillation Loss: 0.7452629804611206
End of batch processing.

Epoch 0, Batch Caption: There is a tree covered mountain behind the river.
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]

real pred = tensor([[[[1.4784e-15]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
real_pred[0][0][0] = tensor([1.4784e-15], device='cuda:0', grad_fn=<SelectBackward0>)]
Discriminator real Loss: 34.14784240722656
torch.FloatTensor
student pred = tensor([[[[1.1055e-15]]]], device='cuda:0', grad_fn=<SigmoidBackward0>)
student_pred[0][0][0] = tensor([1.1055e-15], device='cuda:0', grad_fn=<SelectBackward0>)
Discriminator Fake Loss: 1.1054685228130997e-15
Student Loss: 34.438507080078125
Adversarial Loss: 17.07392120361328
Distillation Loss: 0.9687419533729553
End of batch processing.

Epoch 0, Batch Caption: A black and gold clock sits against an old building.
Performing forward diffusion...
Generating student image...


  0%|          | 0/4 [00:00<?, ?it/s]

Forward diffusion on student image...
Generating teacher image...


  0%|          | 0/100 [00:00<?, ?it/s]