In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
import torch.optim as optim
from tqdm import tqdm
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [28]:
class VAE(nn.Module):
    def __init__(self, input_dim, hidden_dim = 256, z_dim = 20):
        super(VAE, self).__init__()
        self.img2hid = nn.Linear(input_dim, hidden_dim)
        self.hid2mu = nn.Linear(hidden_dim, z_dim)
        self.hid2std = nn.Linear(hidden_dim, z_dim)
        
        self.z2hid = nn.Linear(z_dim, hidden_dim)
        self.hid2img = nn.Linear(hidden_dim, input_dim)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        
    def Encode(self, x):
        enc_hidden = self.relu(self.img2hid(x))
        mu, std = self.hid2mu(enc_hidden), self.hid2std(enc_hidden)
        return mu, std
        
    def Decode(self, z):
        dec_hidden = self.relu(self.z2hid(z))
        return self.sigmoid(self.hid2img(dec_hidden))

    def forward(self, x):
        mu, std = self.Encode(x)
        epsilon = torch.rand_like(mu)
        reparametrized_z = mu + std * epsilon  #z is actual latent space representation
        output = self.Decode(reparametrized_z)
        return output, mu, std
        

In [29]:
x = torch.randn(4, 28 * 28)
vae = VAE(input_dim = 784, hidden_dim = 256, z_dim = 20)
out, _, _ = vae(x)
out.shape

torch.Size([4, 784])

In [30]:
input_dim = 784
h_dim = 256
z_dim = 20
batch_size = 32
lr = 3e-4
epochs = 10

In [37]:
dataset = datasets.MNIST(root = '/dataset', train = True, transform = transforms.ToTensor(), download = True)
train_loader = DataLoader(dataset = dataset, batch_size = batch_size, shuffle = True)
model = VAE(input_dim = input_dim, hidden_dim = h_dim, z_dim = z_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr = lr)
loss_fn = nn.BCELoss(reduction = 'sum')


In [38]:
for epoch in range(epochs):
    loop =  tqdm(enumerate(train_loader))
    print(epoch)
    for batch_idx, (x, _) in loop:
        x = x.to(device).view(x.shape[0], input_dim)
        x_reconstructed, mu, std = model(x)
        x_reconstruction_loss = loss_fn(x_reconstructed, x)
        kl_div_loss = -torch.sum(1 + torch.log(std.pow(2)) - mu.pow(2) - std.pow(2))
        loss = x_reconstruction_loss + kl_div_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loop.set_postfix(loss = loss.item())
        
        if batch_idx % 50 == 0:
            print(loss)
        

5it [00:00, 49.23it/s, loss=1.92e+4]

0
tensor(21014.7988, device='cuda:0', grad_fn=<AddBackward0>)


57it [00:00, 69.19it/s, loss=8.38e+3]

tensor(9809.7783, device='cuda:0', grad_fn=<AddBackward0>)


109it [00:01, 79.95it/s, loss=7.7e+3] 

tensor(7767.0356, device='cuda:0', grad_fn=<AddBackward0>)


165it [00:02, 89.92it/s, loss=7.27e+3]

tensor(7452.3320, device='cuda:0', grad_fn=<AddBackward0>)


213it [00:02, 87.13it/s, loss=6.97e+3]

tensor(6867.6660, device='cuda:0', grad_fn=<AddBackward0>)


263it [00:03, 93.94it/s, loss=6.68e+3]

tensor(6594.2939, device='cuda:0', grad_fn=<AddBackward0>)


313it [00:03, 96.15it/s, loss=6.07e+3]

tensor(6356.0215, device='cuda:0', grad_fn=<AddBackward0>)


367it [00:04, 98.60it/s, loss=6.21e+3]

tensor(6294.4771, device='cuda:0', grad_fn=<AddBackward0>)


407it [00:04, 90.18it/s, loss=5.78e+3]

tensor(6000.0117, device='cuda:0', grad_fn=<AddBackward0>)


458it [00:05, 94.19it/s, loss=5.74e+3]

tensor(5363.6211, device='cuda:0', grad_fn=<AddBackward0>)


509it [00:05, 94.27it/s, loss=5.63e+3]

tensor(6074.1699, device='cuda:0', grad_fn=<AddBackward0>)


562it [00:06, 101.09it/s, loss=5.52e+3]

tensor(5356.7305, device='cuda:0', grad_fn=<AddBackward0>)


616it [00:06, 99.13it/s, loss=5.17e+3] 

tensor(5239.0737, device='cuda:0', grad_fn=<AddBackward0>)


666it [00:07, 96.79it/s, loss=5.28e+3]

tensor(5329.2646, device='cuda:0', grad_fn=<AddBackward0>)


719it [00:07, 98.58it/s, loss=4.94e+3]

tensor(4945.9028, device='cuda:0', grad_fn=<AddBackward0>)


763it [00:08, 101.35it/s, loss=5.2e+3] 

tensor(5191.0664, device='cuda:0', grad_fn=<AddBackward0>)


818it [00:08, 102.68it/s, loss=4.65e+3]

tensor(5049.6440, device='cuda:0', grad_fn=<AddBackward0>)


862it [00:09, 100.27it/s, loss=4.74e+3]

tensor(4746.2563, device='cuda:0', grad_fn=<AddBackward0>)


917it [00:09, 101.88it/s, loss=4.35e+3]

tensor(4923.4741, device='cuda:0', grad_fn=<AddBackward0>)


961it [00:10, 102.72it/s, loss=4.79e+3]

tensor(4704.5825, device='cuda:0', grad_fn=<AddBackward0>)


1015it [00:10, 99.23it/s, loss=4.72e+3]

tensor(4691.7583, device='cuda:0', grad_fn=<AddBackward0>)


1065it [00:11, 89.87it/s, loss=4.11e+3]

tensor(4162.6743, device='cuda:0', grad_fn=<AddBackward0>)


1119it [00:11, 99.64it/s, loss=4.69e+3]

tensor(4588.2256, device='cuda:0', grad_fn=<AddBackward0>)


1163it [00:12, 102.65it/s, loss=4.73e+3]

tensor(4680.7236, device='cuda:0', grad_fn=<AddBackward0>)


1218it [00:12, 101.15it/s, loss=4.34e+3]

tensor(4397.9014, device='cuda:0', grad_fn=<AddBackward0>)


1262it [00:13, 100.33it/s, loss=4.24e+3]

tensor(3982.9968, device='cuda:0', grad_fn=<AddBackward0>)


1313it [00:13, 95.40it/s, loss=4.46e+3] 

tensor(4320.0444, device='cuda:0', grad_fn=<AddBackward0>)


1363it [00:14, 96.05it/s, loss=4.36e+3]

tensor(4130.7710, device='cuda:0', grad_fn=<AddBackward0>)


1412it [00:15, 89.29it/s, loss=4.33e+3]

tensor(4369.8706, device='cuda:0', grad_fn=<AddBackward0>)


1463it [00:15, 95.95it/s, loss=4.34e+3]

tensor(4318.1685, device='cuda:0', grad_fn=<AddBackward0>)


1513it [00:16, 93.93it/s, loss=3.97e+3]

tensor(4257.5679, device='cuda:0', grad_fn=<AddBackward0>)


1568it [00:16, 99.63it/s, loss=4.22e+3] 

tensor(4173.3018, device='cuda:0', grad_fn=<AddBackward0>)


1609it [00:17, 96.01it/s, loss=4.07e+3]

tensor(4365.9893, device='cuda:0', grad_fn=<AddBackward0>)


1660it [00:17, 89.42it/s, loss=4.06e+3]

tensor(4487.4966, device='cuda:0', grad_fn=<AddBackward0>)


1711it [00:18, 96.11it/s, loss=4.05e+3]

tensor(4239.5210, device='cuda:0', grad_fn=<AddBackward0>)


1761it [00:18, 92.55it/s, loss=4.07e+3]

tensor(4147.0894, device='cuda:0', grad_fn=<AddBackward0>)


1812it [00:19, 96.88it/s, loss=4.38e+3]

tensor(4031.5259, device='cuda:0', grad_fn=<AddBackward0>)


1865it [00:19, 98.94it/s, loss=4.1e+3] 

tensor(3822.9888, device='cuda:0', grad_fn=<AddBackward0>)


1875it [00:19, 94.85it/s, loss=4.26e+3]
9it [00:00, 86.78it/s, loss=4.17e+3]

1
tensor(4281.7148, device='cuda:0', grad_fn=<AddBackward0>)


60it [00:00, 95.05it/s, loss=4.06e+3]

tensor(4043.3015, device='cuda:0', grad_fn=<AddBackward0>)


112it [00:01, 98.36it/s, loss=4.46e+3]

tensor(4143.4429, device='cuda:0', grad_fn=<AddBackward0>)


165it [00:01, 99.43it/s, loss=4.06e+3] 

tensor(4097.5713, device='cuda:0', grad_fn=<AddBackward0>)


217it [00:02, 98.21it/s, loss=3.87e+3]

tensor(3820.7651, device='cuda:0', grad_fn=<AddBackward0>)


259it [00:02, 99.92it/s, loss=4.1e+3]  

tensor(4323.0420, device='cuda:0', grad_fn=<AddBackward0>)


314it [00:03, 99.56it/s, loss=4.01e+3] 

tensor(3824.1174, device='cuda:0', grad_fn=<AddBackward0>)


367it [00:03, 94.53it/s, loss=3.85e+3] 

tensor(3947.6719, device='cuda:0', grad_fn=<AddBackward0>)


410it [00:04, 99.24it/s, loss=4.19e+3]

tensor(3727.6560, device='cuda:0', grad_fn=<AddBackward0>)


464it [00:04, 99.40it/s, loss=3.8e+3]  

tensor(3802.7024, device='cuda:0', grad_fn=<AddBackward0>)


516it [00:05, 97.88it/s, loss=3.76e+3]

tensor(4095.6685, device='cuda:0', grad_fn=<AddBackward0>)


559it [00:05, 99.79it/s, loss=3.81e+3] 

tensor(4151.2305, device='cuda:0', grad_fn=<AddBackward0>)


612it [00:06, 98.34it/s, loss=3.98e+3] 

tensor(4018.4460, device='cuda:0', grad_fn=<AddBackward0>)


663it [00:06, 93.64it/s, loss=3.78e+3]

tensor(3699.1292, device='cuda:0', grad_fn=<AddBackward0>)


715it [00:07, 97.03it/s, loss=3.61e+3]

tensor(3566.5083, device='cuda:0', grad_fn=<AddBackward0>)


759it [00:07, 101.20it/s, loss=3.9e+3] 

tensor(4286.6006, device='cuda:0', grad_fn=<AddBackward0>)


811it [00:08, 88.15it/s, loss=3.96e+3] 

tensor(3809.1418, device='cuda:0', grad_fn=<AddBackward0>)


863it [00:08, 95.46it/s, loss=3.65e+3]

tensor(3851.0715, device='cuda:0', grad_fn=<AddBackward0>)


919it [00:09, 105.81it/s, loss=3.56e+3]

tensor(3662.2725, device='cuda:0', grad_fn=<AddBackward0>)


967it [00:09, 113.71it/s, loss=4.02e+3]

tensor(3673.5964, device='cuda:0', grad_fn=<AddBackward0>)


1015it [00:10, 116.41it/s, loss=3.87e+3]

tensor(3737.8877, device='cuda:0', grad_fn=<AddBackward0>)


1063it [00:10, 113.94it/s, loss=4.11e+3]

tensor(4199.6572, device='cuda:0', grad_fn=<AddBackward0>)


1111it [00:11, 114.08it/s, loss=3.71e+3]

tensor(3539.5115, device='cuda:0', grad_fn=<AddBackward0>)


1173it [00:11, 118.29it/s, loss=3.56e+3]

tensor(3759.7727, device='cuda:0', grad_fn=<AddBackward0>)


1222it [00:12, 118.87it/s, loss=3.63e+3]

tensor(3913.4490, device='cuda:0', grad_fn=<AddBackward0>)


1270it [00:12, 115.41it/s, loss=3.49e+3]

tensor(4017.9761, device='cuda:0', grad_fn=<AddBackward0>)


1319it [00:12, 117.82it/s, loss=3.71e+3]

tensor(3941.9504, device='cuda:0', grad_fn=<AddBackward0>)


1367it [00:13, 115.68it/s, loss=4.04e+3]

tensor(3704.1550, device='cuda:0', grad_fn=<AddBackward0>)


1415it [00:13, 115.96it/s, loss=3.82e+3]

tensor(3608.2222, device='cuda:0', grad_fn=<AddBackward0>)


1463it [00:14, 116.88it/s, loss=3.9e+3] 

tensor(3812.1616, device='cuda:0', grad_fn=<AddBackward0>)


1512it [00:14, 118.64it/s, loss=3.82e+3]

tensor(3942.0552, device='cuda:0', grad_fn=<AddBackward0>)


1572it [00:15, 117.02it/s, loss=3.65e+3]

tensor(3887.5737, device='cuda:0', grad_fn=<AddBackward0>)


1621it [00:15, 118.92it/s, loss=3.27e+3]

tensor(3643.3896, device='cuda:0', grad_fn=<AddBackward0>)


1669it [00:15, 116.94it/s, loss=3.87e+3]

tensor(3932.0564, device='cuda:0', grad_fn=<AddBackward0>)


1717it [00:16, 114.70it/s, loss=3.64e+3]

tensor(3523.8628, device='cuda:0', grad_fn=<AddBackward0>)


1766it [00:16, 117.41it/s, loss=3.26e+3]

tensor(3616.1851, device='cuda:0', grad_fn=<AddBackward0>)


1814it [00:17, 117.44it/s, loss=3.56e+3]

tensor(3508.5371, device='cuda:0', grad_fn=<AddBackward0>)


1862it [00:17, 114.24it/s, loss=3.8e+3] 

tensor(3747.4937, device='cuda:0', grad_fn=<AddBackward0>)


1875it [00:17, 106.26it/s, loss=3.82e+3]
11it [00:00, 107.02it/s, loss=3.4e+3] 

2
tensor(3458.8489, device='cuda:0', grad_fn=<AddBackward0>)


70it [00:00, 109.96it/s, loss=3.71e+3]

tensor(3389.5898, device='cuda:0', grad_fn=<AddBackward0>)


118it [00:01, 115.43it/s, loss=4e+3]   

tensor(3552.1758, device='cuda:0', grad_fn=<AddBackward0>)


166it [00:01, 113.47it/s, loss=3.62e+3]

tensor(3366.2117, device='cuda:0', grad_fn=<AddBackward0>)


214it [00:01, 115.01it/s, loss=3.94e+3]

tensor(3585.1489, device='cuda:0', grad_fn=<AddBackward0>)


262it [00:02, 115.69it/s, loss=3.63e+3]

tensor(3681.5122, device='cuda:0', grad_fn=<AddBackward0>)


310it [00:02, 117.24it/s, loss=3.47e+3]

tensor(3461.3633, device='cuda:0', grad_fn=<AddBackward0>)


371it [00:03, 116.78it/s, loss=3.48e+3]

tensor(3516.6953, device='cuda:0', grad_fn=<AddBackward0>)


419it [00:03, 114.21it/s, loss=3.37e+3]

tensor(3522.3928, device='cuda:0', grad_fn=<AddBackward0>)


467it [00:04, 114.00it/s, loss=3.15e+3]

tensor(3622.9731, device='cuda:0', grad_fn=<AddBackward0>)


515it [00:04, 117.44it/s, loss=3.74e+3]

tensor(3299.8750, device='cuda:0', grad_fn=<AddBackward0>)


563it [00:04, 115.95it/s, loss=3.63e+3]

tensor(3923.6396, device='cuda:0', grad_fn=<AddBackward0>)


611it [00:05, 115.27it/s, loss=3.48e+3]

tensor(3265.6860, device='cuda:0', grad_fn=<AddBackward0>)


661it [00:05, 118.95it/s, loss=3.59e+3]

tensor(3406.0588, device='cuda:0', grad_fn=<AddBackward0>)


721it [00:06, 117.13it/s, loss=3.53e+3]

tensor(3755.6433, device='cuda:0', grad_fn=<AddBackward0>)


769it [00:06, 115.58it/s, loss=3.04e+3]

tensor(3521.6050, device='cuda:0', grad_fn=<AddBackward0>)


817it [00:07, 117.47it/s, loss=3.51e+3]

tensor(3225.8193, device='cuda:0', grad_fn=<AddBackward0>)


865it [00:07, 117.28it/s, loss=3.46e+3]

tensor(3489.2397, device='cuda:0', grad_fn=<AddBackward0>)


913it [00:07, 117.06it/s, loss=3.41e+3]

tensor(3392.7739, device='cuda:0', grad_fn=<AddBackward0>)


961it [00:08, 117.10it/s, loss=3.42e+3]

tensor(2955.5742, device='cuda:0', grad_fn=<AddBackward0>)


1021it [00:08, 113.67it/s, loss=3.59e+3]

tensor(3563.5872, device='cuda:0', grad_fn=<AddBackward0>)


1069it [00:09, 113.02it/s, loss=3.34e+3]

tensor(3423.8809, device='cuda:0', grad_fn=<AddBackward0>)


1117it [00:09, 110.89it/s, loss=3.44e+3]

tensor(3754.4548, device='cuda:0', grad_fn=<AddBackward0>)


1165it [00:10, 111.21it/s, loss=3.3e+3] 

tensor(3364.1357, device='cuda:0', grad_fn=<AddBackward0>)


1213it [00:10, 111.83it/s, loss=3.31e+3]

tensor(3416.0361, device='cuda:0', grad_fn=<AddBackward0>)


1261it [00:11, 114.01it/s, loss=3.27e+3]

tensor(3391.4395, device='cuda:0', grad_fn=<AddBackward0>)


1321it [00:11, 115.75it/s, loss=3.26e+3]

tensor(3740.1455, device='cuda:0', grad_fn=<AddBackward0>)


1370it [00:11, 115.89it/s, loss=3.61e+3]

tensor(3470.9387, device='cuda:0', grad_fn=<AddBackward0>)


1418it [00:12, 116.26it/s, loss=3.62e+3]

tensor(3465.2380, device='cuda:0', grad_fn=<AddBackward0>)


1467it [00:12, 114.05it/s, loss=3.28e+3]

tensor(3638.6436, device='cuda:0', grad_fn=<AddBackward0>)


1515it [00:13, 116.80it/s, loss=3.19e+3]

tensor(3401.5928, device='cuda:0', grad_fn=<AddBackward0>)


1564it [00:13, 117.02it/s, loss=3.41e+3]

tensor(3493.6179, device='cuda:0', grad_fn=<AddBackward0>)


1613it [00:14, 118.13it/s, loss=3.26e+3]

tensor(3723.5896, device='cuda:0', grad_fn=<AddBackward0>)


1661it [00:14, 115.00it/s, loss=3.35e+3]

tensor(2987.5391, device='cuda:0', grad_fn=<AddBackward0>)


1721it [00:14, 115.47it/s, loss=3.6e+3] 

tensor(3662.5439, device='cuda:0', grad_fn=<AddBackward0>)


1771it [00:15, 118.67it/s, loss=3.6e+3] 

tensor(3482.5188, device='cuda:0', grad_fn=<AddBackward0>)


1819it [00:15, 113.83it/s, loss=3.41e+3]

tensor(3642.9819, device='cuda:0', grad_fn=<AddBackward0>)


1868it [00:16, 115.79it/s, loss=3.36e+3]

tensor(3353.6123, device='cuda:0', grad_fn=<AddBackward0>)


1875it [00:16, 115.09it/s, loss=3.52e+3]
11it [00:00, 102.11it/s, loss=3.26e+3]

3
tensor(3663.2913, device='cuda:0', grad_fn=<AddBackward0>)


71it [00:00, 116.70it/s, loss=3.25e+3]

tensor(3406.9822, device='cuda:0', grad_fn=<AddBackward0>)


119it [00:01, 114.34it/s, loss=3.5e+3] 

tensor(3767.6677, device='cuda:0', grad_fn=<AddBackward0>)


169it [00:01, 118.27it/s, loss=3.28e+3]

tensor(3463.4016, device='cuda:0', grad_fn=<AddBackward0>)


218it [00:01, 119.89it/s, loss=3.47e+3]

tensor(3369.4973, device='cuda:0', grad_fn=<AddBackward0>)


266it [00:02, 114.45it/s, loss=3.46e+3]

tensor(3813.3735, device='cuda:0', grad_fn=<AddBackward0>)


314it [00:02, 116.93it/s, loss=3.48e+3]

tensor(3327.2378, device='cuda:0', grad_fn=<AddBackward0>)


364it [00:03, 119.34it/s, loss=3.71e+3]

tensor(3534.5955, device='cuda:0', grad_fn=<AddBackward0>)


412it [00:03, 118.81it/s, loss=3.49e+3]

tensor(3558.3049, device='cuda:0', grad_fn=<AddBackward0>)


462it [00:04, 119.16it/s, loss=3.36e+3]

tensor(3189.2300, device='cuda:0', grad_fn=<AddBackward0>)


523it [00:04, 117.62it/s, loss=3.49e+3]

tensor(3156.5955, device='cuda:0', grad_fn=<AddBackward0>)


572it [00:04, 117.96it/s, loss=3.2e+3] 

tensor(3387.1531, device='cuda:0', grad_fn=<AddBackward0>)


621it [00:05, 119.04it/s, loss=3.57e+3]

tensor(3617.7705, device='cuda:0', grad_fn=<AddBackward0>)


669it [00:05, 118.48it/s, loss=3.33e+3]

tensor(3395.0371, device='cuda:0', grad_fn=<AddBackward0>)


718it [00:06, 117.75it/s, loss=3.39e+3]

tensor(3382.9521, device='cuda:0', grad_fn=<AddBackward0>)


766it [00:06, 117.84it/s, loss=3.39e+3]

tensor(3606.2795, device='cuda:0', grad_fn=<AddBackward0>)


816it [00:06, 119.59it/s, loss=3.51e+3]

tensor(3720.4768, device='cuda:0', grad_fn=<AddBackward0>)


864it [00:07, 118.43it/s, loss=3.57e+3]

tensor(3471.1663, device='cuda:0', grad_fn=<AddBackward0>)


912it [00:07, 114.70it/s, loss=3.18e+3]

tensor(3538.9692, device='cuda:0', grad_fn=<AddBackward0>)


960it [00:08, 111.56it/s, loss=3.48e+3]

tensor(3314.1899, device='cuda:0', grad_fn=<AddBackward0>)


1019it [00:08, 109.75it/s, loss=3.43e+3]

tensor(3603.9893, device='cuda:0', grad_fn=<AddBackward0>)


1068it [00:09, 114.21it/s, loss=3.41e+3]

tensor(3603.2627, device='cuda:0', grad_fn=<AddBackward0>)


1116it [00:09, 117.02it/s, loss=3.3e+3] 

tensor(3260.3125, device='cuda:0', grad_fn=<AddBackward0>)


1166it [00:10, 118.53it/s, loss=3.01e+3]

tensor(3288.7109, device='cuda:0', grad_fn=<AddBackward0>)


1215it [00:10, 116.80it/s, loss=3.13e+3]

tensor(3304.4812, device='cuda:0', grad_fn=<AddBackward0>)


1264it [00:10, 118.07it/s, loss=3.01e+3]

tensor(3709.9307, device='cuda:0', grad_fn=<AddBackward0>)


1313it [00:11, 118.20it/s, loss=3.33e+3]

tensor(3493.5188, device='cuda:0', grad_fn=<AddBackward0>)


1362it [00:11, 118.48it/s, loss=3.7e+3] 

tensor(3355.1179, device='cuda:0', grad_fn=<AddBackward0>)


1423it [00:12, 119.05it/s, loss=3.19e+3]

tensor(3290.7659, device='cuda:0', grad_fn=<AddBackward0>)


1472it [00:12, 115.13it/s, loss=3.31e+3]

tensor(3123.5618, device='cuda:0', grad_fn=<AddBackward0>)


1521it [00:13, 118.07it/s, loss=3.47e+3]

tensor(3507.4854, device='cuda:0', grad_fn=<AddBackward0>)


1569it [00:13, 117.78it/s, loss=3.41e+3]

tensor(3455.9365, device='cuda:0', grad_fn=<AddBackward0>)


1618it [00:13, 118.77it/s, loss=3.24e+3]

tensor(3209.9556, device='cuda:0', grad_fn=<AddBackward0>)


1666it [00:14, 117.24it/s, loss=3.51e+3]

tensor(3100.5808, device='cuda:0', grad_fn=<AddBackward0>)


1714it [00:14, 118.35it/s, loss=3.28e+3]

tensor(3194.1843, device='cuda:0', grad_fn=<AddBackward0>)


1764it [00:15, 119.34it/s, loss=3.38e+3]

tensor(3188.7734, device='cuda:0', grad_fn=<AddBackward0>)


1813it [00:15, 119.62it/s, loss=3.38e+3]

tensor(3683.2832, device='cuda:0', grad_fn=<AddBackward0>)


1862it [00:15, 116.93it/s, loss=3.73e+3]

tensor(3452.5044, device='cuda:0', grad_fn=<AddBackward0>)


1875it [00:15, 117.26it/s, loss=3.43e+3]
11it [00:00, 106.76it/s, loss=3.48e+3]

4
tensor(3592.1519, device='cuda:0', grad_fn=<AddBackward0>)


61it [00:00, 117.58it/s, loss=3.4e+3] 

tensor(3816.3198, device='cuda:0', grad_fn=<AddBackward0>)


122it [00:01, 118.38it/s, loss=3.07e+3]

tensor(3442.2566, device='cuda:0', grad_fn=<AddBackward0>)


158it [00:01, 113.60it/s, loss=3.13e+3]

tensor(3370.1328, device='cuda:0', grad_fn=<AddBackward0>)


217it [00:01, 112.54it/s, loss=3.4e+3] 

tensor(3376.0100, device='cuda:0', grad_fn=<AddBackward0>)


265it [00:02, 107.36it/s, loss=3.16e+3]

tensor(3257.2053, device='cuda:0', grad_fn=<AddBackward0>)


313it [00:02, 113.21it/s, loss=3.18e+3]

tensor(3329.9951, device='cuda:0', grad_fn=<AddBackward0>)


361it [00:03, 114.05it/s, loss=3.4e+3] 

tensor(3383.9578, device='cuda:0', grad_fn=<AddBackward0>)


421it [00:03, 114.92it/s, loss=3.33e+3]

tensor(3369.2603, device='cuda:0', grad_fn=<AddBackward0>)


469it [00:04, 116.02it/s, loss=3.3e+3] 

tensor(3208.6982, device='cuda:0', grad_fn=<AddBackward0>)


517it [00:04, 116.16it/s, loss=3.22e+3]

tensor(3306.1321, device='cuda:0', grad_fn=<AddBackward0>)


565it [00:05, 116.30it/s, loss=3.34e+3]

tensor(3091.8779, device='cuda:0', grad_fn=<AddBackward0>)


613it [00:05, 115.81it/s, loss=3.36e+3]

tensor(3348.6812, device='cuda:0', grad_fn=<AddBackward0>)


661it [00:05, 115.04it/s, loss=3.37e+3]

tensor(3377.8203, device='cuda:0', grad_fn=<AddBackward0>)


710it [00:06, 115.46it/s, loss=3.38e+3]

tensor(3489.6299, device='cuda:0', grad_fn=<AddBackward0>)


770it [00:06, 112.93it/s, loss=3.1e+3] 

tensor(3442.6560, device='cuda:0', grad_fn=<AddBackward0>)


818it [00:07, 116.41it/s, loss=3.75e+3]

tensor(3289.5239, device='cuda:0', grad_fn=<AddBackward0>)


866it [00:07, 114.55it/s, loss=3.31e+3]

tensor(3420.3867, device='cuda:0', grad_fn=<AddBackward0>)


914it [00:08, 114.94it/s, loss=3.15e+3]

tensor(3331.0166, device='cuda:0', grad_fn=<AddBackward0>)


964it [00:08, 118.05it/s, loss=3.42e+3]

tensor(3468.1035, device='cuda:0', grad_fn=<AddBackward0>)


1012it [00:08, 118.47it/s, loss=3.27e+3]

tensor(3387.3962, device='cuda:0', grad_fn=<AddBackward0>)


1060it [00:09, 116.37it/s, loss=3.42e+3]

tensor(3226.9473, device='cuda:0', grad_fn=<AddBackward0>)


1122it [00:09, 117.68it/s, loss=3.58e+3]

tensor(3257.5281, device='cuda:0', grad_fn=<AddBackward0>)


1171it [00:10, 117.56it/s, loss=3.28e+3]

tensor(2946.6675, device='cuda:0', grad_fn=<AddBackward0>)


1220it [00:10, 118.54it/s, loss=3.24e+3]

tensor(3247.5833, device='cuda:0', grad_fn=<AddBackward0>)


1269it [00:11, 117.48it/s, loss=3.24e+3]

tensor(3230.8242, device='cuda:0', grad_fn=<AddBackward0>)


1319it [00:11, 117.31it/s, loss=3.38e+3]

tensor(3386.5068, device='cuda:0', grad_fn=<AddBackward0>)


1367it [00:11, 118.59it/s, loss=3.5e+3] 

tensor(3510.5769, device='cuda:0', grad_fn=<AddBackward0>)


1415it [00:12, 117.79it/s, loss=3.21e+3]

tensor(3486.3120, device='cuda:0', grad_fn=<AddBackward0>)


1463it [00:12, 114.98it/s, loss=3.2e+3] 

tensor(3319.3354, device='cuda:0', grad_fn=<AddBackward0>)


1512it [00:13, 118.29it/s, loss=3.26e+3]

tensor(3156.0151, device='cuda:0', grad_fn=<AddBackward0>)


1572it [00:13, 117.92it/s, loss=3.28e+3]

tensor(3139.9573, device='cuda:0', grad_fn=<AddBackward0>)


1621it [00:14, 117.39it/s, loss=3.26e+3]

tensor(3452.9417, device='cuda:0', grad_fn=<AddBackward0>)


1670it [00:14, 118.09it/s, loss=3.26e+3]

tensor(3153.2166, device='cuda:0', grad_fn=<AddBackward0>)


1718it [00:14, 116.95it/s, loss=3.46e+3]

tensor(3462.9795, device='cuda:0', grad_fn=<AddBackward0>)


1766it [00:15, 114.83it/s, loss=3.29e+3]

tensor(3131.3760, device='cuda:0', grad_fn=<AddBackward0>)


1815it [00:15, 116.41it/s, loss=3.35e+3]

tensor(3476.4778, device='cuda:0', grad_fn=<AddBackward0>)


1863it [00:16, 116.02it/s, loss=3.27e+3]

tensor(3218.5400, device='cuda:0', grad_fn=<AddBackward0>)


1875it [00:16, 115.99it/s, loss=3.2e+3] 
12it [00:00, 112.66it/s, loss=3.43e+3]

5
tensor(3351.6987, device='cuda:0', grad_fn=<AddBackward0>)


61it [00:00, 111.74it/s, loss=3.39e+3]

tensor(3296.3813, device='cuda:0', grad_fn=<AddBackward0>)


110it [00:01, 116.56it/s, loss=3.18e+3]

tensor(3248.9026, device='cuda:0', grad_fn=<AddBackward0>)


170it [00:01, 116.98it/s, loss=3.28e+3]

tensor(3141.0920, device='cuda:0', grad_fn=<AddBackward0>)


218it [00:01, 112.32it/s, loss=3.35e+3]

tensor(3258.7039, device='cuda:0', grad_fn=<AddBackward0>)


266it [00:02, 115.16it/s, loss=3.17e+3]

tensor(3321.6348, device='cuda:0', grad_fn=<AddBackward0>)


314it [00:02, 116.70it/s, loss=2.95e+3]

tensor(3379.5503, device='cuda:0', grad_fn=<AddBackward0>)


364it [00:03, 120.01it/s, loss=2.85e+3]

tensor(3034.2856, device='cuda:0', grad_fn=<AddBackward0>)


415it [00:03, 119.07it/s, loss=3.4e+3] 

tensor(2974.5586, device='cuda:0', grad_fn=<AddBackward0>)


463it [00:04, 117.74it/s, loss=3.48e+3]

tensor(3243.1521, device='cuda:0', grad_fn=<AddBackward0>)


512it [00:04, 118.44it/s, loss=3.28e+3]

tensor(3417.2463, device='cuda:0', grad_fn=<AddBackward0>)


562it [00:04, 119.03it/s, loss=3.22e+3]

tensor(3463.8835, device='cuda:0', grad_fn=<AddBackward0>)


614it [00:05, 120.38it/s, loss=3.68e+3]

tensor(3284.3496, device='cuda:0', grad_fn=<AddBackward0>)


664it [00:05, 118.95it/s, loss=3.43e+3]

tensor(3139.4690, device='cuda:0', grad_fn=<AddBackward0>)


714it [00:06, 118.65it/s, loss=3.15e+3]

tensor(3064.6565, device='cuda:0', grad_fn=<AddBackward0>)


762it [00:06, 116.62it/s, loss=3.32e+3]

tensor(3270.6624, device='cuda:0', grad_fn=<AddBackward0>)


810it [00:07, 118.18it/s, loss=3.36e+3]

tensor(3486.0613, device='cuda:0', grad_fn=<AddBackward0>)


872it [00:07, 120.41it/s, loss=3.3e+3] 

tensor(3264.4500, device='cuda:0', grad_fn=<AddBackward0>)


922it [00:07, 119.97it/s, loss=3.19e+3]

tensor(3227.8513, device='cuda:0', grad_fn=<AddBackward0>)


960it [00:08, 118.32it/s, loss=3.3e+3] 

tensor(3314.0762, device='cuda:0', grad_fn=<AddBackward0>)


1021it [00:08, 119.01it/s, loss=3.45e+3]

tensor(3223.0522, device='cuda:0', grad_fn=<AddBackward0>)


1060it [00:09, 119.75it/s, loss=3.14e+3]

tensor(3340.7224, device='cuda:0', grad_fn=<AddBackward0>)


1122it [00:09, 118.33it/s, loss=3.5e+3] 

tensor(3231.7197, device='cuda:0', grad_fn=<AddBackward0>)


1171it [00:09, 116.97it/s, loss=3.21e+3]

tensor(3321.3706, device='cuda:0', grad_fn=<AddBackward0>)


1220it [00:10, 116.61it/s, loss=3.11e+3]

tensor(3283.2559, device='cuda:0', grad_fn=<AddBackward0>)


1268it [00:10, 115.92it/s, loss=3.18e+3]

tensor(3252.6826, device='cuda:0', grad_fn=<AddBackward0>)


1317it [00:11, 118.76it/s, loss=3.1e+3] 

tensor(3072.5071, device='cuda:0', grad_fn=<AddBackward0>)


1365it [00:11, 117.40it/s, loss=3.33e+3]

tensor(3324.8167, device='cuda:0', grad_fn=<AddBackward0>)


1413it [00:12, 117.13it/s, loss=3e+3]   

tensor(3199.0005, device='cuda:0', grad_fn=<AddBackward0>)


1461it [00:12, 115.23it/s, loss=3.35e+3]

tensor(3373.4912, device='cuda:0', grad_fn=<AddBackward0>)


1522it [00:12, 118.99it/s, loss=3.21e+3]

tensor(3362.1125, device='cuda:0', grad_fn=<AddBackward0>)


1572it [00:13, 119.56it/s, loss=3.29e+3]

tensor(3329.3994, device='cuda:0', grad_fn=<AddBackward0>)


1622it [00:13, 118.29it/s, loss=3.44e+3]

tensor(3346.8027, device='cuda:0', grad_fn=<AddBackward0>)


1671it [00:14, 118.23it/s, loss=3.32e+3]

tensor(3288.5789, device='cuda:0', grad_fn=<AddBackward0>)


1722it [00:14, 119.92it/s, loss=3.12e+3]

tensor(3086.5469, device='cuda:0', grad_fn=<AddBackward0>)


1760it [00:15, 119.65it/s, loss=3.29e+3]

tensor(3324.6677, device='cuda:0', grad_fn=<AddBackward0>)


1811it [00:15, 119.06it/s, loss=3.25e+3]

tensor(3175.9172, device='cuda:0', grad_fn=<AddBackward0>)


1861it [00:15, 119.16it/s, loss=3.24e+3]

tensor(3236.9644, device='cuda:0', grad_fn=<AddBackward0>)


1875it [00:15, 117.85it/s, loss=3.38e+3]
11it [00:00, 107.30it/s, loss=2.94e+3]

6
tensor(3325.4617, device='cuda:0', grad_fn=<AddBackward0>)


61it [00:00, 117.29it/s, loss=3.53e+3]

tensor(3473.3027, device='cuda:0', grad_fn=<AddBackward0>)


122it [00:01, 118.11it/s, loss=3.24e+3]

tensor(3176.0447, device='cuda:0', grad_fn=<AddBackward0>)


171it [00:01, 118.82it/s, loss=3.26e+3]

tensor(3057.0703, device='cuda:0', grad_fn=<AddBackward0>)


220it [00:01, 118.74it/s, loss=3.11e+3]

tensor(3361.3953, device='cuda:0', grad_fn=<AddBackward0>)


270it [00:02, 118.23it/s, loss=2.98e+3]

tensor(3366.6182, device='cuda:0', grad_fn=<AddBackward0>)


319it [00:02, 116.92it/s, loss=3.28e+3]

tensor(3323.0825, device='cuda:0', grad_fn=<AddBackward0>)


368it [00:03, 118.28it/s, loss=3.17e+3]

tensor(3456.4988, device='cuda:0', grad_fn=<AddBackward0>)


416it [00:03, 119.11it/s, loss=3.3e+3] 

tensor(3046.0537, device='cuda:0', grad_fn=<AddBackward0>)


468it [00:04, 119.05it/s, loss=3.21e+3]

tensor(3096.6755, device='cuda:0', grad_fn=<AddBackward0>)


517it [00:04, 117.23it/s, loss=3.28e+3]

tensor(3359.3494, device='cuda:0', grad_fn=<AddBackward0>)


565it [00:04, 114.44it/s, loss=3.16e+3]

tensor(3410.1897, device='cuda:0', grad_fn=<AddBackward0>)


613it [00:05, 116.11it/s, loss=3.18e+3]

tensor(3077.9202, device='cuda:0', grad_fn=<AddBackward0>)


665it [00:05, 119.29it/s, loss=3.27e+3]

tensor(3242.1157, device='cuda:0', grad_fn=<AddBackward0>)


716it [00:06, 118.79it/s, loss=3.15e+3]

tensor(3236.6284, device='cuda:0', grad_fn=<AddBackward0>)


764it [00:06, 118.19it/s, loss=3.28e+3]

tensor(3314.1929, device='cuda:0', grad_fn=<AddBackward0>)


813it [00:06, 117.07it/s, loss=3.54e+3]

tensor(3060.5156, device='cuda:0', grad_fn=<AddBackward0>)


873it [00:07, 117.26it/s, loss=3.11e+3]

tensor(3214.1138, device='cuda:0', grad_fn=<AddBackward0>)


921it [00:07, 116.42it/s, loss=3.11e+3]

tensor(3457.4971, device='cuda:0', grad_fn=<AddBackward0>)


971it [00:08, 117.77it/s, loss=3.04e+3]

tensor(3237.6323, device='cuda:0', grad_fn=<AddBackward0>)


1019it [00:08, 117.40it/s, loss=3.12e+3]

tensor(3313.4663, device='cuda:0', grad_fn=<AddBackward0>)


1067it [00:09, 117.37it/s, loss=2.94e+3]

tensor(3231.2793, device='cuda:0', grad_fn=<AddBackward0>)


1116it [00:09, 118.68it/s, loss=3.33e+3]

tensor(3425.8145, device='cuda:0', grad_fn=<AddBackward0>)


1164it [00:09, 116.81it/s, loss=3.26e+3]

tensor(3251.5166, device='cuda:0', grad_fn=<AddBackward0>)


1213it [00:10, 118.01it/s, loss=3.38e+3]

tensor(3384.4810, device='cuda:0', grad_fn=<AddBackward0>)


1262it [00:10, 117.44it/s, loss=3.34e+3]

tensor(3193.8428, device='cuda:0', grad_fn=<AddBackward0>)


1311it [00:11, 117.42it/s, loss=3.29e+3]

tensor(3201.9856, device='cuda:0', grad_fn=<AddBackward0>)


1361it [00:11, 118.27it/s, loss=3.21e+3]

tensor(2986.2415, device='cuda:0', grad_fn=<AddBackward0>)


1412it [00:12, 119.70it/s, loss=3.39e+3]

tensor(3078.4424, device='cuda:0', grad_fn=<AddBackward0>)


1472it [00:12, 118.98it/s, loss=3.2e+3] 

tensor(3421.7009, device='cuda:0', grad_fn=<AddBackward0>)


1510it [00:12, 119.76it/s, loss=3.03e+3]

tensor(3235.3311, device='cuda:0', grad_fn=<AddBackward0>)


1562it [00:13, 119.44it/s, loss=3.24e+3]

tensor(3019.4048, device='cuda:0', grad_fn=<AddBackward0>)


1622it [00:13, 118.62it/s, loss=3.16e+3]

tensor(3330.3772, device='cuda:0', grad_fn=<AddBackward0>)


1671it [00:14, 119.46it/s, loss=3.1e+3] 

tensor(3172.4243, device='cuda:0', grad_fn=<AddBackward0>)


1719it [00:14, 119.15it/s, loss=2.94e+3]

tensor(3310.1030, device='cuda:0', grad_fn=<AddBackward0>)


1769it [00:15, 116.98it/s, loss=3.06e+3]

tensor(3170.2129, device='cuda:0', grad_fn=<AddBackward0>)


1818it [00:15, 117.90it/s, loss=3.25e+3]

tensor(3364.1318, device='cuda:0', grad_fn=<AddBackward0>)


1868it [00:15, 118.66it/s, loss=3.02e+3]

tensor(2999.8796, device='cuda:0', grad_fn=<AddBackward0>)


1875it [00:15, 118.10it/s, loss=3.44e+3]
11it [00:00, 108.32it/s, loss=3.41e+3]

7
tensor(3447.6519, device='cuda:0', grad_fn=<AddBackward0>)


72it [00:00, 114.71it/s, loss=3.09e+3]

tensor(3154.9380, device='cuda:0', grad_fn=<AddBackward0>)


120it [00:01, 117.36it/s, loss=3.3e+3] 

tensor(3080.1455, device='cuda:0', grad_fn=<AddBackward0>)


169it [00:01, 116.88it/s, loss=2.93e+3]

tensor(3054.2686, device='cuda:0', grad_fn=<AddBackward0>)


217it [00:01, 117.84it/s, loss=3.14e+3]

tensor(3308.2759, device='cuda:0', grad_fn=<AddBackward0>)


267it [00:02, 118.80it/s, loss=3.24e+3]

tensor(3064.9668, device='cuda:0', grad_fn=<AddBackward0>)


315it [00:02, 118.35it/s, loss=3.41e+3]

tensor(3197.4390, device='cuda:0', grad_fn=<AddBackward0>)


364it [00:03, 119.00it/s, loss=3.23e+3]

tensor(3506.6914, device='cuda:0', grad_fn=<AddBackward0>)


413it [00:03, 115.78it/s, loss=3.22e+3]

tensor(3487.7136, device='cuda:0', grad_fn=<AddBackward0>)


461it [00:04, 115.76it/s, loss=3.13e+3]

tensor(3104.1609, device='cuda:0', grad_fn=<AddBackward0>)


522it [00:04, 117.73it/s, loss=3.2e+3] 

tensor(3271.8032, device='cuda:0', grad_fn=<AddBackward0>)


571it [00:04, 117.81it/s, loss=3.16e+3]

tensor(3104.4368, device='cuda:0', grad_fn=<AddBackward0>)


619it [00:05, 116.02it/s, loss=3.1e+3] 

tensor(3271.3079, device='cuda:0', grad_fn=<AddBackward0>)


668it [00:05, 117.95it/s, loss=3.31e+3]

tensor(3017.4446, device='cuda:0', grad_fn=<AddBackward0>)


719it [00:06, 119.39it/s, loss=3.06e+3]

tensor(3194.3191, device='cuda:0', grad_fn=<AddBackward0>)


770it [00:06, 119.51it/s, loss=3.3e+3] 

tensor(3230.7910, device='cuda:0', grad_fn=<AddBackward0>)


821it [00:06, 118.72it/s, loss=3.01e+3]

tensor(2910.4106, device='cuda:0', grad_fn=<AddBackward0>)


870it [00:07, 118.72it/s, loss=3.29e+3]

tensor(3133.9426, device='cuda:0', grad_fn=<AddBackward0>)


919it [00:07, 118.79it/s, loss=3.11e+3]

tensor(3327.7737, device='cuda:0', grad_fn=<AddBackward0>)


967it [00:08, 118.52it/s, loss=3.2e+3] 

tensor(3052.9561, device='cuda:0', grad_fn=<AddBackward0>)


1015it [00:08, 118.14it/s, loss=3.31e+3]

tensor(3380.4780, device='cuda:0', grad_fn=<AddBackward0>)


1063it [00:09, 115.79it/s, loss=3.13e+3]

tensor(3214.6350, device='cuda:0', grad_fn=<AddBackward0>)


1111it [00:09, 116.69it/s, loss=3.23e+3]

tensor(3083.9233, device='cuda:0', grad_fn=<AddBackward0>)


1162it [00:09, 119.23it/s, loss=3.13e+3]

tensor(3173.6951, device='cuda:0', grad_fn=<AddBackward0>)


1211it [00:10, 115.80it/s, loss=3.17e+3]

tensor(3115.3406, device='cuda:0', grad_fn=<AddBackward0>)


1260it [00:10, 117.52it/s, loss=2.97e+3]

tensor(3192.2781, device='cuda:0', grad_fn=<AddBackward0>)


1310it [00:11, 116.33it/s, loss=3.29e+3]

tensor(3305.5903, device='cuda:0', grad_fn=<AddBackward0>)


1370it [00:11, 117.37it/s, loss=3.07e+3]

tensor(3261.4961, device='cuda:0', grad_fn=<AddBackward0>)


1420it [00:12, 117.73it/s, loss=3.16e+3]

tensor(3236.3467, device='cuda:0', grad_fn=<AddBackward0>)


1469it [00:12, 117.19it/s, loss=3.3e+3] 

tensor(3176.8784, device='cuda:0', grad_fn=<AddBackward0>)


1516it [00:13, 108.03it/s, loss=3.14e+3]

tensor(3217.1575, device='cuda:0', grad_fn=<AddBackward0>)


1564it [00:13, 115.04it/s, loss=3.22e+3]

tensor(3095.2546, device='cuda:0', grad_fn=<AddBackward0>)


1614it [00:13, 117.78it/s, loss=3.28e+3]

tensor(3338.4128, device='cuda:0', grad_fn=<AddBackward0>)


1663it [00:14, 117.04it/s, loss=3.22e+3]

tensor(2921.9094, device='cuda:0', grad_fn=<AddBackward0>)


1711it [00:14, 115.72it/s, loss=3.06e+3]

tensor(3183.9539, device='cuda:0', grad_fn=<AddBackward0>)


1760it [00:15, 115.77it/s, loss=3.45e+3]

tensor(3125.4778, device='cuda:0', grad_fn=<AddBackward0>)


1821it [00:15, 117.74it/s, loss=3.05e+3]

tensor(2961.7954, device='cuda:0', grad_fn=<AddBackward0>)


1871it [00:15, 119.14it/s, loss=3.19e+3]

tensor(3364.6050, device='cuda:0', grad_fn=<AddBackward0>)


1875it [00:16, 117.04it/s, loss=3.36e+3]
11it [00:00, 103.27it/s, loss=3.14e+3]

8
tensor(3049.4092, device='cuda:0', grad_fn=<AddBackward0>)


61it [00:00, 116.21it/s, loss=2.94e+3]

tensor(3120.7661, device='cuda:0', grad_fn=<AddBackward0>)


123it [00:01, 119.02it/s, loss=2.91e+3]

tensor(3132.2793, device='cuda:0', grad_fn=<AddBackward0>)


173it [00:01, 119.88it/s, loss=2.99e+3]

tensor(3292.9619, device='cuda:0', grad_fn=<AddBackward0>)


212it [00:01, 120.64it/s, loss=3.31e+3]

tensor(3308.2388, device='cuda:0', grad_fn=<AddBackward0>)


262it [00:02, 118.60it/s, loss=3.12e+3]

tensor(3170.3713, device='cuda:0', grad_fn=<AddBackward0>)


313it [00:02, 119.97it/s, loss=3.05e+3]

tensor(3342.5444, device='cuda:0', grad_fn=<AddBackward0>)


361it [00:03, 115.63it/s, loss=3.23e+3]

tensor(3127.8276, device='cuda:0', grad_fn=<AddBackward0>)


422it [00:03, 118.35it/s, loss=3.24e+3]

tensor(3162.2544, device='cuda:0', grad_fn=<AddBackward0>)


470it [00:03, 117.09it/s, loss=3.08e+3]

tensor(3252.4795, device='cuda:0', grad_fn=<AddBackward0>)


519it [00:04, 118.23it/s, loss=2.99e+3]

tensor(3056.3259, device='cuda:0', grad_fn=<AddBackward0>)


567it [00:04, 117.90it/s, loss=3.38e+3]

tensor(3067.6741, device='cuda:0', grad_fn=<AddBackward0>)


616it [00:05, 118.36it/s, loss=3.09e+3]

tensor(3357.9758, device='cuda:0', grad_fn=<AddBackward0>)


667it [00:05, 119.41it/s, loss=2.87e+3]

tensor(3346.6008, device='cuda:0', grad_fn=<AddBackward0>)


716it [00:06, 118.23it/s, loss=3.41e+3]

tensor(3154.3201, device='cuda:0', grad_fn=<AddBackward0>)


765it [00:06, 118.75it/s, loss=3.32e+3]

tensor(2885.6841, device='cuda:0', grad_fn=<AddBackward0>)


813it [00:06, 117.84it/s, loss=3.24e+3]

tensor(3416.3926, device='cuda:0', grad_fn=<AddBackward0>)


862it [00:07, 115.02it/s, loss=3.3e+3] 

tensor(3415.8616, device='cuda:0', grad_fn=<AddBackward0>)


923it [00:07, 117.95it/s, loss=3.15e+3]

tensor(3350.1045, device='cuda:0', grad_fn=<AddBackward0>)


971it [00:08, 113.09it/s, loss=3.11e+3]

tensor(3310.6589, device='cuda:0', grad_fn=<AddBackward0>)


1019it [00:08, 115.26it/s, loss=2.9e+3] 

tensor(3021.6797, device='cuda:0', grad_fn=<AddBackward0>)


1067it [00:09, 116.32it/s, loss=3.23e+3]

tensor(3271.9785, device='cuda:0', grad_fn=<AddBackward0>)


1115it [00:09, 118.03it/s, loss=3.13e+3]

tensor(3321.5405, device='cuda:0', grad_fn=<AddBackward0>)


1163it [00:09, 118.37it/s, loss=3.13e+3]

tensor(3174.9148, device='cuda:0', grad_fn=<AddBackward0>)


1212it [00:10, 118.55it/s, loss=2.76e+3]

tensor(3175.4224, device='cuda:0', grad_fn=<AddBackward0>)


1260it [00:10, 117.17it/s, loss=3.21e+3]

tensor(3294.3892, device='cuda:0', grad_fn=<AddBackward0>)


1321it [00:11, 118.26it/s, loss=3.32e+3]

tensor(3083.1438, device='cuda:0', grad_fn=<AddBackward0>)


1370it [00:11, 117.37it/s, loss=3.14e+3]

tensor(3220.8042, device='cuda:0', grad_fn=<AddBackward0>)


1418it [00:12, 117.38it/s, loss=3.14e+3]

tensor(3124.9565, device='cuda:0', grad_fn=<AddBackward0>)


1467it [00:12, 118.71it/s, loss=3.27e+3]

tensor(3426.4849, device='cuda:0', grad_fn=<AddBackward0>)


1516it [00:12, 118.57it/s, loss=2.93e+3]

tensor(3278.7109, device='cuda:0', grad_fn=<AddBackward0>)


1565it [00:13, 117.14it/s, loss=3.32e+3]

tensor(3359.7661, device='cuda:0', grad_fn=<AddBackward0>)


1613it [00:13, 117.15it/s, loss=2.89e+3]

tensor(3017.5659, device='cuda:0', grad_fn=<AddBackward0>)


1662it [00:14, 117.34it/s, loss=3.17e+3]

tensor(3161.1838, device='cuda:0', grad_fn=<AddBackward0>)


1711it [00:14, 118.22it/s, loss=3.03e+3]

tensor(3218.4961, device='cuda:0', grad_fn=<AddBackward0>)


1771it [00:15, 118.10it/s, loss=3.23e+3]

tensor(3153.1890, device='cuda:0', grad_fn=<AddBackward0>)


1819it [00:15, 118.43it/s, loss=2.95e+3]

tensor(2788.8247, device='cuda:0', grad_fn=<AddBackward0>)


1868it [00:15, 118.61it/s, loss=3.3e+3] 

tensor(3158.1104, device='cuda:0', grad_fn=<AddBackward0>)


1875it [00:15, 117.73it/s, loss=3.07e+3]
12it [00:00, 113.73it/s, loss=3.12e+3]

9
tensor(3077.4746, device='cuda:0', grad_fn=<AddBackward0>)


72it [00:00, 118.06it/s, loss=3.23e+3]

tensor(3135.4041, device='cuda:0', grad_fn=<AddBackward0>)


121it [00:01, 117.61it/s, loss=3.02e+3]

tensor(3330.1548, device='cuda:0', grad_fn=<AddBackward0>)


172it [00:01, 119.75it/s, loss=3.2e+3] 

tensor(3192.7334, device='cuda:0', grad_fn=<AddBackward0>)


223it [00:01, 119.81it/s, loss=3.36e+3]

tensor(3254.3303, device='cuda:0', grad_fn=<AddBackward0>)


272it [00:02, 117.09it/s, loss=3.29e+3]

tensor(3032.2275, device='cuda:0', grad_fn=<AddBackward0>)


322it [00:02, 118.03it/s, loss=3.07e+3]

tensor(2905.0032, device='cuda:0', grad_fn=<AddBackward0>)


370it [00:03, 117.45it/s, loss=2.93e+3]

tensor(3305.6008, device='cuda:0', grad_fn=<AddBackward0>)


420it [00:03, 118.99it/s, loss=3.09e+3]

tensor(3094.8274, device='cuda:0', grad_fn=<AddBackward0>)


468it [00:04, 118.74it/s, loss=3.27e+3]

tensor(3214.6863, device='cuda:0', grad_fn=<AddBackward0>)


517it [00:04, 118.29it/s, loss=3.37e+3]

tensor(3225.2234, device='cuda:0', grad_fn=<AddBackward0>)


565it [00:04, 115.95it/s, loss=3e+3]   

tensor(3278.1599, device='cuda:0', grad_fn=<AddBackward0>)


614it [00:05, 115.73it/s, loss=3.41e+3]

tensor(3365.2371, device='cuda:0', grad_fn=<AddBackward0>)


663it [00:05, 117.61it/s, loss=2.89e+3]

tensor(3226.7603, device='cuda:0', grad_fn=<AddBackward0>)


713it [00:06, 118.48it/s, loss=3.28e+3]

tensor(2993.1628, device='cuda:0', grad_fn=<AddBackward0>)


761it [00:06, 117.93it/s, loss=3.13e+3]

tensor(3072.6260, device='cuda:0', grad_fn=<AddBackward0>)


809it [00:07, 107.13it/s, loss=3.21e+3]

tensor(3328.5930, device='cuda:0', grad_fn=<AddBackward0>)


866it [00:07, 99.54it/s, loss=3.14e+3] 

tensor(3213.5488, device='cuda:0', grad_fn=<AddBackward0>)


911it [00:08, 102.73it/s, loss=3.34e+3]

tensor(3042.4897, device='cuda:0', grad_fn=<AddBackward0>)


961it [00:08, 115.21it/s, loss=2.95e+3]

tensor(3181.6282, device='cuda:0', grad_fn=<AddBackward0>)


1021it [00:08, 112.98it/s, loss=3.02e+3]

tensor(3299.9336, device='cuda:0', grad_fn=<AddBackward0>)


1069it [00:09, 114.30it/s, loss=3.11e+3]

tensor(3242.3809, device='cuda:0', grad_fn=<AddBackward0>)


1117it [00:09, 115.82it/s, loss=2.91e+3]

tensor(2998.6592, device='cuda:0', grad_fn=<AddBackward0>)


1166it [00:10, 114.73it/s, loss=3.19e+3]

tensor(3175.7493, device='cuda:0', grad_fn=<AddBackward0>)


1214it [00:10, 110.51it/s, loss=3.07e+3]

tensor(3426.6995, device='cuda:0', grad_fn=<AddBackward0>)


1262it [00:11, 109.36it/s, loss=3.21e+3]

tensor(3097.7329, device='cuda:0', grad_fn=<AddBackward0>)


1310it [00:11, 111.41it/s, loss=3.32e+3]

tensor(3238.7461, device='cuda:0', grad_fn=<AddBackward0>)


1370it [00:11, 113.91it/s, loss=2.8e+3] 

tensor(2956.5908, device='cuda:0', grad_fn=<AddBackward0>)


1418it [00:12, 116.70it/s, loss=2.95e+3]

tensor(2972.4180, device='cuda:0', grad_fn=<AddBackward0>)


1467it [00:12, 114.37it/s, loss=3.09e+3]

tensor(3098.4951, device='cuda:0', grad_fn=<AddBackward0>)


1515it [00:13, 114.41it/s, loss=3.17e+3]

tensor(3173.2537, device='cuda:0', grad_fn=<AddBackward0>)


1564it [00:13, 116.29it/s, loss=3.16e+3]

tensor(3208.0601, device='cuda:0', grad_fn=<AddBackward0>)


1611it [00:14, 107.27it/s, loss=3.04e+3]

tensor(3358.0300, device='cuda:0', grad_fn=<AddBackward0>)


1672it [00:14, 115.06it/s, loss=3.32e+3]

tensor(3098.3301, device='cuda:0', grad_fn=<AddBackward0>)


1720it [00:15, 113.03it/s, loss=3.07e+3]

tensor(3202.6821, device='cuda:0', grad_fn=<AddBackward0>)


1768it [00:15, 113.92it/s, loss=3.18e+3]

tensor(3199.1284, device='cuda:0', grad_fn=<AddBackward0>)


1816it [00:15, 115.84it/s, loss=3.19e+3]

tensor(3052.0754, device='cuda:0', grad_fn=<AddBackward0>)


1864it [00:16, 115.90it/s, loss=3.08e+3]

tensor(3293.8960, device='cuda:0', grad_fn=<AddBackward0>)


1875it [00:16, 114.33it/s, loss=3.35e+3]
