In [181]:
import spacy
import pandas as pd
from torchtext.data import Field, BucketIterator, TabularDataset
import pickle
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.functional import softmax

In [182]:
train = open("Info.txt", encoding = "utf8").read().lower().split('\n')

In [183]:
en = spacy.load('en_core_web_sm')

In [184]:
train

['your name is oliver',
 'you are 20 years old',
 'you have red hair',
 'you are male',
 'you live in bristol',
 'you make music',
 'you are a software engineer',
 'you are incredibly smart',
 'you are humble',
 'you are funny',
 "you don't change your bed sheets"]

In [185]:
words = pickle.load(open("6B.50_words.pkl", "rb"))

In [186]:
word2idx = pickle.load(open("6B.50_idx.pkl", "rb"))

In [187]:
def tokenize(text):
    return [word2idx[word] for word in text.split() if word in words]

In [188]:
def pad(tensor, length):
    t = np.zeros(length, dtype = np.int)
    tensor = tensor[:length]
    to_idx = tensor.shape[0]
    t[:to_idx] = tensor
    return t

In [189]:
data = np.array([pad(torch.tensor(tokenize(sent)).long(), 10) for sent in train])
data = torch.Tensor(data).long()

In [391]:
questions = open("Questions.txt", encoding = "utf8").read().lower().split('\n')

In [392]:
questions[0].split(',')

['is your name oliver-is your name danny-is your name jess', '1-0-0']

In [393]:
questions = [[segment.split('-') for segment in sample.split(',')] for sample in questions]

In [573]:
class MemoryBlock(nn.Module):
    def __init__(self, vocab_size, d_model, X):
        super().__init__()
        self.d_model = d_model
        
        self.embed = nn.Embedding(vocab_size, d_model)
        self.tokeys = nn.Linear(d_model, d_model)
        self.toqueries = nn.Linear(d_model, d_model)
        self.tovalues = nn.Linear(d_model, d_model)
        
        self.do = nn.Dropout(0.1)
        
        self.norm1 = nn.LayerNorm([d_model])
        
        self.encodedtomemory = nn.Linear(d_model, X)
        self.tomemory = nn.Linear(d_model, X)
        self.toresult = nn.Linear(X, 2)
        
        self.norm2 = nn.LayerNorm([X])
        
        self.memory = torch.rand(X, X)
        
    def forward(self, x, questions):
        #Keys Queries Values
        x = self.embed(x)
        x = self.do(x)
        keys = self.tokeys(x)
        queries = self.toqueries(x)
        values = self.tovalues(x)
        
        q = self.embed(questions)
        qkeys = self.tokeys(q)
        qqueries = self.toqueries(q)
        qvalues = self.tovalues(q)
        
        #Attention
        dot = torch.bmm(queries, keys.transpose(1, 2))/np.sqrt(self.d_model)
        attention = torch.bmm(softmax(dot, dim = 1), values)
        attention = self.norm1(attention + x)
        attention = self.encodedtomemory(torch.sum(torch.mean(attention, dim = 1), dim=0))
        attention = self.do(attention)
        attention = torch.unsqueeze(attention, 0)
        attention = torch.mm(attention.transpose(0,1), attention)
        
        #Update Memory
        tmp = torch.zeros(self.memory.shape).to(device)
        tmp[:,:] = self.memory.detach()
        self.memory = attention + tmp
        self.memory = self.norm2(self.memory)
        
        #Encode Question
        qdot = torch.mm(qqueries, qkeys.transpose(0, 1))/np.sqrt(self.d_model)
        qattention = torch.mm(softmax(qdot, dim = 1), qvalues)
        qattention = self.norm1(qattention + q)
        qattention = self.encodedtomemory(torch.mean(qattention, dim = 0))
        qattention = self.do(qattention)
        qattention = torch.unsqueeze(qattention, 0)
        result = self.toresult(torch.mm(qattention, self.memory))
        result = softmax(result, dim = 1)
        
        return result
        

In [574]:
device = "cuda" if torch.cuda.is_available() else "cpu"
mb = MemoryBlock(len(words), 10, 15).to(device)

In [575]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(mb.parameters(), lr = 1e-4)

In [577]:
torch.autograd.set_detect_anomaly(True)
for e in range(100):
    for i,(q,v) in enumerate(questions):
        d = data[i]
        for j, statement in enumerate(q):
            s = np.array(pad(torch.tensor(tokenize(statement)).long(), 10))
            s = torch.Tensor(s).long().to(device)
            optimizer.zero_grad()

            label = torch.Tensor([int(v[j])]).long().to(device)
            out = mb(data.to(device), s).to(device)
            loss = criterion(out, label).to(device)
            loss.backward(retain_graph=True)
            optimizer.step()
            print(out)
            print(label)
            print(loss.item())

tensor([[0.1670, 0.8330]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4145941734313965
tensor([[0.5342, 0.4658]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.659525454044342
tensor([[0.3433, 0.6567]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.8621164560317993
tensor([[0.4944, 0.5056]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6875771880149841
tensor([[0.4984, 0.5016]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6915927529335022
tensor([[0.7655, 0.2345]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.4624871015548706
tensor([[0.3674, 0.6326]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.8345324397087097
tensor([[0.4294, 0.5706]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6250441074371338
tensor([[0.4573, 0.5427]], device='cuda:0', grad_

tensor([[0.0281, 0.9719]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.2724465131759644
tensor([[0.0317, 0.9683]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.3306848406791687
tensor([[0.2388, 0.7612]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9880669713020325
tensor([[0.5172, 0.4828]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.676047682762146
tensor([[0.7832, 0.2168]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.0158617496490479
tensor([[0.8056, 0.1944]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.0447958707809448
tensor([[0.8116, 0.1884]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.4293433725833893
tensor([[0.4209, 0.5791]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7753857374191284
tensor([[0.4724, 0.5276]], device='cuda:0', grad_

tensor([[0.4402, 0.5598]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6350923180580139
tensor([[0.7528, 0.2472]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.47192901372909546
tensor([[0.4051, 0.5949]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6027445793151855
tensor([[0.3537, 0.6463]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.8500728607177734
tensor([[0.5261, 0.4739]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6673763990402222
tensor([[0.6738, 0.3262]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.8820270895957947
tensor([[0.8638, 0.1362]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.1217182874679565
tensor([[0.5948, 0.4052]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6028153896331787
tensor([[0.4143, 0.5857]], device='cuda:0', gra

tensor([[0.6928, 0.3072]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.9043678641319275
tensor([[0.3505, 0.6495]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5548158884048462
tensor([[0.6809, 0.3191]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5285285711288452
tensor([[0.5308, 0.4692]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7244412302970886
tensor([[0.4075, 0.5925]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7899549007415771
tensor([[0.8857, 0.1143]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.380085825920105
tensor([[0.7053, 0.2947]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.9194124937057495
tensor([[0.0238, 0.9762]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.32627052068710327
tensor([[0.3801, 0.6199]], device='cuda:0', grad

tensor([[0.6491, 0.3509]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5551278591156006
tensor([[0.3159, 0.6841]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5258886814117432
tensor([[0.7057, 0.2943]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.9198307394981384
tensor([[0.8200, 0.1800]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.4234764873981476
tensor([[0.0399, 0.9601]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.33537667989730835
tensor([[0.0204, 0.9796]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.2836647033691406
tensor([[0.5910, 0.4090]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6063172817230225
tensor([[0.1451, 0.8549]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.3999764621257782
tensor([[0.0895, 0.9105]], device='cuda:0', gra

tensor([[0.1984, 0.8016]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4363640546798706
tensor([[0.5779, 0.4221]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6182926893234253
tensor([[0.2343, 0.7657]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.46233081817626953
tensor([[0.7534, 0.2466]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.9782799482345581
tensor([[0.5717, 0.4283]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6240301132202148
tensor([[0.8398, 0.1602]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.0895620584487915
tensor([[0.4043, 0.5957]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7934739589691162
tensor([[0.6808, 0.3192]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5286027789115906
tensor([[0.3737, 0.6263]], device='cuda:0', gra

tensor([[0.8228, 0.1772]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.42154115438461304
tensor([[0.3387, 0.6613]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5447788238525391
tensor([[0.1727, 0.8273]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.073132872581482
tensor([[0.1008, 0.8992]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.37158825993537903
tensor([[0.0304, 0.9696]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.3300030529499054
tensor([[0.2273, 0.7727]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.0025840997695923
tensor([[0.5392, 0.4608]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7331142425537109
tensor([[0.8049, 0.1951]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.43405476212501526
tensor([[0.4352, 0.5648]], device='cuda:0', gr

tensor([[0.4288, 0.5712]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6244375109672546
tensor([[0.8003, 0.1997]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.4372519552707672
tensor([[0.3213, 0.6787]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5303182601928711
tensor([[0.5494, 0.4506]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6449723243713379
tensor([[0.7580, 0.2420]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.9841009378433228
tensor([[0.5141, 0.4859]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7073606252670288
tensor([[0.3051, 0.6949]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9069070816040039
tensor([[0.7094, 0.2906]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.9243704080581665
tensor([[0.3468, 0.6532]], device='cuda:0', grad

tensor([[0.0785, 0.9215]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.2009328603744507
tensor([[0.7554, 0.2446]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.9808658361434937
tensor([[0.0766, 0.9234]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.203558087348938
tensor([[0.3823, 0.6177]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5823346376419067
tensor([[0.2529, 0.7471]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9704097509384155
tensor([[0.5901, 0.4099]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7873498201370239
tensor([[0.5944, 0.4056]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7919644117355347
tensor([[0.7588, 0.2412]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.4674493670463562
tensor([[0.2420, 0.7580]], device='cuda:0', grad_

tensor([[0.2988, 0.7012]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9144644737243652
tensor([[0.3445, 0.6555]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.8606995344161987
tensor([[0.5383, 0.4617]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7321938276290894
tensor([[0.2194, 0.7806]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.0125516653060913
tensor([[0.4868, 0.5132]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6800220608711243
tensor([[0.5774, 0.4226]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6187557578086853
tensor([[0.4347, 0.5653]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6299891471862793
tensor([[0.2346, 0.7654]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4625593423843384
tensor([[0.7654, 0.2346]], device='cuda:0', grad

tensor([[0.1782, 0.8218]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.42228859663009644
tensor([[0.1923, 0.8077]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.0474824905395508
tensor([[0.2317, 0.7683]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9969788789749146
tensor([[0.2632, 0.7368]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4841287136077881
tensor([[0.3505, 0.6495]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.8538281917572021
tensor([[0.1666, 0.8334]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4143103063106537
tensor([[0.5761, 0.4239]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6199445724487305
tensor([[0.5753, 0.4247]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7712697982788086
tensor([[0.3778, 0.6222]], device='cuda:0', gra

tensor([[0.7806, 0.2194]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.012656807899475
tensor([[0.1180, 0.8820]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.3824210464954376
tensor([[0.6612, 0.3388]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.544887900352478
tensor([[0.8569, 0.1431]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.3986223340034485
tensor([[0.5359, 0.4641]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7297306656837463
tensor([[0.2089, 0.7911]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.026009202003479
tensor([[0.5628, 0.4372]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7579128742218018
tensor([[0.2808, 0.7192]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9362269639968872
tensor([[0.2124, 0.7876]], device='cuda:0', grad_fn

tensor([[0.4519, 0.5481]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6462076902389526
tensor([[0.2662, 0.7338]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.48641237616539
tensor([[0.1770, 0.8230]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4214012324810028
tensor([[0.0415, 0.9585]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.2532212734222412
tensor([[0.2738, 0.7262]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9447009563446045
tensor([[0.3069, 0.6931]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.518609881401062
tensor([[0.1791, 0.8209]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.0646699666976929
tensor([[0.9299, 0.0701]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.2127342224121094
tensor([[0.2286, 0.7714]], device='cuda:0', grad_fn

tensor([[0.4283, 0.5717]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7674393653869629
tensor([[0.4959, 0.5041]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6890971660614014
tensor([[0.7980, 0.2020]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.0348787307739258
tensor([[0.8502, 0.1498]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.1035181283950806
tensor([[0.5493, 0.4507]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6450964212417603
tensor([[0.5088, 0.4912]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6843619346618652
tensor([[0.2403, 0.7597]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4668169617652893
tensor([[0.0761, 0.9239]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.2043671607971191
tensor([[0.6601, 0.3399]], device='cuda:0', grad

tensor([[0.5538, 0.4462]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6408292055130005
tensor([[0.8094, 0.1906]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.4308699369430542
tensor([[0.2510, 0.7490]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.47484081983566284
tensor([[0.0239, 0.9761]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.3263392746448517
tensor([[0.4198, 0.5802]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6161695718765259
tensor([[0.9804, 0.0196]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.3239663541316986
tensor([[0.8202, 0.1798]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.4233781397342682
tensor([[0.4412, 0.5588]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6360465288162231
tensor([[0.6031, 0.3969]], device='cuda:0', gra

tensor([[0.4379, 0.5621]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6329862475395203
tensor([[0.7033, 0.2967]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5103824138641357
tensor([[0.8491, 0.1509]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.40379390120506287
tensor([[0.5752, 0.4248]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7711682319641113
tensor([[0.9412, 0.0588]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.2287321090698242
tensor([[0.6235, 0.3765]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.8242802023887634
tensor([[0.9554, 0.0446]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.3380465805530548
tensor([[0.8571, 0.1429]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.3985286355018616
tensor([[0.6773, 0.3227]], device='cuda:0', gra

tensor([[0.4594, 0.5406]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6533304452896118
tensor([[0.3902, 0.6098]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5893871784210205
tensor([[0.1943, 0.8057]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.044843316078186
tensor([[0.6363, 0.3637]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5660947561264038
tensor([[0.8915, 0.1085]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.1593639850616455
tensor([[0.3876, 0.6124]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5870477557182312
tensor([[0.6072, 0.3928]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.8061181306838989
tensor([[0.3162, 0.6838]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.8936889171600342
tensor([[0.3054, 0.6946]], device='cuda:0', grad_

tensor([[0.3992, 0.6008]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5973776578903198
tensor([[0.7191, 0.2809]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.9361161589622498
tensor([[0.7554, 0.2446]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.9807829856872559
tensor([[0.4202, 0.5798]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7761520743370056
tensor([[0.2805, 0.7195]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9365021586418152
tensor([[0.8134, 0.1866]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.054843544960022
tensor([[0.6269, 0.3731]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.8280652761459351
tensor([[0.2326, 0.7674]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4611128568649292
tensor([[0.2558, 0.7442]], device='cuda:0', grad_

tensor([[0.6445, 0.3555]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5590657591819763
tensor([[0.6096, 0.3904]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.8087483644485474
tensor([[0.3770, 0.6230]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5776955485343933
tensor([[0.4824, 0.5176]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6757038235664368
tensor([[0.1574, 0.8426]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.0932936668395996
tensor([[0.4453, 0.5547]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7493662238121033
tensor([[0.4095, 0.5905]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6067714691162109
tensor([[0.4012, 0.5988]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5992349982261658
tensor([[0.4223, 0.5777]], device='cuda:0', grad

tensor([[0.9582, 0.0418]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.3364182412624359
tensor([[0.6970, 0.3030]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5154122114181519
tensor([[0.3249, 0.6751]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5333307981491089
tensor([[0.4749, 0.5251]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6683353185653687
tensor([[0.8090, 0.1910]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.0491749048233032
tensor([[0.1985, 0.8015]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.0394057035446167
tensor([[0.4104, 0.5896]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7867480516433716
tensor([[0.2193, 0.7807]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4513179063796997
tensor([[0.6962, 0.3038]], device='cuda:0', grad

tensor([[0.3952, 0.6048]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5938068628311157
tensor([[0.3603, 0.6397]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.8425248861312866
tensor([[0.5572, 0.4428]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6376065611839294
tensor([[0.8778, 0.1222]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.140718698501587
tensor([[0.2113, 0.7887]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.44554954767227173
tensor([[0.2064, 0.7936]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4420448839664459
tensor([[0.3064, 0.6936]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9053490161895752
tensor([[0.3755, 0.6245]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.8254125714302063
tensor([[0.3523, 0.6477]], device='cuda:0', grad

tensor([[0.6465, 0.3535]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5573337078094482
tensor([[0.7726, 0.2274]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.0024033784866333
tensor([[0.7787, 0.2213]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.4527595043182373
tensor([[0.6638, 0.3362]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5427343249320984
tensor([[0.5500, 0.4500]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7444449663162231
tensor([[0.3460, 0.6540]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5509952306747437
tensor([[0.0511, 0.9489]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.3418044149875641
tensor([[0.2112, 0.7888]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.0231032371520996
tensor([[0.2202, 0.7798]], device='cuda:0', grad

tensor([[0.4755, 0.5245]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6689150333404541
tensor([[0.8580, 0.1420]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.39787358045578003
tensor([[0.2392, 0.7608]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4659859836101532
tensor([[0.3020, 0.6980]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9106243252754211
tensor([[0.2134, 0.7866]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.020207405090332
tensor([[0.8701, 0.1299]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.1302248239517212
tensor([[0.5602, 0.4398]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7551129460334778
tensor([[0.5232, 0.4768]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7165950536727905
tensor([[0.4471, 0.5529]], device='cuda:0', grad

tensor([[0.6022, 0.3978]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.80057692527771
tensor([[0.3549, 0.6451]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5585525631904602
tensor([[0.1896, 0.8104]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.0509662628173828
tensor([[0.0702, 0.9298]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.3530198633670807
tensor([[0.0754, 0.9246]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.205368995666504
tensor([[0.3571, 0.6429]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.8462797999382019
tensor([[0.5606, 0.4394]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7555409669876099
tensor([[0.6826, 0.3174]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.8923723697662354
tensor([[0.4105, 0.5895]], device='cuda:0', grad_fn

tensor([[0.1514, 0.8486]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.101352334022522
tensor([[0.3219, 0.6781]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5307946801185608
tensor([[0.2087, 0.7913]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4436551630496979
tensor([[0.5096, 0.4904]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6835446953773499
tensor([[0.5539, 0.4461]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7485460042953491
tensor([[0.6504, 0.3496]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5540062189102173
tensor([[0.4879, 0.5121]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7052777409553528
tensor([[0.5604, 0.4396]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7553192377090454
tensor([[0.1954, 0.8046]], device='cuda:0', grad_

tensor([[0.6319, 0.3681]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5698893070220947
tensor([[0.2586, 0.7414]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9633702039718628
tensor([[0.5369, 0.4631]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7307301759719849
tensor([[0.4638, 0.5362]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6575652360916138
tensor([[0.8707, 0.1293]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.38963425159454346
tensor([[0.3176, 0.6824]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5273088812828064
tensor([[0.1635, 0.8365]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.0851755142211914
tensor([[0.2744, 0.7256]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9439859390258789
tensor([[0.3315, 0.6685]], device='cuda:0', gra

tensor([[0.0830, 0.9170]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.1946349143981934
tensor([[0.3596, 0.6404]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.8434045314788818
tensor([[0.4892, 0.5108]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7039633989334106
tensor([[0.5241, 0.4759]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7174947261810303
tensor([[0.5180, 0.4820]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7113513946533203
tensor([[0.8350, 0.1650]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.41326475143432617
tensor([[0.1589, 0.8411]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4091012179851532
tensor([[0.1627, 0.8373]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.0863406658172607
tensor([[0.2168, 0.7832]], device='cuda:0', gra

tensor([[0.4114, 0.5886]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6084508895874023
tensor([[0.0997, 0.9003]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.1715158224105835
tensor([[0.0957, 0.9043]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.1771106719970703
tensor([[0.0586, 0.9414]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.2289056777954102
tensor([[0.4406, 0.5594]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6354963183403015
tensor([[0.3813, 0.6187]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5814822316169739
tensor([[0.5776, 0.4224]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6185564398765564
tensor([[0.3271, 0.6729]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5351270437240601
tensor([[0.7671, 0.2329]], device='cuda:0', grad

tensor([[0.2286, 0.7714]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4581027030944824
tensor([[0.3396, 0.6604]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5455716252326965
tensor([[0.6265, 0.3735]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5746020674705505
tensor([[0.7625, 0.2375]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.46469825506210327
tensor([[0.5212, 0.4788]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6721594929695129
tensor([[0.0335, 0.9665]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.33172717690467834
tensor([[0.2612, 0.7388]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4825740456581116
tensor([[0.1619, 0.8381]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.0872972011566162
tensor([[0.0260, 0.9740]], device='cuda:0', gr

tensor([[0.8011, 0.1989]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.436711847782135
tensor([[0.3107, 0.6893]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5216585397720337
tensor([[0.4703, 0.5297]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6638840436935425
tensor([[0.2341, 0.7659]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9940112829208374
tensor([[0.3717, 0.6283]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.829600989818573
tensor([[0.5828, 0.4172]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6137726306915283
tensor([[0.6030, 0.3970]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.801464855670929
tensor([[0.5199, 0.4801]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7132343053817749
tensor([[0.3006, 0.6994]], device='cuda:0', grad_fn

tensor([[0.7563, 0.2437]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.4693545699119568
tensor([[0.1713, 0.8287]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.0748627185821533
tensor([[0.9526, 0.0474]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.2448086738586426
tensor([[0.1632, 0.8368]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.41201767325401306
tensor([[0.2240, 0.7760]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.0068175792694092
tensor([[0.9292, 0.0708]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.35334649682044983
tensor([[0.9268, 0.0732]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.35481011867523193
tensor([[0.7823, 0.2177]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.014735460281372
tensor([[0.7349, 0.2651]], device='cuda:0', gr

tensor([[0.5515, 0.4485]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7459661364555359
tensor([[0.7317, 0.2683]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.48803961277008057
tensor([[0.7357, 0.2643]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.4849390983581543
tensor([[0.6257, 0.3743]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.8266810178756714
tensor([[0.6031, 0.3969]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.8015660047531128
tensor([[0.6718, 0.3282]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5360740423202515
tensor([[0.7100, 0.2900]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5050653219223022
tensor([[0.6298, 0.3702]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5717594027519226
tensor([[0.1682, 0.8318]], device='cuda:0', gra

tensor([[0.2518, 0.7482]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4754674732685089
tensor([[0.7329, 0.2671]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.9528810381889343
tensor([[0.6964, 0.3036]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.515876054763794
tensor([[0.4228, 0.5772]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7732877731323242
tensor([[0.4337, 0.5663]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6290774345397949
tensor([[0.0520, 0.9480]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.3423300087451935
tensor([[0.5143, 0.4857]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6789650917053223
tensor([[0.4437, 0.5563]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7510697245597839
tensor([[0.4216, 0.5784]], device='cuda:0', grad_

tensor([[0.3107, 0.6893]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9002346992492676
tensor([[0.5685, 0.4315]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7639471292495728
tensor([[0.4872, 0.5128]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6804547905921936
tensor([[0.2982, 0.7018]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9151145815849304
tensor([[0.2499, 0.7501]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9742396473884583
tensor([[0.9214, 0.0786]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.2008377313613892
tensor([[0.5085, 0.4915]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7016879320144653
tensor([[0.2568, 0.7432]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9655882120132446
tensor([[0.4785, 0.5215]], device='cuda:0', grad

tensor([[0.3284, 0.6716]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.8794345855712891
tensor([[0.7022, 0.2978]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5112447142601013
tensor([[0.5789, 0.4211]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7751679420471191
tensor([[0.1505, 0.8495]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.40349191427230835
tensor([[0.7028, 0.2972]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5107448101043701
tensor([[0.5308, 0.4692]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6628310680389404
tensor([[0.3682, 0.6318]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5700345039367676
tensor([[0.5613, 0.4387]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7563128471374512
tensor([[0.3715, 0.6285]], device='cuda:0', gra

tensor([[0.2262, 0.7738]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4563958942890167
tensor([[0.4859, 0.5141]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7073942422866821
tensor([[0.5718, 0.4282]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.623936116695404
tensor([[0.7428, 0.2572]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.9651455879211426
tensor([[0.5771, 0.4229]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7732179164886475
tensor([[0.5491, 0.4509]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6452907919883728
tensor([[0.5827, 0.4173]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6138629913330078
tensor([[0.5161, 0.4839]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7093862891197205
tensor([[0.0985, 0.9015]], device='cuda:0', grad_

tensor([[0.3794, 0.6206]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.8210400342941284
tensor([[0.1930, 0.8070]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4325338900089264
tensor([[0.5499, 0.4501]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6445249319076538
tensor([[0.3574, 0.6426]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.8458828330039978
tensor([[0.5079, 0.4921]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7010709643363953
tensor([[0.6699, 0.3301]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.877403974533081
tensor([[0.6912, 0.3088]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5201051235198975
tensor([[0.6031, 0.3969]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5953376293182373
tensor([[0.4116, 0.5884]], device='cuda:0', grad_

tensor([[0.5263, 0.4737]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7197898030281067
tensor([[0.6007, 0.3993]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5975356101989746
tensor([[0.9009, 0.0991]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.1723839044570923
tensor([[0.6934, 0.3066]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5183407068252563
tensor([[0.1704, 0.8296]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.0761361122131348
tensor([[0.4004, 0.5996]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5984840393066406
tensor([[0.4105, 0.5895]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6076728105545044
tensor([[0.5065, 0.4935]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6866883635520935
tensor([[0.5904, 0.4096]], device='cuda:0', grad

tensor([[0.4132, 0.5868]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6100923418998718
tensor([[0.2273, 0.7727]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4571745991706848
tensor([[0.5092, 0.4908]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.684015691280365
tensor([[0.6463, 0.3537]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.8501406311988831
tensor([[0.5135, 0.4865]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6797553300857544
tensor([[0.2694, 0.7306]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9501379728317261
tensor([[0.3971, 0.6029]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5954949855804443
tensor([[0.6284, 0.3716]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.8297533988952637
tensor([[0.2465, 0.7535]], device='cuda:0', grad_

tensor([[0.6583, 0.3417]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.547289252281189
tensor([[0.4710, 0.5290]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6645824909210205
tensor([[0.5047, 0.4953]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6978156566619873
tensor([[0.4689, 0.5311]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.724712073802948
tensor([[0.3020, 0.6980]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.514626145362854
tensor([[0.0612, 0.9388]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.2252576351165771
tensor([[0.6946, 0.3054]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5173817873001099
tensor([[0.5263, 0.4737]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7198293209075928
tensor([[0.3157, 0.6843]], device='cuda:0', grad_fn

tensor([[0.1662, 0.8338]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.41404780745506287
tensor([[0.3437, 0.6563]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.8615595698356628
tensor([[0.5023, 0.4977]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.695422887802124
tensor([[0.4493, 0.5507]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6436910629272461
tensor([[0.4650, 0.5350]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7287098169326782
tensor([[0.4280, 0.5720]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.623764157295227
tensor([[0.7379, 0.2621]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.48331135511398315
tensor([[0.5996, 0.4004]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5985349416732788
tensor([[0.5794, 0.4206]], device='cuda:0', grad

tensor([[0.5325, 0.4675]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6611980199813843
tensor([[0.6288, 0.3712]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.830254316329956
tensor([[0.4741, 0.5259]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7194126844406128
tensor([[0.4547, 0.5453]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6488332748413086
tensor([[0.7601, 0.2399]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.9866394996643066
tensor([[0.4084, 0.5916]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7888860702514648
tensor([[0.4117, 0.5883]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6087689399719238
tensor([[0.6185, 0.3815]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5816161632537842
tensor([[0.5570, 0.4430]], device='cuda:0', grad_

tensor([[0.1649, 0.8351]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.41313937306404114
tensor([[0.0817, 0.9183]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.1964812278747559
tensor([[0.7378, 0.2622]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.9589849710464478
tensor([[0.5969, 0.4031]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6009017825126648
tensor([[0.2229, 0.7771]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4539805054664612
tensor([[0.4036, 0.5964]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6013711094856262
tensor([[0.4316, 0.5684]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7638814449310303
tensor([[0.1005, 0.8995]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.37143394351005554
tensor([[0.0869, 0.9131]], device='cuda:0', gr

tensor([[0.5764, 0.4236]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6197051405906677
tensor([[0.4615, 0.5385]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6554250717163086
tensor([[0.5496, 0.4504]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6447455286979675
tensor([[0.2739, 0.7261]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4924151301383972
tensor([[0.1612, 0.8388]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.0882349014282227
tensor([[0.2187, 0.7813]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.45087313652038574
tensor([[0.4166, 0.5834]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6132557988166809
tensor([[0.4297, 0.5703]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7659059762954712
tensor([[0.6360, 0.3640]], device='cuda:0', gra

tensor([[0.1068, 0.8932]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.1617745161056519
tensor([[0.3920, 0.6080]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.8069334030151367
tensor([[0.2202, 0.7798]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4519687294960022
tensor([[0.4954, 0.5046]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.697736382484436
tensor([[0.5047, 0.4953]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6978564262390137
tensor([[0.4325, 0.5675]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7629373669624329
tensor([[0.1783, 0.8217]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4223414957523346
tensor([[0.0943, 0.9057]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.3675767779350281
tensor([[0.2689, 0.7311]], device='cuda:0', grad_

tensor([[0.4543, 0.5457]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6484936475753784
tensor([[0.7979, 0.2021]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.4389920234680176
tensor([[0.8784, 0.1216]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.38467949628829956
tensor([[0.4858, 0.5142]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6790263056755066
tensor([[0.5911, 0.4089]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6061601638793945
tensor([[0.5342, 0.4658]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7279273271560669
tensor([[0.3924, 0.6076]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.8065474033355713
tensor([[0.4477, 0.5523]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6421966552734375
tensor([[0.4354, 0.5646]], device='cuda:0', gra

tensor([[0.7195, 0.2805]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.9365468621253967
tensor([[0.2087, 0.7913]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.44370585680007935
tensor([[0.3169, 0.6831]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.8929375410079956
tensor([[0.0578, 0.9422]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
1.230125904083252
tensor([[0.3746, 0.6254]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.5756077170372009
tensor([[0.3101, 0.6899]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9009901285171509
tensor([[0.4199, 0.5801]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6162451505661011
tensor([[0.4512, 0.5488]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7431292533874512
tensor([[0.3458, 0.6542]], device='cuda:0', grad

tensor([[0.4390, 0.5610]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6339923143386841
tensor([[0.5255, 0.4745]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.7189568281173706
tensor([[0.4587, 0.5413]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.652745246887207
tensor([[0.6842, 0.3158]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5258410573005676
tensor([[0.4926, 0.5074]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7005773782730103
tensor([[0.8385, 0.1615]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.0878944396972656
tensor([[0.7901, 0.2099]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.44457924365997314
tensor([[0.5940, 0.4060]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.791511595249176
tensor([[0.2954, 0.7046]], device='cuda:0', grad_

tensor([[0.7457, 0.2543]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.4772982895374298
tensor([[0.5670, 0.4330]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.762410044670105
tensor([[0.8011, 0.1989]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.038935661315918
tensor([[0.6570, 0.3430]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.8623933792114258
tensor([[0.6118, 0.3882]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5876031517982483
tensor([[0.4619, 0.5381]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.7319700717926025
tensor([[0.6221, 0.3779]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.822667121887207
tensor([[0.6128, 0.3872]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5866767168045044
tensor([[0.3180, 0.6820]], device='cuda:0', grad_fn

tensor([[0.6004, 0.3996]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.5977908372879028
tensor([[0.2985, 0.7015]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9148232936859131
tensor([[0.2779, 0.7221]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4955275058746338
tensor([[0.4274, 0.5726]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.6231424808502197
tensor([[0.1968, 0.8032]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.4352128207683563
tensor([[0.2447, 0.7553]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9807237982749939
tensor([[0.8248, 0.1752]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.42018866539001465
tensor([[0.6630, 0.3370]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.8693954944610596
tensor([[0.7078, 0.2922]], device='cuda:0', gra

tensor([[0.2565, 0.7435]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.47897306084632874
tensor([[0.2879, 0.7121]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.9275524616241455
tensor([[0.3758, 0.6242]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.8250769376754761
tensor([[0.6746, 0.3254]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.8829643726348877
tensor([[0.8367, 0.1633]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
1.0855120420455933
tensor([[0.6671, 0.3329]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([1], device='cuda:0')
0.8741554021835327
tensor([[0.5843, 0.4157]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.6123626828193665
tensor([[0.8456, 0.1544]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([0], device='cuda:0')
0.4060959219932556
tensor([[0.7266, 0.2734]], device='cuda:0', gra