In [435]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.functional as F


import random
import math
import time
import pandas as pd
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from torch.utils.data import DataLoader

In [436]:
SEED = 1234

# Seed every random number generator the notebook relies on (Python, NumPy,
# PyTorch CPU and CUDA) with the same value so runs are repeatable.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(SEED)

# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

In [437]:
# Load the training sample, keeping only the columns used downstream.
# NOTE(review): absolute local path; consider a configurable data directory.
train = pd.read_csv('/Users/hesu/Documents/KT/riiid/train_1M.csv',
                   usecols = [1,2,3,4,7,8,9],
                   dtype={'timestamp':'int64',
                         # Was misspelled 'used_id', so this dtype was never applied.
                         'user_id':'int32',
                         'content_id':'int16',
                         'content_type_id':'int8',
                         'answered_correctly':'int8',
                         'prior_question_elapsed_time':'float32',
                         'prior_question_had_explanation':'boolean'})

# Keep only rows whose content_type_id is 0.
train = train[train.content_type_id == 0]

# Order interactions chronologically. A stable sort keeps rows that share a
# timestamp in their original file order, so per-user sequences built later
# are reproducible.
train = train.sort_values(['timestamp'], ascending=True, kind='mergesort').reset_index(drop=True)
train.head(10)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,115,5692,0,1,,
1,0,7022747,5558,0,0,,
2,0,7023662,4626,0,1,,
3,0,7025965,7900,0,1,,
4,0,7029547,4449,0,1,,
5,0,579346,7900,0,1,,
6,0,7039142,5458,0,1,,
7,0,581706,4565,0,1,,
8,0,7042700,7900,0,1,,
9,0,20042606,7900,0,0,,


In [438]:
# Load the question metadata table; later cells join it to the interaction
# frames on question_id.
# NOTE(review): absolute local path; consider a configurable data directory.
question = pd.read_csv('/Users/hesu/Documents/KT/riiid/questions.csv')
question.head(10)

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38
5,5,5,2,1,131 149 162 81
6,6,6,2,1,10 94 162 92
7,7,7,0,1,61 110 162 29
8,8,8,3,1,131 13 162 92
9,9,9,3,1,10 164 81


In [439]:
# Attach question metadata to every interaction, then drop content_id, which
# duplicates the merged question_id column.
train_ques = (
    pd.merge(train, question, left_on='content_id', right_on='question_id', how='left')
    .drop('content_id', axis=1)
)
train_ques.head(10)

Unnamed: 0,timestamp,user_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,question_id,bundle_id,correct_answer,part,tags
0,0,115,0,1,,,5692,5692,3,5,151
1,0,7022747,0,0,,,5558,5558,1,5,125
2,0,7023662,0,1,,,4626,4626,2,5,79
3,0,7025965,0,1,,,7900,7900,0,1,131 93 81
4,0,7029547,0,1,,,4449,4449,0,5,156
5,0,579346,0,1,,,7900,7900,0,1,131 93 81
6,0,7039142,0,1,,,5458,5458,1,5,125
7,0,581706,0,1,,,4565,4565,0,5,8
8,0,7042700,0,1,,,7900,7900,0,1,131 93 81
9,0,20042606,0,0,,,7900,7900,0,1,131 93 81


In [440]:
train_ques.tail(10)

Unnamed: 0,timestamp,user_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,question_id,bundle_id,correct_answer,part,tags
980082,76809860397,4508124,0,1,35000.0,True,3016,3015,0,4,157 171 92
980083,76809860397,4508124,0,1,35000.0,True,3015,3015,2,4,136 171 92
980084,76810038254,4508124,0,1,32666.0,True,3068,3066,3,4,113 12 162 38
980085,76810038254,4508124,0,0,32666.0,True,3067,3066,0,4,74 12 162 38
980086,76810038254,4508124,0,1,32666.0,True,3066,3066,1,4,106 12 162 38
980087,78091996556,4508124,0,1,28666.0,True,7398,7396,2,7,97 160 16 35 122
980088,78091996556,4508124,0,0,28666.0,True,7399,7396,0,7,97 160 16 35 122
980089,78091996556,4508124,0,1,28666.0,True,7397,7396,1,7,18 160 16 35 122
980090,78091996556,4508124,0,0,28666.0,True,7396,7396,1,7,39 160 16 35 122
980091,78091996556,4508124,0,1,28666.0,True,7400,7396,1,7,145 160 16 35 122


In [441]:
# Mean of the non-null elapsed times in the training frame; reused below to
# impute missing values in both the training and validation frames.
elapsed_mean = train_ques.prior_question_elapsed_time.mean()

In [442]:
# Impute missing elapsed times with the training mean and missing parts with 4.
# The result is assigned back rather than using fillna(inplace=True) on a
# column selection: that is chained assignment, which pandas deprecates and
# which does not modify the frame under Copy-on-Write.
train_ques['prior_question_elapsed_time'] = train_ques['prior_question_elapsed_time'].fillna(elapsed_mean)
train_ques['part'] = train_ques['part'].fillna(4)

In [443]:
train_ques.loc[:,'prior_question_elapsed_time'].value_counts()

17000.0     50744
16000.0     46949
18000.0     46550
19000.0     39580
15000.0     35889
            ...  
135200.0        1
121750.0        1
119250.0        1
150200.0        1
99333.0         1
Name: prior_question_elapsed_time, Length: 1660, dtype: int64

In [444]:
train_ques.loc[:,'part'].value_counts()

5    403239
2    190731
6    108567
3     82175
4     75997
1     69411
7     49972
Name: part, dtype: int64

In [445]:
import datetime
import time
def convert_time_to_yearMonthDay(timeStamp):
    """Print a millisecond timestamp as ``YYYY-MM-DD HH:MM:SS``.

    Args:
        timeStamp: Milliseconds since the Unix epoch; it is divided by 1000
            before being handed to ``time.localtime``.

    Returns:
        None. The formatted string is only written to stdout.
    """
    seconds = timeStamp / 1000.0
    # time.localtime uses the machine's timezone, so the printed value depends
    # on where the notebook runs.
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(seconds)))

convert_time_to_yearMonthDay(78091996556)

1972-06-23 04:13:16


In [446]:
def get_elapsed_time(ela):
    """Bucket an elapsed time into whole units of 1000, capped at 300.

    Args:
        ela: Elapsed time as a number (presumably milliseconds, making the
            result seconds; confirm against the dataset description).

    Returns:
        int: ``ela // 1000`` as an integer, or 300 when that exceeds 300.
    """
    bucket = ela // 1000
    return 300 if bucket > 300 else int(bucket)

In [447]:
# Replace raw elapsed times with their capped integer buckets.
# Re-running this cell buckets the already-bucketed values again.
train_ques['prior_question_elapsed_time'] = train_ques['prior_question_elapsed_time'].apply(get_elapsed_time)

In [448]:
train_ques.head(10)

Unnamed: 0,timestamp,user_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,question_id,bundle_id,correct_answer,part,tags
0,0,115,0,1,25,,5692,5692,3,5,151
1,0,7022747,0,0,25,,5558,5558,1,5,125
2,0,7023662,0,1,25,,4626,4626,2,5,79
3,0,7025965,0,1,25,,7900,7900,0,1,131 93 81
4,0,7029547,0,1,25,,4449,4449,0,5,156
5,0,579346,0,1,25,,7900,7900,0,1,131 93 81
6,0,7039142,0,1,25,,5458,5458,1,5,125
7,0,581706,0,1,25,,4565,4565,0,5,8
8,0,7042700,0,1,25,,7900,7900,0,1,131 93 81
9,0,20042606,0,0,25,,7900,7900,0,1,131 93 81


In [449]:
# Cast the per-interaction fields to strings so the next cell can concatenate
# them per user with ','.join.
train_ques = train_ques.astype({
    column: str
    for column in ('timestamp',
                   'question_id',
                   'part',
                   'prior_question_elapsed_time',
                   'answered_correctly')
})


In [450]:
# Collapse each user's interactions into one row, with every field stored as a
# comma-separated string in the order the rows appear in train_ques.
sequence_fields = ["question_id",
                   "answered_correctly",
                   "timestamp",
                   "part",
                   "prior_question_elapsed_time"]
train_user = train_ques.groupby('user_id').agg({field: ','.join for field in sequence_fields})

In [451]:
train_user.head(10)

Unnamed: 0_level_0,question_id,answered_correctly,timestamp,part,prior_question_elapsed_time
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
115,"5692,5716,128,7860,7922,156,51,50,7896,7863,15...","1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,0,1,1,0,0,0,...","0,56943,118363,131167,137965,157063,176092,194...","5,5,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...","25,37,55,19,11,5,17,17,16,16,17,22,23,21,24,22..."
124,"7900,7876,175,1278,2064,2065,2063,3363,3364,33...","1,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,...","0,32683,62000,83632,189483,189483,189483,25879...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,26,29,26,18,18,18,33,33,33,21,21,21,21,21,2..."
2746,"5273,758,5976,236,404,382,405,873,531,775,294,...",0001011110110101101,"0,21592,49069,72254,91945,111621,134341,234605...",5252222222222222222,"25,28,17,24,20,16,16,19,18,18,20,13,13,16,15,1..."
5382,"5000,3944,217,5844,5965,4990,5235,6050,5721,55...","1,0,1,0,1,1,1,1,0,0,0,1,0,1,1,1,1,0,1,1,0,0,0,...","0,39828,132189,153727,169080,178049,274437,348...","5,5,2,5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1,1,1,1,1,...","25,24,35,88,18,12,5,92,70,14,79,66,30,27,26,22..."
8623,"3915,4750,6456,3968,6104,5738,6435,5498,6102,4...","1,1,1,1,1,1,1,0,0,1,1,0,0,1,1,0,1,1,1,0,0,1,1,...","0,38769,72859,116541,155537,189115,221413,2399...","5,5,5,5,5,5,5,5,5,5,5,2,2,2,2,2,5,5,5,5,5,5,5,...","25,16,33,30,40,35,30,29,15,19,14,38,21,17,16,2..."
8701,"3901,6671,4963,6143,8279,3964,4002,754,1110,77...",11101000100111011,"0,17833,45872,74561,121601,141679,183773,11482...",55555552222222222,"25,13,15,24,25,44,17,39,16,18,18,18,18,22,16,1..."
12741,"5145,9691,9697,5202,4787,5695,7858,5653,5889,4...","0,1,0,1,1,0,1,0,1,0,0,0,1,1,1,0,1,0,0,1,1,0,0,...","0,22273,54323,92046,109716,132679,158477,18403...","5,5,5,5,5,5,1,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,...","25,13,18,29,35,15,21,23,23,30,23,22,22,22,22,2..."
13134,"3926,564,3865,4231,3684,3988,3968,5219,4447,61...","1,0,0,1,1,0,0,1,1,0,1,1,1,0,1,1,0,1,0,1,0,1,1,...","0,23840,46834,64749,113000,183369,218217,29783...","5,2,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,2,2,2,2,2,2,...","25,22,18,19,13,43,65,31,5,17,22,38,24,11,20,29..."
24418,"7900,7876,175,1278,2063,2065,2064,3363,3364,33...","0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,1,1,...","0,24224,51020,70540,88142,88142,88142,100241,1...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,30,20,24,17,17,17,4,4,4,1,1,1,1,1,1,1,17,5,..."
24600,"7900,7876,175,1278,2063,2065,2064,3365,3363,33...","1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...","0,25379,50137,70181,148601,148601,148601,21935...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,24,23,22,18,18,18,24,24,24,22,22,22,15,15,1..."


In [452]:
train_user.shape

(3824, 5)

In [453]:
type(train_user)

pandas.core.frame.DataFrame

In [454]:
train_user

Unnamed: 0_level_0,question_id,answered_correctly,timestamp,part,prior_question_elapsed_time
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
115,"5692,5716,128,7860,7922,156,51,50,7896,7863,15...","1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,0,1,1,0,0,0,...","0,56943,118363,131167,137965,157063,176092,194...","5,5,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...","25,37,55,19,11,5,17,17,16,16,17,22,23,21,24,22..."
124,"7900,7876,175,1278,2064,2065,2063,3363,3364,33...","1,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,...","0,32683,62000,83632,189483,189483,189483,25879...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,26,29,26,18,18,18,33,33,33,21,21,21,21,21,2..."
2746,"5273,758,5976,236,404,382,405,873,531,775,294,...",0001011110110101101,"0,21592,49069,72254,91945,111621,134341,234605...",5252222222222222222,"25,28,17,24,20,16,16,19,18,18,20,13,13,16,15,1..."
5382,"5000,3944,217,5844,5965,4990,5235,6050,5721,55...","1,0,1,0,1,1,1,1,0,0,0,1,0,1,1,1,1,0,1,1,0,0,0,...","0,39828,132189,153727,169080,178049,274437,348...","5,5,2,5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1,1,1,1,1,...","25,24,35,88,18,12,5,92,70,14,79,66,30,27,26,22..."
8623,"3915,4750,6456,3968,6104,5738,6435,5498,6102,4...","1,1,1,1,1,1,1,0,0,1,1,0,0,1,1,0,1,1,1,0,0,1,1,...","0,38769,72859,116541,155537,189115,221413,2399...","5,5,5,5,5,5,5,5,5,5,5,2,2,2,2,2,5,5,5,5,5,5,5,...","25,16,33,30,40,35,30,29,15,19,14,38,21,17,16,2..."
...,...,...,...,...,...
20913319,"6659,5675,3841,5299,5254,4706,5318,6051,174,78...","0,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1,0,1,0,0,1,1,...","0,13518,35768,64516,86907,111406,130852,367471...","5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...","25,27,10,20,26,20,22,16,12,26,42,24,23,13,26,1..."
20913864,"4790,4422,9200,3644,9418,9805,10405,6659,6286,...",100100001110001000100,"0,29051,50530,60217,79747,98008,121888,158226,...",555555155225555555555,"25,9,21,18,6,16,15,21,33,40,31,16,7,12,22,15,4..."
20938253,"7900,7876,175,1278,2065,2063,2064,3365,3364,33...","0,1,1,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,...","0,4124,115985,130714,149045,149045,149045,1608...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,2,1,4,3,3,3,3,3,3,1,1,1,1,1,1,1,40,43,13,12..."
20948951,"6040,6444,8933,8537,10471,9236,4707,9353,8969,...","0,1,1,0,1,0,1,0,0,1,0,1,0,0,0,0,1,0,1,1,1,0,1,...","0,24764,45950,71359,95527,120065,145390,172145...","5,5,5,5,1,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,...","25,9,18,18,23,21,22,23,24,24,26,14,20,32,12,33..."


In [455]:
# Move user_id from the index back into a regular column.
train_user = train_user.reset_index()

In [456]:
train_user

Unnamed: 0,user_id,question_id,answered_correctly,timestamp,part,prior_question_elapsed_time
0,115,"5692,5716,128,7860,7922,156,51,50,7896,7863,15...","1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,0,1,1,0,0,0,...","0,56943,118363,131167,137965,157063,176092,194...","5,5,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...","25,37,55,19,11,5,17,17,16,16,17,22,23,21,24,22..."
1,124,"7900,7876,175,1278,2064,2065,2063,3363,3364,33...","1,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,...","0,32683,62000,83632,189483,189483,189483,25879...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,26,29,26,18,18,18,33,33,33,21,21,21,21,21,2..."
2,2746,"5273,758,5976,236,404,382,405,873,531,775,294,...",0001011110110101101,"0,21592,49069,72254,91945,111621,134341,234605...",5252222222222222222,"25,28,17,24,20,16,16,19,18,18,20,13,13,16,15,1..."
3,5382,"5000,3944,217,5844,5965,4990,5235,6050,5721,55...","1,0,1,0,1,1,1,1,0,0,0,1,0,1,1,1,1,0,1,1,0,0,0,...","0,39828,132189,153727,169080,178049,274437,348...","5,5,2,5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1,1,1,1,1,...","25,24,35,88,18,12,5,92,70,14,79,66,30,27,26,22..."
4,8623,"3915,4750,6456,3968,6104,5738,6435,5498,6102,4...","1,1,1,1,1,1,1,0,0,1,1,0,0,1,1,0,1,1,1,0,0,1,1,...","0,38769,72859,116541,155537,189115,221413,2399...","5,5,5,5,5,5,5,5,5,5,5,2,2,2,2,2,5,5,5,5,5,5,5,...","25,16,33,30,40,35,30,29,15,19,14,38,21,17,16,2..."
...,...,...,...,...,...,...
3819,20913319,"6659,5675,3841,5299,5254,4706,5318,6051,174,78...","0,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1,0,1,0,0,1,1,...","0,13518,35768,64516,86907,111406,130852,367471...","5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...","25,27,10,20,26,20,22,16,12,26,42,24,23,13,26,1..."
3820,20913864,"4790,4422,9200,3644,9418,9805,10405,6659,6286,...",100100001110001000100,"0,29051,50530,60217,79747,98008,121888,158226,...",555555155225555555555,"25,9,21,18,6,16,15,21,33,40,31,16,7,12,22,15,4..."
3821,20938253,"7900,7876,175,1278,2065,2063,2064,3365,3364,33...","0,1,1,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,...","0,4124,115985,130714,149045,149045,149045,1608...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,2,1,4,3,3,3,3,3,3,1,1,1,1,1,1,1,40,43,13,12..."
3822,20948951,"6040,6444,8933,8537,10471,9236,4707,9353,8969,...","0,1,1,0,1,0,1,0,0,1,0,1,0,0,0,0,1,0,1,1,1,0,1,...","0,24764,45950,71359,95527,120065,145390,172145...","5,5,5,5,1,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,...","25,9,18,18,23,21,22,23,24,24,26,14,20,32,12,33..."


In [457]:
# Mark the aggregated per-user string columns with a '_seq' suffix so they do
# not clash with the per-row columns of the same name in the validation frame.
train_user = train_user.rename(columns={
    name: name + '_seq'
    for name in ('question_id',
                 'answered_correctly',
                 'timestamp',
                 'part',
                 'prior_question_elapsed_time')
})

In [458]:
train_user.head(10)

Unnamed: 0,user_id,question_id_seq,answered_correctly_seq,timestamp_seq,part_seq,prior_question_elapsed_time_seq
0,115,"5692,5716,128,7860,7922,156,51,50,7896,7863,15...","1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,0,1,1,0,0,0,...","0,56943,118363,131167,137965,157063,176092,194...","5,5,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...","25,37,55,19,11,5,17,17,16,16,17,22,23,21,24,22..."
1,124,"7900,7876,175,1278,2064,2065,2063,3363,3364,33...","1,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,...","0,32683,62000,83632,189483,189483,189483,25879...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,26,29,26,18,18,18,33,33,33,21,21,21,21,21,2..."
2,2746,"5273,758,5976,236,404,382,405,873,531,775,294,...",0001011110110101101,"0,21592,49069,72254,91945,111621,134341,234605...",5252222222222222222,"25,28,17,24,20,16,16,19,18,18,20,13,13,16,15,1..."
3,5382,"5000,3944,217,5844,5965,4990,5235,6050,5721,55...","1,0,1,0,1,1,1,1,0,0,0,1,0,1,1,1,1,0,1,1,0,0,0,...","0,39828,132189,153727,169080,178049,274437,348...","5,5,2,5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1,1,1,1,1,...","25,24,35,88,18,12,5,92,70,14,79,66,30,27,26,22..."
4,8623,"3915,4750,6456,3968,6104,5738,6435,5498,6102,4...","1,1,1,1,1,1,1,0,0,1,1,0,0,1,1,0,1,1,1,0,0,1,1,...","0,38769,72859,116541,155537,189115,221413,2399...","5,5,5,5,5,5,5,5,5,5,5,2,2,2,2,2,5,5,5,5,5,5,5,...","25,16,33,30,40,35,30,29,15,19,14,38,21,17,16,2..."
5,8701,"3901,6671,4963,6143,8279,3964,4002,754,1110,77...",11101000100111011,"0,17833,45872,74561,121601,141679,183773,11482...",55555552222222222,"25,13,15,24,25,44,17,39,16,18,18,18,18,22,16,1..."
6,12741,"5145,9691,9697,5202,4787,5695,7858,5653,5889,4...","0,1,0,1,1,0,1,0,1,0,0,0,1,1,1,0,1,0,0,1,1,0,0,...","0,22273,54323,92046,109716,132679,158477,18403...","5,5,5,5,5,5,1,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,...","25,13,18,29,35,15,21,23,23,30,23,22,22,22,22,2..."
7,13134,"3926,564,3865,4231,3684,3988,3968,5219,4447,61...","1,0,0,1,1,0,0,1,1,0,1,1,1,0,1,1,0,1,0,1,0,1,1,...","0,23840,46834,64749,113000,183369,218217,29783...","5,2,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,2,2,2,2,2,2,...","25,22,18,19,13,43,65,31,5,17,22,38,24,11,20,29..."
8,24418,"7900,7876,175,1278,2063,2065,2064,3363,3364,33...","0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,1,1,...","0,24224,51020,70540,88142,88142,88142,100241,1...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,30,20,24,17,17,17,4,4,4,1,1,1,1,1,1,1,17,5,..."
9,24600,"7900,7876,175,1278,2063,2065,2064,3365,3363,33...","1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...","0,25379,50137,70181,148601,148601,148601,21935...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,24,23,22,18,18,18,24,24,24,22,22,22,15,15,1..."


In [459]:
def _encode_sequence(tokens, seq_len, pad_value):
    """Return the last ``seq_len`` tokens as ints, left-padded with ``pad_value``."""
    # int(float(...)) also accepts float-formatted tokens such as '5.0'.
    window = [int(float(token)) for token in tokens[-seq_len:]]
    return [pad_value] * (seq_len - len(window)) + window


def get_data_for_train_encode(train_user, seq_len,
                              que_pad=13523, ans_pad=2, part_pad=8, ela_pad=301):
    """Turn per-user comma-separated histories into fixed-length tensors.

    For every row of ``train_user`` the four ``*_seq`` string columns are split
    on ',', truncated to their last ``seq_len`` entries and left-padded to
    exactly ``seq_len`` entries. The final entry of each history is also
    returned separately as the target.

    NOTE(review): the input window includes the final interaction, which is
    also the target. Confirm the model shifts or masks the answer sequence so
    the target answer is not leaked. The original comments mention 3 as a
    start (BOS) symbol for the decoder's answer input; it is not applied here.

    Args:
        train_user: DataFrame with string columns ``question_id_seq``,
            ``answered_correctly_seq``, ``part_seq`` and
            ``prior_question_elapsed_time_seq``.
        seq_len: Length of every output sequence; must be at least 1.
        que_pad: Padding id for questions (default 13523).
        ans_pad: Padding id for answers (default 2).
        part_pad: Padding id for parts (default 8).
        ela_pad: Padding id for elapsed-time buckets (default 301).

    Returns:
        Tuple of eight ``torch.LongTensor``: question, answer, part and
        elapsed-time sequences, each ``(n_rows, seq_len)``, followed by the
        question, answer, part and elapsed-time targets, each ``(n_rows, 1)``.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
        AssertionError: If a row's four sequences differ in length.
    """
    if seq_len < 1:
        # lst[-0:] is the whole list, so seq_len == 0 would not truncate.
        raise ValueError("seq_len must be a positive integer, got {}".format(seq_len))

    # (column name, padding id), in the order the tensors are returned.
    fields = (
        ('question_id_seq', que_pad),
        ('answered_correctly_seq', ans_pad),
        ('part_seq', part_pad),
        ('prior_question_elapsed_time_seq', ela_pad),
    )
    sequences = [[] for _ in fields]
    targets = [[] for _ in fields]

    for row in train_user.itertuples():
        raw = [getattr(row, column).strip().split(',') for column, _ in fields]
        assert len({len(tokens) for tokens in raw}) == 1, \
            "per-user sequences must all have the same length"

        for slot, (tokens, (_, pad_value)) in enumerate(zip(raw, fields)):
            sequences[slot].append(_encode_sequence(tokens, seq_len, pad_value))
            # The target is the last interaction of the full history.
            targets[slot].append([int(float(tokens[-1]))])

    return tuple(torch.LongTensor(values) for values in sequences + targets)
            
            

In [460]:
class Rii_dataset_train(Dataset):
    """Training dataset: one sample per user, encoded once at construction.

    Each sample is an 8-tuple of LongTensors: question, answer, part and
    elapsed-time input sequences followed by the matching single-element
    targets, as produced by ``get_data_for_train_encode``.
    """

    def __init__(self, train_user, seq_len=100):
        """Encode every user's history up front.

        Args:
            train_user: Per-user DataFrame holding the ``*_seq`` string columns.
            seq_len: Length of each input sequence. Defaults to 100, the value
                that was previously hard-coded.
        """
        self.df = train_user
        (self.ques_seq, self.ans_seq, self.parts_seq, self.ela_seq,
         self.trg_que, self.trg_ans, self.trg_part, self.trg_ela) = \
            get_data_for_train_encode(self.df, seq_len)

    def __len__(self):
        """Number of encoded samples (one per row of ``train_user``)."""
        return len(self.ques_seq)

    def __getitem__(self, index):
        """Return the eight tensors that make up sample ``index``."""
        return (self.ques_seq[index], self.ans_seq[index],
                self.parts_seq[index], self.ela_seq[index],
                self.trg_que[index], self.trg_ans[index],
                self.trg_part[index], self.trg_ela[index])

In [461]:
# Load the validation interactions that serve as the evaluation split.
# NOTE(review): absolute local path; consider a configurable data directory.
test_df = pd.read_csv('/Users/hesu/Documents/KT/riiid/valid.csv')

In [462]:
# Keep only rows whose content_type_id is 0, then apply the same elapsed-time
# treatment as the training frame: impute with the training mean and bucket.
test_df = test_df.loc[test_df['content_type_id'] == 0].reset_index(drop=True)
# Assign back instead of fillna(inplace=True) on a column selection, which is
# chained assignment and does not modify the frame under pandas Copy-on-Write.
test_df['prior_question_elapsed_time'] = (
    test_df['prior_question_elapsed_time']
    .fillna(elapsed_mean)
    .apply(get_elapsed_time)
)

In [463]:
test_df.head(10)

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,10000,2868613211,91216,1219,0,780,1,1,17,True
1,10001,2868700426,91216,1172,0,781,0,1,17,True
2,10002,2868748313,91216,230,0,782,3,1,16,True
3,10003,2874335350,91216,6469,0,764,3,0,15,True
4,10004,2912644354,91216,5250,0,783,2,0,20,True
5,10005,2912756715,91216,8191,0,784,2,0,11,True
6,10006,2912855281,91216,5156,0,785,3,0,30,True
7,10007,2912982177,91216,3641,0,786,2,1,14,True
8,10008,2913096884,91216,4409,0,787,1,0,20,True
9,10010,2913266013,91216,4292,0,790,3,1,14,True


In [464]:
question.head(10)

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38
5,5,5,2,1,131 149 162 81
6,6,6,2,1,10 94 162 92
7,7,7,0,1,61 110 162 29
8,8,8,3,1,131 13 162 92
9,9,9,3,1,10 164 81


In [465]:
# Attach question metadata (part, tags, ...) to each validation row.
test_df = test_df.merge(question, left_on='content_id', right_on='question_id', how='left')


In [466]:
test_df.head(10)

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,question_id,bundle_id,correct_answer,part,tags
0,10000,2868613211,91216,1219,0,780,1,1,17,True,1219,1219,1,2,155 119 92 102
1,10001,2868700426,91216,1172,0,781,0,1,17,True,1172,1172,0,2,155 163 81 29
2,10002,2868748313,91216,230,0,782,3,1,16,True,230,230,3,2,143 176 29 102
3,10003,2874335350,91216,6469,0,764,3,0,15,True,6469,6469,0,5,64
4,10004,2912644354,91216,5250,0,783,2,0,20,True,5250,5250,3,5,170
5,10005,2912756715,91216,8191,0,784,2,0,11,True,8191,8191,0,5,1
6,10006,2912855281,91216,5156,0,785,3,0,30,True,5156,5156,1,5,108
7,10007,2912982177,91216,3641,0,786,2,1,14,True,3641,3641,2,5,180
8,10008,2913096884,91216,4409,0,787,1,0,20,True,4409,4409,2,5,168
9,10010,2913266013,91216,4292,0,790,3,1,14,True,4292,4292,3,5,168


In [467]:
# Attach each user's training history; users not present in train_user get
# NaN in the *_seq columns because this is a left join.
test_df = test_df.merge(train_user, on='user_id', how='left')


In [468]:
test_df.head(10)

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,question_id,bundle_id,correct_answer,part,tags,question_id_seq,answered_correctly_seq,timestamp_seq,part_seq,prior_question_elapsed_time_seq
0,10000,2868613211,91216,1219,0,780,1,1,17,True,1219,1219,1,2,155 119 92 102,"7900,7876,175,1278,2065,2064,2063,3364,3363,33...","1,0,0,1,1,1,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,...","0,23540,48745,67665,161655,161655,161655,26479...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,20,20,23,17,17,17,29,29,29,32,32,32,30,30,3..."
1,10001,2868700426,91216,1172,0,781,0,1,17,True,1172,1172,0,2,155 163 81 29,"7900,7876,175,1278,2065,2064,2063,3364,3363,33...","1,0,0,1,1,1,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,...","0,23540,48745,67665,161655,161655,161655,26479...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,20,20,23,17,17,17,29,29,29,32,32,32,30,30,3..."
2,10002,2868748313,91216,230,0,782,3,1,16,True,230,230,3,2,143 176 29 102,"7900,7876,175,1278,2065,2064,2063,3364,3363,33...","1,0,0,1,1,1,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,...","0,23540,48745,67665,161655,161655,161655,26479...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,20,20,23,17,17,17,29,29,29,32,32,32,30,30,3..."
3,10003,2874335350,91216,6469,0,764,3,0,15,True,6469,6469,0,5,64,"7900,7876,175,1278,2065,2064,2063,3364,3363,33...","1,0,0,1,1,1,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,...","0,23540,48745,67665,161655,161655,161655,26479...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,20,20,23,17,17,17,29,29,29,32,32,32,30,30,3..."
4,10004,2912644354,91216,5250,0,783,2,0,20,True,5250,5250,3,5,170,"7900,7876,175,1278,2065,2064,2063,3364,3363,33...","1,0,0,1,1,1,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,...","0,23540,48745,67665,161655,161655,161655,26479...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,20,20,23,17,17,17,29,29,29,32,32,32,30,30,3..."
5,10005,2912756715,91216,8191,0,784,2,0,11,True,8191,8191,0,5,1,"7900,7876,175,1278,2065,2064,2063,3364,3363,33...","1,0,0,1,1,1,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,...","0,23540,48745,67665,161655,161655,161655,26479...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,20,20,23,17,17,17,29,29,29,32,32,32,30,30,3..."
6,10006,2912855281,91216,5156,0,785,3,0,30,True,5156,5156,1,5,108,"7900,7876,175,1278,2065,2064,2063,3364,3363,33...","1,0,0,1,1,1,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,...","0,23540,48745,67665,161655,161655,161655,26479...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,20,20,23,17,17,17,29,29,29,32,32,32,30,30,3..."
7,10007,2912982177,91216,3641,0,786,2,1,14,True,3641,3641,2,5,180,"7900,7876,175,1278,2065,2064,2063,3364,3363,33...","1,0,0,1,1,1,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,...","0,23540,48745,67665,161655,161655,161655,26479...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,20,20,23,17,17,17,29,29,29,32,32,32,30,30,3..."
8,10008,2913096884,91216,4409,0,787,1,0,20,True,4409,4409,2,5,168,"7900,7876,175,1278,2065,2064,2063,3364,3363,33...","1,0,0,1,1,1,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,...","0,23540,48745,67665,161655,161655,161655,26479...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,20,20,23,17,17,17,29,29,29,32,32,32,30,30,3..."
9,10010,2913266013,91216,4292,0,790,3,1,14,True,4292,4292,3,5,168,"7900,7876,175,1278,2065,2064,2063,3364,3363,33...","1,0,0,1,1,1,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,...","0,23540,48745,67665,161655,161655,161655,26479...","1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,6,...","25,20,20,23,17,17,17,29,29,29,32,32,32,30,30,3..."


In [469]:
test_df.dtypes

row_id                              int64
timestamp                           int64
user_id                             int64
content_id                          int64
content_type_id                     int64
task_container_id                   int64
user_answer                         int64
answered_correctly                  int64
prior_question_elapsed_time         int64
prior_question_had_explanation     object
question_id                         int64
bundle_id                           int64
correct_answer                      int64
part                                int64
tags                               object
question_id_seq                    object
answered_correctly_seq             object
timestamp_seq                      object
part_seq                           object
prior_question_elapsed_time_seq    object
dtype: object

In [470]:
# Rows without a training history have NaN in the *_seq columns after the left
# merge. Give them a one-token history made of the padding id for that field.
# The result is assigned back rather than using fillna(inplace=True) on a
# column selection: that is chained assignment, which pandas deprecates and
# which does not modify the frame under Copy-on-Write.
seq_fill_tokens = {
    'question_id_seq': '13523',
    'answered_correctly_seq': '2',
    'part_seq': '8',
    'prior_question_elapsed_time_seq': '301',
}
for seq_column, fill_token in seq_fill_tokens.items():
    test_df[seq_column] = test_df[seq_column].fillna(fill_token).astype('str')



In [471]:
def pad_np(nums, pad_index, seq_size=100):
    """Left-pad or truncate a 1-D array to exactly ``seq_size`` entries.

    Args:
        nums: 1-D NumPy array of ids.
        pad_index: Value used to fill the missing leading positions.
        seq_size: Target length. Defaults to 100, the value that was
            previously hard-coded.

    Returns:
        numpy.ndarray of length ``seq_size``: the last ``seq_size`` entries of
        ``nums`` when it is longer, otherwise ``nums`` preceded by
        ``pad_index`` values.
    """
    if nums.size == 0:
        # An empty history is all padding. This branch previously returned
        # zeros, which collide with real ids (0 is a valid question id and
        # answer value).
        return np.full(seq_size, pad_index)

    if nums.size > seq_size:
        # Keep only the most recent seq_size entries.
        return nums[-seq_size:]

    pad_counts = seq_size - len(nums)
    # (pad_counts, 0): add pad_counts values on the left and none on the right.
    # constant_values=(pad_index, 0): the left fill is pad_index; the right
    # value is unused because the right width is 0.
    return np.pad(nums, (pad_counts, 0), 'constant', constant_values=(pad_index, 0))



def pad_seq(df):
    """Parse the comma-separated history columns and add padded model inputs.

    For each ``*_seq`` string column the text is split on ',' and stored back
    as an ``int16`` array. A new ``*_input`` column then holds that array
    passed through ``pad_np`` with the field's padding id.

    The frame is modified in place and also returned. Calling this twice on
    the same frame is not supported, because the ``*_seq`` columns no longer
    hold comma-separated text after the first call.

    Args:
        df: DataFrame with columns ``content_id``, ``question_id_seq``,
            ``answered_correctly_seq``, ``part_seq`` and
            ``prior_question_elapsed_time_seq``.

    Returns:
        The same DataFrame with the added columns ``question_id_seq_input``,
        ``answered_correctly_input``, ``part_seq_input`` and
        ``prior_question_elapsed_time_seq_input``.
    """
    # Kept from the original: re-store content_id from a plain NumPy array.
    df['content_id'] = np.array(df['content_id'])

    # (history column, padded output column, padding id)
    seq_specs = (
        ('question_id_seq', 'question_id_seq_input', 13523),
        ('answered_correctly_seq', 'answered_correctly_input', 2),
        ('part_seq', 'part_seq_input', 8),
        ('prior_question_elapsed_time_seq', 'prior_question_elapsed_time_seq_input', 301),
    )
    for seq_col, input_col, pad_value in seq_specs:
        # Parse through float64 first so float-formatted tokens such as '5.0'
        # are accepted, matching int(float(e)) in the training encoder.
        df[seq_col] = df[seq_col].astype('str').apply(
            lambda text: np.array(text.split(',')).astype(np.float64).astype(np.int16))
        # Column-wise apply avoids building a full row Series for every record.
        df[input_col] = df[seq_col].apply(lambda arr: pad_np(arr, pad_value))

    return df
    

In [472]:
test_df['question_id_seq']

0        7900,7876,175,1278,2065,2064,2063,3364,3363,33...
1        7900,7876,175,1278,2065,2064,2063,3364,3363,33...
2        7900,7876,175,1278,2065,2064,2063,3364,3363,33...
3        7900,7876,175,1278,2065,2064,2063,3364,3363,33...
4        7900,7876,175,1278,2065,2064,2063,3364,3363,33...
                               ...                        
19646                                                13523
19647                                                13523
19648                                                13523
19649                                                13523
19650                                                13523
Name: question_id_seq, Length: 19651, dtype: object

In [473]:
test_df = pad_seq(test_df)

In [474]:
test_df.head(2)

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,...,tags,question_id_seq,answered_correctly_seq,timestamp_seq,part_seq,prior_question_elapsed_time_seq,question_id_seq_input,answered_correctly_input,part_seq_input,prior_question_elapsed_time_seq_input
0,10000,2868613211,91216,1219,0,780,1,1,17,True,...,155 119 92 102,"[7900, 7876, 175, 1278, 2065, 2064, 2063, 3364...","[1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, ...","0,23540,48745,67665,161655,161655,161655,26479...","[1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, ...","[25, 20, 20, 23, 17, 17, 17, 29, 29, 29, 32, 3...","[5660, 6113, 5266, 8940, 3618, 4322, 4687, 867...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, ...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...","[15, 15, 7, 36, 30, 19, 11, 16, 18, 15, 9, 5, ..."
1,10001,2868700426,91216,1172,0,781,0,1,17,True,...,155 163 81 29,"[7900, 7876, 175, 1278, 2065, 2064, 2063, 3364...","[1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, ...","0,23540,48745,67665,161655,161655,161655,26479...","[1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, ...","[25, 20, 20, 23, 17, 17, 17, 29, 29, 29, 32, 3...","[5660, 6113, 5266, 8940, 3618, 4322, 4687, 867...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, ...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...","[15, 15, 7, 36, 30, 19, 11, 16, 18, 15, 9, 5, ..."


In [475]:
test_df.columns

Index(['row_id', 'timestamp', 'user_id', 'content_id', 'content_type_id',
       'task_container_id', 'user_answer', 'answered_correctly',
       'prior_question_elapsed_time', 'prior_question_had_explanation',
       'question_id', 'bundle_id', 'correct_answer', 'part', 'tags',
       'question_id_seq', 'answered_correctly_seq', 'timestamp_seq',
       'part_seq', 'prior_question_elapsed_time_seq', 'question_id_seq_input',
       'answered_correctly_input', 'part_seq_input',
       'prior_question_elapsed_time_seq_input'],
      dtype='object')

In [476]:
class Rii_dataset_test(Dataset):
    """Dataset yielding one test interaction together with its history tensors.

    Every row of ``test_user`` must hold NumPy arrays in the ``*_input``
    columns (they go through ``torch.from_numpy``) and scalar values in
    ``content_id``, ``answered_correctly``, ``part`` and
    ``prior_question_elapsed_time``.

    Args:
        test_user: DataFrame with one row per interaction to predict.
    """

    def __init__(self, test_user):
        # __getitem__ is called with positions 0..len-1, but rows are looked up
        # by label via ``.at``.  Renumbering the index keeps the two in sync
        # even when the caller passes a filtered or sliced frame.
        self.df = test_user.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(ques_seq, ans_seq, parts_seq, ela_seq, trg_que, trg_ans, trg_part, trg_ela)``.

        The first four tensors are the history sequences; the last four are
        one-element LongTensors describing the interaction to predict.
        """
        ques_seq = torch.from_numpy(self.df.at[index, 'question_id_seq_input']).long()
        ans_seq = torch.from_numpy(self.df.at[index, 'answered_correctly_input']).long()
        parts_seq = torch.from_numpy(self.df.at[index, 'part_seq_input']).long()
        ela_seq = torch.from_numpy(self.df.at[index, 'prior_question_elapsed_time_seq_input']).long()

        trg_que = torch.LongTensor([self.df.at[index,'content_id']])
        trg_ans = torch.LongTensor([self.df.at[index,'answered_correctly']])
        trg_part = torch.LongTensor([self.df.at[index,'part']])
        trg_ela = torch.LongTensor([self.df.at[index,'prior_question_elapsed_time']])

        return ques_seq, ans_seq, parts_seq, ela_seq, trg_que, trg_ans, trg_part, trg_ela

## Model

In [477]:
class Encoder(nn.Module):
    """Transformer encoder over an interaction history with a target read-out.

    The history (question, part, elapsed-time and answer ids) is embedded,
    summed, and passed through ``n_layers`` ``EncoderLayer`` blocks.  The
    target question (question, part and elapsed-time ids, no answer) is
    embedded, projected, and used as the query of a final attention over the
    encoded history.  The pooled result is mapped to two logits.

    Args:
        que_num: size of the question embedding table.
        part_num: size of the part embedding table.
        ela_num: size of the elapsed-time embedding table.
        hid_dim: embedding / hidden width.
        n_layers: number of stacked ``EncoderLayer`` blocks.
        n_heads: attention heads per layer.
        pf_dim: inner width of the position-wise feed-forward layers.
        dropout: dropout probability.
        device: device on which position ids and the scale factor are created.
        max_length: number of learned positions (longest supported history).
    """

    def __init__(self,
                 que_num,
                 part_num,
                 ela_num,
                 hid_dim,
                 n_layers,
                 n_heads,
                 pf_dim,
                 dropout,
                 device,
                 max_length = 100):
        super().__init__()

        self.device = device

        self.que_embedding = nn.Embedding(que_num, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        self.part_embedding = nn.Embedding(part_num, hid_dim)
        self.ela_embedding = nn.Embedding(ela_num, hid_dim)
        # Three answer ids are embedded (the notebook uses 2 as the padding label).
        self.ans_embedding = nn.Embedding(3, hid_dim)

        self.layers = nn.ModuleList([EncoderLayer(hid_dim,
                                                  n_heads,
                                                  pf_dim,
                                                  dropout,
                                                  device)
                                     for _ in range(n_layers)])

        self.dropout = nn.Dropout(dropout)
        self.output_layer = nn.Linear(hid_dim, 2)
        self.trg_linear = nn.Linear(hid_dim, hid_dim)

        self.output_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        # Kept so the module's attributes stay the same; forward() pools with a
        # length-agnostic mean instead (see below).
        self.avgpool = nn.AvgPool1d(max_length)

        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)


    def forward(self, src_que,src_ans,src_part,src_ela,src_mask, trg_que, trg_part, trg_ela,trg_src_mask):
        """Score the target interaction given the encoded history.

        Args:
            src_que, src_ans, src_part, src_ela: [batch size, src len] id tensors.
            src_mask: mask for the encoder self-attention, broadcastable to
                [batch size, n heads, src len, src len].
            trg_que, trg_part, trg_ela: [batch size, trg len] id tensors
                (trg len is 1 in this notebook).
            trg_src_mask: mask for the target-to-history attention,
                broadcastable against [batch size, n heads, trg len, src len].

        Returns:
            Logits of shape [batch size, 1, 2].
        """
        batch_size = src_que.shape[0]
        src_len = src_que.shape[1]

        # pos = [batch size, src len]; every row is 0, 1, ..., src_len - 1.
        # unsqueeze(0) adds the batch axis that repeat() then fills.
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)

        que_emb = self.que_embedding(src_que)
        part_emb = self.part_embedding(src_part)
        ela_emb = self.ela_embedding(src_ela)
        ans_emb = self.ans_embedding(src_ans)
        tok_emb = que_emb+part_emb+ela_emb+ans_emb

        src = self.dropout((tok_emb * self.scale) + self.pos_embedding(pos))

        #src = [batch size, src len, hid dim]

        for layer in self.layers:
            src = layer(src, src_mask)

        encoder_output = src

        trg_que_emb = self.que_embedding(trg_que)
        trg_part_emb = self.part_embedding(trg_part)
        trg_ela_emb = self.ela_embedding(trg_ela)

        trg_emb = trg_que_emb+trg_part_emb+trg_ela_emb
        trg_linear = self.trg_linear(trg_emb)

        #trg_linear = [batch size, trg len, hid dim]

        attention_output, _ = self.output_attention(trg_linear, encoder_output, encoder_output, trg_src_mask)

        # The number of rows in attention_output depends on the mask: a mask
        # with src len query rows broadcasts the single target query up to
        # [batch size, src len, hid dim], while a [batch, 1, 1, src len] mask
        # leaves it at [batch size, trg len, hid dim].  AvgPool1d(max_length)
        # only accepts exactly max_length rows; a mean over the row axis gives
        # the same value in that case and works for any other row count too.
        output_pool = attention_output.mean(dim=1, keepdim=True)

        #output_pool = [batch size, 1, hid dim]

        output = self.output_layer(output_pool)

        return output
        

In [478]:
class EncoderLayer(nn.Module):
    """One encoder block: masked self-attention, then a position-wise feed-forward net.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.
    """

    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()

        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def _add_and_norm(self, residual, sublayer_out, norm):
        """Apply dropout to the sub-layer output, add the residual, normalise."""
        return norm(residual + self.dropout(sublayer_out))

    def forward(self, src, src_mask):
        """Transform ``src`` ([batch size, src len, hid dim]) without changing its shape.

        ``src_mask`` is handed to the attention layer unchanged; it must
        broadcast against [batch size, n heads, src len, src len].
        """
        # Self-attention sub-layer (the attention weights are not needed here).
        attended, _ = self.self_attention(src, src, src, src_mask)
        after_attention = self._add_and_norm(src, attended, self.self_attn_layer_norm)

        # Feed-forward sub-layer.
        transformed = self.positionwise_feedforward(after_attention)
        return self._add_and_norm(after_attention, transformed, self.ff_layer_norm)

In [479]:
class MultiHeadAttentionLayer(nn.Module):
    """Scaled dot-product attention computed in ``n_heads`` parallel heads.

    ``hid_dim`` must be divisible by ``n_heads``; each head works on
    ``hid_dim // n_heads`` features.
    """

    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()

        assert hid_dim % n_heads == 0

        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads

        # Input projections for queries, keys and values, then the output projection.
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)

        self.fc_o = nn.Linear(hid_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

        # sqrt(head_dim), used to scale the dot products.
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

    def _split_heads(self, projected, batch_size):
        """[batch, len, hid dim] -> [batch, n heads, len, head dim]."""
        per_head = projected.view(batch_size, -1, self.n_heads, self.head_dim)
        return per_head.permute(0, 2, 1, 3)

    def forward(self, query, key, value, mask = None):
        """Attend from ``query`` over ``key`` / ``value``.

        Args:
            query: [batch size, query len, hid dim]
            key:   [batch size, key len, hid dim]
            value: [batch size, value len, hid dim]
            mask: optional tensor broadcastable against
                [batch size, n heads, query len, key len]; positions where it
                equals 0 are excluded from the softmax.

        Returns:
            ``(x, attention)`` where ``x`` is the projected attention result
            and ``attention`` holds the per-head softmax weights.
        """
        batch_size = query.shape[0]

        Q = self._split_heads(self.fc_q(query), batch_size)
        K = self._split_heads(self.fc_k(key), batch_size)
        V = self._split_heads(self.fc_v(value), batch_size)

        # energy = [batch size, n heads, query len, key len]
        energy = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

        if mask is not None:
            # A large negative score drives the softmax weight to ~0.
            energy = energy.masked_fill(mask == 0, -1e10)

        attention = energy.softmax(dim=-1)

        # weighted = [batch size, n heads, query len, head dim]
        weighted = torch.matmul(self.dropout(attention), V)

        # Merge the heads back: [batch size, query len, hid dim]
        merged = weighted.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.hid_dim)

        return self.fc_o(merged), attention

In [480]:
class PositionwiseFeedforwardLayer(nn.Module):
    """Two-layer feed-forward net applied independently at every sequence position.

    Computes ``fc_2(dropout(relu(fc_1(x))))``; the inner width is ``pf_dim``
    and the output width equals the input width ``hid_dim``.
    """

    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()

        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape [batch size, seq len, hid dim] to a tensor of the same shape."""
        # Expand to pf_dim features and apply the non-linearity.
        hidden = self.fc_1(x).relu()
        # Regularise, then project back down to hid_dim.
        return self.fc_2(self.dropout(hidden))

# Seq2Seq

In [481]:
class Seq2Seq(nn.Module):
    """Builds the attention masks for a history batch and runs the encoder.

    Args:
        encoder: module called as
            ``encoder(src_que, src_ans, src_part, src_ela, src_mask,
            trg_que, trg_part, trg_ela, trg_src_mask)``.
        src_pad_idx: question id that marks padding in ``src_que``.
        device: device on which the causal mask is created.
    """

    def __init__(self,
                 encoder,
                 src_pad_idx,
                 device):
        super().__init__()

        self.encoder = encoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = src_pad_idx
        self.device = device

    def make_trg_src_mask(self, src):
        """Padding mask used when the target attends over the history.

        Args:
            src: [batch size, src len] question ids.

        Returns:
            Bool tensor [batch size, 1, 1, src len]; True where ``src`` is not padding.
        """
        trg_src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

        #trg_src_mask = [batch size, 1, 1, src len]

        return trg_src_mask

    def make_src_mask(self, src):
        """Causal + padding mask for the encoder self-attention.

        Position i may attend to position j only if j <= i (lower-triangular
        matrix) and j is not padding.

        Args:
            src: [batch size, src len] question ids.

        Returns:
            Bool tensor [batch size, 1, src len, src len].
        """
        src_pad_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

        #src_pad_mask = [batch size, 1, 1, src len]

        src_len = src.shape[1]

        src_sub_mask = torch.tril(torch.ones((src_len, src_len), device = self.device)).bool()

        #src_sub_mask = [src len, src len]

        src_mask = src_pad_mask & src_sub_mask

        #src_mask = [batch size, 1, src len, src len]

        return src_mask

    def forward(self, src_que,src_ans,src_part,src_ela,trg_que, trg_part,trg_ela):
        """Return the encoder's logits for the target interaction.

        Args:
            src_que, src_ans, src_part, src_ela: [batch size, src len] history ids.
            trg_que, trg_part, trg_ela: [batch size, trg len] target ids.
        """
        src_len = src_que.shape[1]

        src_mask = self.make_src_mask(src_que)

        # The target comes after the whole history, so it may look at every
        # non-padding position.  Previously the causal mask was reused here,
        # which made the single target query attend over src len different
        # history prefixes instead of over the full history.
        #
        # The encoder pools the target attention over src len rows (the
        # notebook output shows that result as [batch size, src len, hid dim]),
        # so the padding mask is expanded to that many identical rows to keep
        # the shapes the encoder receives unchanged.
        # NOTE(review): if the encoder stops pooling over src len rows, the
        # un-expanded [batch size, 1, 1, src len] mask is sufficient.
        trg_src_mask = self.make_trg_src_mask(src_que).expand(-1, -1, src_len, -1)

        #src_mask = [batch size, 1, src len, src len]
        #trg_src_mask = [batch size, 1, src len, src len] (all rows identical)

        output = self.encoder(src_que,src_ans,src_part,src_ela,src_mask, trg_que, trg_part, trg_ela,trg_src_mask)

        #output = [batch size, 1, 2] in this notebook (one pair of logits per sample)
        return output

In [482]:
# Sizes of the embedding tables.
que_num = 13524
ans_num = 3
part_num = 9
ela_num = 302

# Encoder hyper-parameters.
HID_DIM = 256
ENC_LAYERS = 3
ENC_HEADS = 8
ENC_PF_DIM = 512
ENC_DROPOUT = 0.1

# Use the first GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

enc = Encoder(
    que_num=que_num,
    part_num=part_num,
    ela_num=ela_num,
    hid_dim=HID_DIM,
    n_layers=ENC_LAYERS,
    n_heads=ENC_HEADS,
    pf_dim=ENC_PF_DIM,
    dropout=ENC_DROPOUT,
    device=device,
)

In [483]:
# Question id that marks padding in the history sequences; it is passed to
# Seq2Seq as src_pad_idx and equals que_num - 1 (the last embedding row).
src_pad_que_idx = 13523
# Answer label that the loss skips (passed to CrossEntropyLoss as ignore_index).
trg_pad_ans_idx = 2

In [484]:
# Wrap the encoder with mask construction and move all parameters to `device`.
model = Seq2Seq(encoder=enc, src_pad_idx=src_pad_que_idx, device=device).to(device)

In [485]:
def count_parameters(model):
    """Return the number of scalar values in ``model``'s trainable parameters."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Report the model size with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 5,478,914 trainable parameters


In [486]:
def initialize_weights(m):
    """Xavier-uniform initialise ``m.weight`` when it is a matrix.

    Intended for ``Module.apply``.  Modules without a ``weight``, with a
    ``weight`` of ``None`` (e.g. ``LayerNorm(elementwise_affine=False)``) or
    with a one-dimensional ``weight`` are left untouched.
    """
    # hasattr() alone is not enough: some modules register `weight` as None,
    # and None has no .dim().
    weight = getattr(m, 'weight', None)
    if weight is not None and weight.dim() > 1:
        nn.init.xavier_uniform_(weight.data)

In [487]:
# Re-initialise every weight matrix in the model (apply() visits each sub-module).
model.apply(initialize_weights)

Seq2Seq(
  (encoder): Encoder(
    (que_embedding): Embedding(13524, 256)
    (pos_embedding): Embedding(100, 256)
    (part_embedding): Embedding(9, 256)
    (ela_embedding): Embedding(302, 256)
    (ans_embedding): Embedding(3, 256)
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (ff_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (self_attention): MultiHeadAttentionLayer(
          (fc_q): Linear(in_features=256, out_features=256, bias=True)
          (fc_k): Linear(in_features=256, out_features=256, bias=True)
          (fc_v): Linear(in_features=256, out_features=256, bias=True)
          (fc_o): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (positionwise_feedforward): PositionwiseFeedforwardLayer(
          (fc_1): Linear(in_features=256, out_features=512, bias=True)
          (fc_2)

In [488]:
# Step size for Adam.
LEARNING_RATE = 5e-4

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [489]:
# Cross-entropy over the two answer classes; targets equal to the padding label are skipped.
criterion = nn.CrossEntropyLoss(ignore_index = trg_pad_ans_idx)


## Train

In [490]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over ``iterator``.

    Note: this definition rebinds the name ``train``, which an earlier cell
    uses for the raw interactions DataFrame.

    Args:
        model: called as ``model(src_que, src_ans, src_part, src_ela,
            trg_que, trg_part, trg_ela)``; in this notebook it returns logits
            of shape [batch size, 1, 2].
        iterator: sized iterable of batches, each unpacking to
            ``(src_que, src_ans, src_part, src_ela, trg_que, trg_ans, trg_part, trg_ela)``.
        optimizer: optimiser over ``model.parameters()``.
        criterion: loss taking ``(logits [N, C], targets [N])``.
        clip: maximum gradient norm.

    Returns:
        ``(mean loss per batch, accuracy)`` for the pass.
    """
    model.train()

    epoch_loss = 0
    total_num = 0
    right_num = 0

    # Wrap the loader itself (not enumerate(...)) so tqdm can read its length
    # and draw a real progress bar.
    for batch in tqdm(iterator):

        batch = tuple(t.to(device) for t in batch)

        src_que, src_ans, src_part, src_ela, trg_que, trg_ans, trg_part, trg_ela = batch

        optimizer.zero_grad()

        output = model(src_que, src_ans, src_part, src_ela, trg_que, trg_part, trg_ela)

        #output = [batch size, 1, output dim] -> [batch size, output dim]
        output = output.squeeze(1)
        output_dim = output.shape[-1]

        # contiguous() returns a tensor with contiguous memory, which view() needs.
        output = output.contiguous().view(-1, output_dim)
        trg_ans = trg_ans.contiguous().view(-1)

        #output = [batch size, output dim]
        #trg_ans = [batch size]

        loss = criterion(output, trg_ans)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

        # Softmax is monotonic, so the largest logit is already the predicted class.
        preds_ind = output.detach().argmax(dim=1)
        right_num += (preds_ind == trg_ans).sum().item()
        total_num += len(trg_ans)

    return epoch_loss / len(iterator),right_num / total_num

In [491]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating any weights.

    Args:
        model: called as ``model(src_que, src_ans, src_part, src_ela,
            trg_que, trg_part, trg_ela)``.
        iterator: sized iterable of batches, each unpacking to
            ``(src_que, src_ans, src_part, src_ela, trg_que, trg_ans, trg_part, trg_ela)``.
        criterion: loss taking ``(logits [N, C], targets [N])``.

    Returns:
        ``(mean loss per batch, accuracy)``.
    """
    model.eval()

    loss_sum = 0
    n_seen = 0
    n_correct = 0

    with torch.no_grad():
        for batch in iterator:
            (src_que, src_ans, src_part, src_ela,
             trg_que, trg_ans, trg_part, trg_ela) = tuple(t.to(device) for t in batch)

            # Drop the length-1 target axis: [batch, 1, C] -> [batch, C]
            logits = model(src_que, src_ans, src_part, src_ela, trg_que, trg_part, trg_ela).squeeze(1)
            probs = F.softmax(logits, dim=-1)

            # contiguous() returns a tensor with contiguous memory, which view() needs.
            flat_logits = logits.contiguous().view(-1, logits.shape[-1])
            labels = trg_ans.contiguous().view(-1)

            loss_sum += criterion(flat_logits, labels).item()

            predicted = torch.max(probs, dim=1)[1]
            n_correct += (predicted == labels).sum().item()
            n_seen += len(labels)

    return loss_sum / len(iterator), n_correct / n_seen

In [492]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [493]:

# Small slices (1000 training rows, 100 test rows) keep this trial run short.
train_dataset = Rii_dataset_train(train_user.head(1000))
train_dataloader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)

# Evaluation order is kept fixed.
test_dataset = Rii_dataset_test(test_df.head(100))
test_dataloader = DataLoader(dataset=test_dataset, batch_size=2, shuffle=False)

In [494]:
N_EPOCHS = 10
CLIP = 1  # maximum gradient norm passed to train()

best_valid_loss = float('inf')


for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss, acc_train = train(model, train_dataloader, optimizer, criterion, CLIP)
    test_loss, acc_test = evaluate(model, test_dataloader, criterion)

    end_time = time.time()

    # start_time / end_time and best_valid_loss were previously computed but
    # never used: report how long the epoch took and keep the best test loss.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if test_loss < best_valid_loss:
        best_valid_loss = test_loss

    print("At epoch-{}\tEpoch time: {}m {}s\tBest test loss so far:{}".format(epoch, epoch_mins, epoch_secs, best_valid_loss))
    print("At epoch-{}\tThe training loss is:{}\nTrain accuracy is:{}".format(epoch, train_loss, acc_train))
    print("At epoch-{}\tThe test loss is:{}\nTest accuracy is:{}".format(epoch, test_loss, acc_test))

1it [00:00,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


3it [00:00,  6.27it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


5it [00:00,  6.36it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


7it [00:01,  6.31it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


9it [00:01,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


11it [00:01,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


13it [00:02,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


15it [00:02,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


17it [00:02,  6.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


19it [00:02,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


21it [00:03,  6.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


23it [00:03,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


25it [00:03,  6.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


27it [00:04,  6.70it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


29it [00:04,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


31it [00:04,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


33it [00:05,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


35it [00:05,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


37it [00:05,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


39it [00:05,  6.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


41it [00:06,  6.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


43it [00:06,  6.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


45it [00:06,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


47it [00:07,  6.30it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


49it [00:07,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


51it [00:07,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


53it [00:08,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


55it [00:08,  6.14it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


57it [00:08,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


59it [00:09,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


61it [00:09,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


63it [00:09,  6.21it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


65it [00:10,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


66it [00:10,  5.93it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


68it [00:10,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


70it [00:10,  6.10it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


72it [00:11,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


74it [00:11,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


76it [00:11,  6.14it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


78it [00:12,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


80it [00:12,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


82it [00:12,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


83it [00:13,  5.89it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


85it [00:13,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


87it [00:13,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


89it [00:14,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


91it [00:14,  5.86it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


93it [00:14,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


95it [00:15,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


97it [00:15,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


99it [00:15,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


101it [00:16,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


102it [00:16,  5.77it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


104it [00:16,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


106it [00:16,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


108it [00:17,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


110it [00:17,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


112it [00:17,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


114it [00:18,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


116it [00:18,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


118it [00:18,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


120it [00:19,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


122it [00:19,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


124it [00:19,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


125it [00:20,  5.90it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


127it [00:20,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


129it [00:20,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


131it [00:21,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


133it [00:21,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


135it [00:21,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


137it [00:22,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


139it [00:22,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


141it [00:22,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


143it [00:22,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


145it [00:23,  6.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


147it [00:23,  6.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


149it [00:23,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


151it [00:24,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


153it [00:24,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


155it [00:24,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


157it [00:25,  6.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


159it [00:25,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


161it [00:25,  6.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


163it [00:25,  6.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


165it [00:26,  6.70it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


167it [00:26,  6.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


169it [00:26,  6.74it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


171it [00:27,  6.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


173it [00:27,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


175it [00:27,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


177it [00:28,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


179it [00:28,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


181it [00:28,  6.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


183it [00:28,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


185it [00:29,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


187it [00:29,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


189it [00:29,  6.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


191it [00:30,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


193it [00:30,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


195it [00:30,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


197it [00:31,  6.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


199it [00:31,  6.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


201it [00:31,  6.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


203it [00:31,  6.70it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


205it [00:32,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


207it [00:32,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


209it [00:32,  6.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


211it [00:33,  6.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


213it [00:33,  6.72it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


215it [00:33,  6.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


217it [00:34,  6.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


219it [00:34,  6.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


221it [00:34,  6.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


223it [00:34,  6.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


225it [00:35,  6.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


227it [00:35,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


229it [00:35,  6.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


231it [00:36,  6.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


233it [00:36,  6.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


235it [00:36,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


237it [00:37,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


239it [00:37,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


241it [00:37,  6.31it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


243it [00:38,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


245it [00:38,  6.22it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


247it [00:38,  6.23it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


249it [00:39,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


251it [00:39,  6.21it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


253it [00:39,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


255it [00:40,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


256it [00:40,  5.95it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


258it [00:40,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


260it [00:40,  5.87it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


261it [00:41,  5.93it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


263it [00:41,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


265it [00:41,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


267it [00:41,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


269it [00:42,  6.21it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


271it [00:42,  6.27it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


273it [00:42,  6.23it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


275it [00:43,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


277it [00:43,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


279it [00:43,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


281it [00:44,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


283it [00:44,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


285it [00:44,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


286it [00:45,  5.98it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


288it [00:45,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


290it [00:45,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


292it [00:46,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


293it [00:46,  5.87it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


295it [00:46,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


297it [00:46,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


299it [00:47,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


301it [00:47,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


302it [00:47,  6.02it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


304it [00:48,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


306it [00:48,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


308it [00:48,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


310it [00:49,  6.14it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


312it [00:49,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


313it [00:49,  6.07it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


315it [00:49,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


317it [00:50,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


319it [00:50,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


321it [00:50,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


323it [00:51,  6.25it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


325it [00:51,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


327it [00:51,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


329it [00:52,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


331it [00:52,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


333it [00:52,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


335it [00:52,  6.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


337it [00:53,  6.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


339it [00:53,  6.70it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


341it [00:53,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


343it [00:54,  6.72it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


345it [00:54,  6.75it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


347it [00:54,  6.72it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


349it [00:55,  6.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


351it [00:55,  6.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


353it [00:55,  6.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


355it [00:55,  6.74it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


357it [00:56,  6.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


359it [00:56,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


361it [00:56,  6.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


363it [00:57,  6.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


365it [00:57,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


367it [00:57,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


369it [00:58,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


371it [00:58,  6.72it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


373it [00:58,  6.72it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


375it [00:59,  6.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


377it [00:59,  6.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


379it [00:59,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


381it [00:59,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


383it [01:00,  6.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


385it [01:00,  6.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


387it [01:00,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


389it [01:01,  6.72it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


391it [01:01,  6.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


393it [01:01,  6.76it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


395it [01:02,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


397it [01:02,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


399it [01:02,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


401it [01:02,  6.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


403it [01:03,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


405it [01:03,  6.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


407it [01:03,  6.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


409it [01:04,  6.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


411it [01:04,  6.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


413it [01:04,  6.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


415it [01:05,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


417it [01:05,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


419it [01:05,  6.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


421it [01:05,  6.36it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


423it [01:06,  6.30it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


425it [01:06,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


427it [01:06,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


429it [01:07,  6.29it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


431it [01:07,  6.29it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


433it [01:07,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


435it [01:08,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


437it [01:08,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


439it [01:08,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


441it [01:09,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


443it [01:09,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


445it [01:09,  6.27it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


447it [01:10,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


449it [01:10,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


451it [01:10,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


453it [01:11,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


455it [01:11,  6.14it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


457it [01:11,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


459it [01:12,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


461it [01:12,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


463it [01:12,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


465it [01:13,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


467it [01:13,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


468it [01:13,  5.98it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


470it [01:13,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


472it [01:14,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


474it [01:14,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


476it [01:14,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


478it [01:15,  6.14it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


480it [01:15,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


482it [01:15,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


484it [01:16,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


486it [01:16,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


488it [01:16,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


490it [01:17,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


492it [01:17,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


494it [01:17,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


496it [01:18,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


498it [01:18,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


500it [01:18,  6.34it/s]


src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attent

0it [00:00, ?it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.

2it [00:00,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


4it [00:00,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


6it [00:00,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


8it [00:01,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


10it [00:01,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


12it [00:01,  6.31it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


14it [00:02,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


16it [00:02,  6.31it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


18it [00:02,  6.29it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


20it [00:03,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


22it [00:03,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


24it [00:03,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


26it [00:04,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


28it [00:04,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


30it [00:04,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


32it [00:05,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


34it [00:05,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


36it [00:05,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


38it [00:05,  6.72it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


40it [00:06,  6.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


42it [00:06,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


44it [00:06,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


46it [00:07,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


48it [00:07,  6.70it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


50it [00:07,  6.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


52it [00:08,  6.75it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


54it [00:08,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


56it [00:08,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


58it [00:08,  6.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


60it [00:09,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


62it [00:09,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


64it [00:09,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


66it [00:10,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


68it [00:10,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


70it [00:10,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


72it [00:11,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


74it [00:11,  6.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


76it [00:11,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


78it [00:11,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


80it [00:12,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


82it [00:12,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


84it [00:12,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


86it [00:13,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


88it [00:13,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


90it [00:13,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


92it [00:14,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


94it [00:14,  6.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


96it [00:14,  6.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


98it [00:15,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


100it [00:15,  6.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


102it [00:15,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


104it [00:15,  6.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


106it [00:16,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


108it [00:16,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


110it [00:16,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


112it [00:17,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


114it [00:17,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


116it [00:17,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


118it [00:18,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


120it [00:18,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


122it [00:18,  6.14it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


124it [00:19,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


126it [00:19,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


128it [00:19,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


130it [00:20,  6.14it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


132it [00:20,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


134it [00:20,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


136it [00:21,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


138it [00:21,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


140it [00:21,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


142it [00:22,  6.14it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


144it [00:22,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


146it [00:22,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


148it [00:23,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


150it [00:23,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


152it [00:23,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


154it [00:24,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


156it [00:24,  5.86it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


158it [00:24,  5.81it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


159it [00:24,  5.81it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


161it [00:25,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


163it [00:25,  5.92it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


165it [00:25,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


167it [00:26,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


169it [00:26,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


171it [00:26,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


173it [00:27,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


174it [00:27,  5.72it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


175it [00:27,  5.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


176it [00:27,  5.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


177it [00:28,  4.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


178it [00:28,  4.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


179it [00:28,  4.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


181it [00:28,  4.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


182it [00:29,  4.71it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


183it [00:29,  4.96it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


184it [00:29,  5.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


185it [00:29,  4.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


187it [00:30,  5.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


189it [00:30,  4.87it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


190it [00:30,  4.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


191it [00:31,  4.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


192it [00:31,  4.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


193it [00:31,  4.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


194it [00:31,  4.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


195it [00:32,  4.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


196it [00:32,  4.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


198it [00:32,  4.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


199it [00:32,  4.80it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


200it [00:33,  4.68it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


201it [00:33,  4.90it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


203it [00:33,  5.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


205it [00:33,  5.70it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


207it [00:34,  5.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


208it [00:34,  5.52it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


210it [00:34,  5.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


212it [00:35,  5.81it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


214it [00:35,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


215it [00:35,  5.73it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


217it [00:36,  5.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


218it [00:36,  5.55it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


220it [00:36,  5.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


221it [00:36,  5.42it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


223it [00:37,  5.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


224it [00:37,  5.73it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


226it [00:37,  5.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


227it [00:37,  5.68it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


229it [00:38,  5.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


230it [00:38,  5.43it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


232it [00:38,  5.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


233it [00:38,  5.48it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


235it [00:39,  5.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


236it [00:39,  5.68it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


238it [00:39,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


240it [00:40,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


242it [00:40,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


243it [00:40,  5.96it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


245it [00:40,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


247it [00:41,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


249it [00:41,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


251it [00:41,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


253it [00:42,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


254it [00:42,  5.83it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


256it [00:42,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


258it [00:43,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


260it [00:43,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


262it [00:43,  6.29it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


264it [00:44,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


266it [00:44,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


267it [00:44,  5.75it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


269it [00:44,  5.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


270it [00:45,  5.54it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


272it [00:45,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


274it [00:45,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


275it [00:45,  5.80it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


277it [00:46,  5.82it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


279it [00:46,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


281it [00:46,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


283it [00:47,  6.21it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


285it [00:47,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


287it [00:47,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


289it [00:48,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


291it [00:48,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


293it [00:48,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


295it [00:49,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


297it [00:49,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


298it [00:49,  5.81it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


300it [00:50,  5.74it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


301it [00:50,  5.79it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


303it [00:50,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


304it [00:50,  5.85it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


306it [00:51,  5.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


307it [00:51,  5.59it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


309it [00:51,  5.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


311it [00:52,  5.75it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


313it [00:52,  5.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


314it [00:52,  5.75it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


316it [00:52,  5.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


317it [00:53,  5.70it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


319it [00:53,  5.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


320it [00:53,  5.59it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


322it [00:54,  5.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


324it [00:54,  5.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


325it [00:54,  5.52it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


327it [00:54,  5.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


328it [00:55,  5.61it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


330it [00:55,  5.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


331it [00:55,  5.69it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


333it [00:55,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


335it [00:56,  5.76it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


336it [00:56,  5.73it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


338it [00:56,  5.70it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


339it [00:56,  5.71it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


341it [00:57,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


343it [00:57,  5.82it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


345it [00:58,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


347it [00:58,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


348it [00:58,  5.75it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


350it [00:58,  5.81it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


352it [00:59,  5.83it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


354it [00:59,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


356it [00:59,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


358it [01:00,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


360it [01:00,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


362it [01:00,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


364it [01:01,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


366it [01:01,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


368it [01:01,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


369it [01:02,  5.93it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


371it [01:02,  6.10it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


373it [01:02,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


375it [01:03,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


377it [01:03,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


379it [01:03,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


381it [01:04,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


382it [01:04,  5.88it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


384it [01:04,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


386it [01:04,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


388it [01:05,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


390it [01:05,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


391it [01:05,  5.85it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


393it [01:06,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


395it [01:06,  6.14it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


397it [01:06,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


399it [01:07,  6.25it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


401it [01:07,  6.25it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


403it [01:07,  6.29it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


405it [01:08,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


407it [01:08,  5.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


409it [01:08,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


411it [01:09,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


413it [01:09,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


415it [01:09,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


417it [01:10,  6.23it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


419it [01:10,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


421it [01:10,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


423it [01:10,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


425it [01:11,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


427it [01:11,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


429it [01:11,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


431it [01:12,  6.10it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


433it [01:12,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


435it [01:12,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


437it [01:13,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


439it [01:13,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


441it [01:13,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


443it [01:14,  5.70it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


444it [01:14,  5.51it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


446it [01:14,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


448it [01:15,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


450it [01:15,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


452it [01:15,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


454it [01:16,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


456it [01:16,  6.22it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


458it [01:16,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


460it [01:17,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


462it [01:17,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


464it [01:17,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


466it [01:18,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


468it [01:18,  5.79it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


469it [01:18,  5.78it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


471it [01:19,  5.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


472it [01:19,  5.42it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


474it [01:19,  5.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


475it [01:19,  5.40it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


477it [01:20,  5.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


478it [01:20,  5.53it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


480it [01:20,  5.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


481it [01:20,  5.64it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


483it [01:21,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


485it [01:21,  5.74it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


486it [01:21,  5.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


488it [01:22,  5.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


490it [01:22,  5.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


491it [01:22,  5.42it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


493it [01:23,  5.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


494it [01:23,  5.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


495it [01:23,  4.96it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


496it [01:23,  4.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


498it [01:24,  4.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


499it [01:24,  4.91it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


500it [01:24,  5.92it/s]


encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.

0it [00:00, ?it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.

2it [00:00,  5.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


3it [00:00,  5.60it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


5it [00:00,  5.70it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


7it [00:01,  5.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


8it [00:01,  5.67it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


10it [00:01,  5.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


11it [00:01,  5.58it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


13it [00:02,  5.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


14it [00:02,  5.57it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


16it [00:02,  5.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


17it [00:03,  5.62it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


19it [00:03,  5.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


20it [00:03,  5.58it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


22it [00:03,  5.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


23it [00:04,  5.53it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


25it [00:04,  5.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


27it [00:04,  5.25it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


28it [00:05,  5.40it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


30it [00:05,  5.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


31it [00:05,  5.50it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


33it [00:05,  5.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


34it [00:06,  5.62it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


36it [00:06,  5.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


37it [00:06,  5.56it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


39it [00:07,  5.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


40it [00:07,  5.57it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


42it [00:07,  5.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


43it [00:07,  5.67it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


45it [00:08,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


47it [00:08,  5.87it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


49it [00:08,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


51it [00:09,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


53it [00:09,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


55it [00:09,  5.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


56it [00:09,  5.65it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


58it [00:10,  5.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


59it [00:10,  5.72it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


61it [00:10,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


63it [00:11,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


65it [00:11,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


67it [00:11,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


68it [00:11,  5.87it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


70it [00:12,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


72it [00:12,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


74it [00:13,  5.74it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


75it [00:13,  5.75it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


77it [00:13,  5.82it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


79it [00:13,  5.86it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


81it [00:14,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


83it [00:14,  6.30it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


85it [00:14,  6.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


87it [00:15,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


89it [00:15,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


91it [00:15,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


93it [00:16,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


95it [00:16,  6.27it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


97it [00:16,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


99it [00:16,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


101it [00:17,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


102it [00:17,  6.11it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


104it [00:17,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


105it [00:18,  5.97it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


107it [00:18,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


109it [00:18,  6.33it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


111it [00:18,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


113it [00:19,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


115it [00:19,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


117it [00:19,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


119it [00:20,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


121it [00:20,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


123it [00:20,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


125it [00:21,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


127it [00:21,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


129it [00:21,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


131it [00:22,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


132it [00:22,  6.06it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


134it [00:22,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


135it [00:22,  5.69it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


137it [00:23,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


139it [00:23,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


141it [00:23,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


143it [00:24,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


144it [00:24,  5.80it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


145it [00:24,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


146it [00:24,  5.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


148it [00:25,  5.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


149it [00:25,  5.41it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


151it [00:25,  5.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


153it [00:25,  5.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


155it [00:26,  5.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


156it [00:26,  5.61it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


158it [00:26,  5.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


159it [00:26,  5.66it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


161it [00:27,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


163it [00:27,  5.83it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


165it [00:28,  5.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


166it [00:28,  5.68it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


168it [00:28,  5.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


169it [00:28,  5.61it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


171it [00:29,  5.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


172it [00:29,  5.60it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


174it [00:29,  5.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


175it [00:29,  5.65it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


177it [00:30,  5.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


179it [00:30,  5.79it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


181it [00:30,  5.79it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


182it [00:30,  5.81it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


184it [00:31,  5.75it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


185it [00:31,  5.71it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


187it [00:31,  5.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


188it [00:32,  5.74it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


190it [00:32,  5.82it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


191it [00:32,  5.83it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


193it [00:32,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


195it [00:33,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


196it [00:33,  5.74it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


198it [00:33,  5.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


199it [00:33,  5.57it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


201it [00:34,  5.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


202it [00:34,  5.51it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


204it [00:34,  5.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


206it [00:35,  5.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


207it [00:35,  5.38it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


208it [00:35,  5.15it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


209it [00:35,  5.14it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


211it [00:36,  5.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


213it [00:36,  5.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


215it [00:36,  5.81it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


217it [00:37,  5.92it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


219it [00:37,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


221it [00:37,  5.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


223it [00:38,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


225it [00:38,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


227it [00:38,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


229it [00:39,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


231it [00:39,  6.23it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


233it [00:39,  6.33it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


235it [00:40,  6.36it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


237it [00:40,  6.36it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


239it [00:40,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


241it [00:41,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


243it [00:41,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


244it [00:41,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


246it [00:41,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


248it [00:42,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


250it [00:42,  6.21it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


252it [00:42,  6.25it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


254it [00:43,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


256it [00:43,  6.33it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


258it [00:43,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


260it [00:44,  6.25it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


262it [00:44,  6.31it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


264it [00:44,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


266it [00:45,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


268it [00:45,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


270it [00:45,  5.82it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


271it [00:45,  5.94it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


273it [00:46,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


275it [00:46,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


277it [00:46,  6.31it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


279it [00:47,  6.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


281it [00:47,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


283it [00:47,  6.23it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


285it [00:48,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


287it [00:48,  6.27it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


289it [00:48,  6.21it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


291it [00:49,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


293it [00:49,  6.33it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


295it [00:49,  6.33it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


297it [00:50,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


299it [00:50,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


301it [00:50,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


303it [00:50,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


305it [00:51,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


307it [00:51,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


309it [00:51,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


311it [00:52,  6.29it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


313it [00:52,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


314it [00:52,  5.95it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


316it [00:53,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


318it [00:53,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


320it [00:53,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


322it [00:54,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


324it [00:54,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


326it [00:54,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


328it [00:55,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


330it [00:55,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


331it [00:55,  5.95it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


333it [00:55,  5.83it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


334it [00:56,  5.89it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


336it [00:56,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


338it [00:56,  5.87it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


339it [00:56,  5.85it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


341it [00:57,  5.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


342it [00:57,  5.72it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


344it [00:57,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


346it [00:58,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


347it [00:58,  5.73it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


349it [00:58,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


351it [00:58,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


352it [00:59,  5.77it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


354it [00:59,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


355it [00:59,  5.79it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


357it [01:00,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


359it [01:00,  5.92it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


361it [01:00,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


362it [01:00,  5.79it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


364it [01:01,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


366it [01:01,  5.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


367it [01:01,  5.78it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


369it [01:02,  5.87it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


371it [01:02,  5.92it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


373it [01:02,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


374it [01:02,  5.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


376it [01:03,  5.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


377it [01:03,  5.50it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


379it [01:03,  5.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


381it [01:04,  5.87it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


383it [01:04,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


385it [01:04,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


387it [01:05,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


388it [01:05,  5.78it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


390it [01:05,  5.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


391it [01:05,  5.50it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


393it [01:06,  5.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


394it [01:06,  5.52it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


396it [01:06,  5.74it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


398it [01:07,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


400it [01:07,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


401it [01:07,  5.79it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


403it [01:08,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


405it [01:08,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


407it [01:08,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


409it [01:09,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


411it [01:09,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


413it [01:09,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


415it [01:10,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


417it [01:10,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


419it [01:10,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


421it [01:10,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


423it [01:11,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


425it [01:11,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


427it [01:11,  6.27it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


429it [01:12,  6.30it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


431it [01:12,  6.22it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


433it [01:12,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


435it [01:13,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


437it [01:13,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


439it [01:13,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


441it [01:14,  6.29it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


443it [01:14,  5.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


444it [01:14,  5.56it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


446it [01:15,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


448it [01:15,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


450it [01:15,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


452it [01:16,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


454it [01:16,  6.23it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


456it [01:16,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


457it [01:16,  5.82it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


459it [01:17,  5.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


460it [01:17,  5.87it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


462it [01:17,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


464it [01:18,  6.10it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


466it [01:18,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


468it [01:18,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


470it [01:18,  6.28it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


472it [01:19,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


474it [01:19,  6.25it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


476it [01:19,  6.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


478it [01:20,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


480it [01:20,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


482it [01:20,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


484it [01:21,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


486it [01:21,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


488it [01:21,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


490it [01:22,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


492it [01:22,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


494it [01:22,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


496it [01:23,  6.21it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


498it [01:23,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


500it [01:23,  5.97it/s]


src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attent

0it [00:00, ?it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.

2it [00:00,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


4it [00:00,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


6it [00:00,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


8it [00:01,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


10it [00:01,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


12it [00:01,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


14it [00:02,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


16it [00:02,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


18it [00:02,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


20it [00:03,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


22it [00:03,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


23it [00:03,  5.84it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


25it [00:04,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


26it [00:04,  5.75it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


28it [00:04,  5.75it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


29it [00:04,  5.73it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


31it [00:05,  5.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


32it [00:05,  5.78it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


34it [00:05,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


36it [00:06,  5.75it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


37it [00:06,  5.75it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


39it [00:06,  5.76it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


41it [00:06,  5.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


42it [00:07,  5.77it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


44it [00:07,  5.86it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


46it [00:07,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


47it [00:08,  5.81it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


49it [00:08,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


50it [00:08,  5.66it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


52it [00:08,  5.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


53it [00:09,  5.60it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


55it [00:09,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


57it [00:09,  5.82it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


59it [00:10,  5.92it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


61it [00:10,  5.92it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


63it [00:10,  5.86it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


64it [00:10,  5.84it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


66it [00:11,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


68it [00:11,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


70it [00:11,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


72it [00:12,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


74it [00:12,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


76it [00:12,  6.25it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


78it [00:13,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


80it [00:13,  6.27it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


82it [00:13,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


84it [00:14,  6.36it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


86it [00:14,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


88it [00:14,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


90it [00:15,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


92it [00:15,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


94it [00:15,  6.37it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


96it [00:16,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


98it [00:16,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


100it [00:16,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


102it [00:16,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


104it [00:17,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


106it [00:17,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


108it [00:17,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


110it [00:18,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


112it [00:18,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


114it [00:18,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


116it [00:19,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


118it [00:19,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


120it [00:19,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


122it [00:20,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


124it [00:20,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


126it [00:20,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


128it [00:20,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


130it [00:21,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


132it [00:21,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


134it [00:21,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


136it [00:22,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


138it [00:22,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


140it [00:22,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


142it [00:23,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


144it [00:23,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


146it [00:23,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


148it [00:24,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


150it [00:24,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


152it [00:24,  6.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


154it [00:24,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


156it [00:25,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


158it [00:25,  6.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


160it [00:25,  6.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


162it [00:26,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


164it [00:26,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


166it [00:26,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


168it [00:27,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


170it [00:27,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


172it [00:27,  6.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


174it [00:28,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


176it [00:28,  6.29it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


178it [00:28,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


180it [00:28,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


182it [00:29,  6.21it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


184it [00:29,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


186it [00:29,  5.95it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


188it [00:30,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


190it [00:30,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


192it [00:30,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


193it [00:31,  5.84it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


195it [00:31,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


197it [00:31,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


199it [00:32,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


201it [00:32,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


203it [00:32,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


205it [00:33,  6.14it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


207it [00:33,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


209it [00:33,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


211it [00:34,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


213it [00:34,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


215it [00:34,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


217it [00:35,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


219it [00:35,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


221it [00:35,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


223it [00:36,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


225it [00:36,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


227it [00:36,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


229it [00:37,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


231it [00:37,  6.10it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


233it [00:37,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


235it [00:38,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


237it [00:38,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


239it [00:38,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


241it [00:39,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


243it [00:39,  5.95it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


245it [00:39,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


247it [00:40,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


249it [00:40,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


251it [00:40,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


253it [00:41,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


255it [00:41,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


257it [00:41,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


259it [00:42,  6.27it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


261it [00:42,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


263it [00:42,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


265it [00:43,  6.27it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


267it [00:43,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


269it [00:43,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


271it [00:43,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


273it [00:44,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


275it [00:44,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


277it [00:44,  6.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


279it [00:45,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


281it [00:45,  6.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


283it [00:45,  6.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


285it [00:46,  6.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


287it [00:46,  6.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


289it [00:46,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


291it [00:46,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


293it [00:47,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


295it [00:47,  6.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


297it [00:47,  6.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


299it [00:48,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


301it [00:48,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


303it [00:48,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


305it [00:49,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


307it [00:49,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


309it [00:49,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


311it [00:50,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


313it [00:50,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


315it [00:50,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


317it [00:50,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


319it [00:51,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


321it [00:51,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


323it [00:51,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


325it [00:52,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


327it [00:52,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


329it [00:52,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


331it [00:53,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


333it [00:53,  6.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


335it [00:53,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


337it [00:54,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


339it [00:54,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


341it [00:54,  6.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


343it [00:54,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


345it [00:55,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


347it [00:55,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


349it [00:55,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


351it [00:56,  6.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


353it [00:56,  6.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


355it [00:56,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


357it [00:57,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


359it [00:57,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


361it [00:57,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


363it [00:58,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


365it [00:58,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


367it [00:58,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


369it [00:59,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


371it [00:59,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


372it [00:59,  5.89it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


374it [00:59,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


375it [01:00,  5.73it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


377it [01:00,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


379it [01:00,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


380it [01:00,  5.81it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


382it [01:01,  5.76it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


384it [01:01,  5.79it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


385it [01:01,  5.81it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


387it [01:02,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


389it [01:02,  5.87it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


390it [01:02,  5.79it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


392it [01:03,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


394it [01:03,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


396it [01:03,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


398it [01:04,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


399it [01:04,  5.87it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


401it [01:04,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


403it [01:04,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


405it [01:05,  5.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


406it [01:05,  5.68it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


408it [01:05,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


410it [01:06,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


412it [01:06,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


414it [01:06,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


415it [01:06,  5.78it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


417it [01:07,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


419it [01:07,  5.81it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


421it [01:07,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


423it [01:08,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


425it [01:08,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


426it [01:08,  5.75it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


428it [01:09,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


430it [01:09,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


432it [01:09,  5.83it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


434it [01:10,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


435it [01:10,  5.78it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


437it [01:10,  5.87it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


439it [01:11,  5.87it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


441it [01:11,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


443it [01:11,  5.95it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


445it [01:12,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


447it [01:12,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


449it [01:12,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


451it [01:13,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


453it [01:13,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


455it [01:13,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


457it [01:13,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


459it [01:14,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


461it [01:14,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


463it [01:14,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


465it [01:15,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


467it [01:15,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


469it [01:15,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


471it [01:16,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


473it [01:16,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


475it [01:16,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


477it [01:17,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


479it [01:17,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


481it [01:17,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


483it [01:17,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


485it [01:18,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


487it [01:18,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


489it [01:18,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


491it [01:19,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


493it [01:19,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


495it [01:19,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


497it [01:20,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


499it [01:20,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


500it [01:20,  6.21it/s]


encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.

0it [00:00, ?it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.

2it [00:00,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


4it [00:00,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


6it [00:00,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


8it [00:01,  6.37it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


10it [00:01,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


12it [00:01,  6.33it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


14it [00:02,  6.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


16it [00:02,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


18it [00:02,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


20it [00:03,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


22it [00:03,  6.36it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


24it [00:03,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


26it [00:04,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


28it [00:04,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


30it [00:04,  6.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


32it [00:04,  6.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


34it [00:05,  6.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


36it [00:05,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


38it [00:05,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


40it [00:06,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


42it [00:06,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


44it [00:06,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


46it [00:07,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


48it [00:07,  6.36it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


50it [00:07,  6.29it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


52it [00:08,  6.23it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


54it [00:08,  6.22it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


56it [00:08,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


58it [00:09,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


60it [00:09,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


62it [00:09,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


64it [00:10,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


65it [00:10,  6.03it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


67it [00:10,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


69it [00:10,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


71it [00:11,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


73it [00:11,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


75it [00:11,  6.14it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


77it [00:12,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


78it [00:12,  5.96it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


80it [00:12,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


82it [00:13,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


84it [00:13,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


85it [00:13,  5.86it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


87it [00:13,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


89it [00:14,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


91it [00:14,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


93it [00:14,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


95it [00:15,  5.95it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


97it [00:15,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


99it [00:15,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


101it [00:16,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


102it [00:16,  5.84it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


104it [00:16,  5.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


105it [00:16,  5.69it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


107it [00:17,  5.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


108it [00:17,  5.66it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


110it [00:17,  5.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


111it [00:18,  5.67it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


113it [00:18,  5.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


114it [00:18,  5.73it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


116it [00:18,  5.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


117it [00:19,  5.81it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


119it [00:19,  5.75it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


120it [00:19,  5.72it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


122it [00:19,  5.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


123it [00:20,  5.62it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


125it [00:20,  5.75it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


127it [00:20,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


128it [00:20,  5.76it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


130it [00:21,  5.83it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


132it [00:21,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


134it [00:21,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


136it [00:22,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


138it [00:22,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


140it [00:22,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


142it [00:23,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


144it [00:23,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


146it [00:23,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


148it [00:24,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


150it [00:24,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


152it [00:24,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


154it [00:25,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


156it [00:25,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


158it [00:25,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


160it [00:26,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


162it [00:26,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


164it [00:26,  6.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


166it [00:26,  6.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


168it [00:27,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


170it [00:27,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


172it [00:27,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


174it [00:28,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


176it [00:28,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


178it [00:28,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


180it [00:29,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


182it [00:29,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


184it [00:29,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


186it [00:30,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


188it [00:30,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


190it [00:30,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


192it [00:30,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


194it [00:31,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


196it [00:31,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


198it [00:31,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


200it [00:32,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


202it [00:32,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


204it [00:32,  6.36it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


206it [00:33,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


208it [00:33,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


210it [00:33,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


212it [00:34,  6.37it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


214it [00:34,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


216it [00:34,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


218it [00:34,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


220it [00:35,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


222it [00:35,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


224it [00:35,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


226it [00:36,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


228it [00:36,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


230it [00:36,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


232it [00:37,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


234it [00:37,  6.30it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


236it [00:37,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


238it [00:38,  6.23it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


240it [00:38,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


242it [00:38,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


243it [00:38,  5.90it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


245it [00:39,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


247it [00:39,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


249it [00:39,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


250it [00:40,  5.85it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


252it [00:40,  5.95it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


254it [00:40,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


256it [00:41,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


258it [00:41,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


260it [00:41,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


262it [00:42,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])

263it [00:42,  5.85it/s]


The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


265it [00:42,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


267it [00:42,  5.86it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


269it [00:43,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


270it [00:43,  5.86it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


272it [00:43,  5.82it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


273it [00:43,  5.83it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


275it [00:44,  5.79it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


276it [00:44,  5.76it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


278it [00:44,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


280it [00:45,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


282it [00:45,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


284it [00:45,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


286it [00:46,  5.79it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


287it [00:46,  5.80it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


289it [00:46,  5.76it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


290it [00:46,  5.72it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


292it [00:47,  5.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


293it [00:47,  5.67it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


295it [00:47,  5.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


296it [00:47,  5.68it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


298it [00:48,  5.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


300it [00:48,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


302it [00:49,  5.82it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


304it [00:49,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


306it [00:49,  5.81it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


307it [00:49,  5.79it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


309it [00:50,  5.83it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


311it [00:50,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


313it [00:50,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


315it [00:51,  6.10it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


317it [00:51,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


319it [00:51,  6.23it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


321it [00:52,  6.30it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


323it [00:52,  6.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


325it [00:52,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


327it [00:53,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


329it [00:53,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


331it [00:53,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


333it [00:54,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


335it [00:54,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


337it [00:54,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


339it [00:54,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


341it [00:55,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


343it [00:55,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


345it [00:55,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


347it [00:56,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


349it [00:56,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


351it [00:56,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


353it [00:57,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


355it [00:57,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


357it [00:57,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


359it [00:57,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


361it [00:58,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


363it [00:58,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


365it [00:58,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


367it [00:59,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


369it [00:59,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


371it [00:59,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


373it [01:00,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


375it [01:00,  6.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


377it [01:00,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


379it [01:01,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


381it [01:01,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


383it [01:01,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


385it [01:02,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


387it [01:02,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


389it [01:02,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


391it [01:02,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


393it [01:03,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


395it [01:03,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


397it [01:03,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


399it [01:04,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


401it [01:04,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


403it [01:04,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


405it [01:05,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


407it [01:05,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


409it [01:05,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


411it [01:06,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


413it [01:06,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


415it [01:06,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


417it [01:06,  6.33it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


419it [01:07,  6.21it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


421it [01:07,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


423it [01:07,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


425it [01:08,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


427it [01:08,  6.10it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


429it [01:08,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


431it [01:09,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


433it [01:09,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


435it [01:09,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


436it [01:10,  5.97it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


438it [01:10,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


440it [01:10,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


442it [01:11,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


444it [01:11,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


446it [01:11,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


448it [01:12,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


450it [01:12,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


452it [01:12,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


454it [01:13,  5.92it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


456it [01:13,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


458it [01:13,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


460it [01:14,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


462it [01:14,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


464it [01:14,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


466it [01:15,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


468it [01:15,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


470it [01:15,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


472it [01:16,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


474it [01:16,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


476it [01:16,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


478it [01:17,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


480it [01:17,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


482it [01:17,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


484it [01:18,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


486it [01:18,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


488it [01:18,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


490it [01:19,  5.95it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


492it [01:19,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


494it [01:19,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


496it [01:20,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


498it [01:20,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


500it [01:20,  6.18it/s]


src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attent

0it [00:00, ?it/s]

The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torc

2it [00:00,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


4it [00:00,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


6it [00:00,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


8it [00:01,  6.37it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


10it [00:01,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


12it [00:01,  6.33it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


14it [00:02,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


16it [00:02,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


18it [00:02,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


20it [00:03,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


22it [00:03,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


24it [00:03,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


26it [00:04,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


28it [00:04,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


30it [00:04,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


32it [00:04,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


34it [00:05,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


36it [00:05,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


38it [00:05,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


40it [00:06,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


42it [00:06,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


44it [00:06,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


46it [00:07,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


48it [00:07,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


50it [00:07,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


52it [00:08,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


54it [00:08,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


56it [00:08,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


58it [00:08,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


60it [00:09,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


62it [00:09,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


64it [00:09,  6.37it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


66it [00:10,  6.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


68it [00:10,  6.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


70it [00:10,  6.28it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


72it [00:11,  6.25it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


74it [00:11,  6.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


76it [00:11,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


78it [00:12,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


80it [00:12,  6.37it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


82it [00:12,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


84it [00:13,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


86it [00:13,  6.36it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


88it [00:13,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


90it [00:14,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


92it [00:14,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


94it [00:14,  6.33it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


96it [00:14,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


98it [00:15,  6.28it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


99it [00:15,  6.07it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


101it [00:15,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


103it [00:16,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


105it [00:16,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


107it [00:16,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


108it [00:16,  5.74it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


110it [00:17,  5.86it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


112it [00:17,  5.86it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


114it [00:17,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


116it [00:18,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


118it [00:18,  5.95it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


119it [00:18,  6.00it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


121it [00:19,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


123it [00:19,  6.10it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


125it [00:19,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


127it [00:20,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


128it [00:20,  5.96it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


130it [00:20,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


132it [00:20,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


134it [00:21,  5.95it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


136it [00:21,  5.95it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


138it [00:21,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


140it [00:22,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


142it [00:22,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


143it [00:22,  5.89it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


145it [00:23,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


147it [00:23,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


149it [00:23,  5.95it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


151it [00:24,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


153it [00:24,  5.86it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


155it [00:24,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


157it [00:25,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


159it [00:25,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


161it [00:25,  5.95it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


163it [00:26,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


165it [00:26,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


167it [00:26,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


169it [00:27,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


171it [00:27,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


173it [00:27,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


175it [00:28,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


177it [00:28,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


179it [00:28,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


181it [00:29,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


183it [00:29,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


185it [00:29,  6.28it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


187it [00:30,  6.30it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


189it [00:30,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


191it [00:30,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


193it [00:31,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


195it [00:31,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


197it [00:31,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


199it [00:31,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


201it [00:32,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


203it [00:32,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


205it [00:32,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


207it [00:33,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


209it [00:33,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


211it [00:33,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


213it [00:34,  6.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


215it [00:34,  6.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


217it [00:34,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


219it [00:35,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


221it [00:35,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


223it [00:35,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


225it [00:35,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


227it [00:36,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


229it [00:36,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


231it [00:36,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


233it [00:37,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


235it [00:37,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


237it [00:37,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


239it [00:38,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


241it [00:38,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


243it [00:38,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


245it [00:38,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


247it [00:39,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


249it [00:39,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


251it [00:39,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


253it [00:40,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


255it [00:40,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


257it [00:40,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


259it [00:41,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


261it [00:41,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


263it [00:41,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


265it [00:41,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


267it [00:42,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


269it [00:42,  6.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


271it [00:42,  6.74it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


273it [00:43,  6.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


275it [00:43,  6.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


277it [00:43,  6.74it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


279it [00:44,  6.76it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


281it [00:44,  6.74it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


283it [00:44,  6.76it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


285it [00:44,  6.75it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


287it [00:45,  6.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


289it [00:45,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


291it [00:45,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


293it [00:46,  6.36it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


295it [00:46,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


297it [00:46,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


299it [00:47,  6.29it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


301it [00:47,  6.22it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


303it [00:47,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


305it [00:48,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


307it [00:48,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


309it [00:48,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


310it [00:49,  5.88it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


312it [00:49,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


314it [00:49,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


316it [00:50,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


318it [00:50,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


320it [00:50,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


322it [00:51,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


324it [00:51,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


326it [00:51,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


328it [00:52,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


330it [00:52,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


332it [00:52,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


334it [00:52,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


336it [00:53,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


338it [00:53,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


340it [00:53,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


342it [00:54,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


344it [00:54,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


346it [00:54,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


348it [00:55,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


350it [00:55,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


352it [00:55,  6.14it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


354it [00:56,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


356it [00:56,  5.95it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


358it [00:56,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


360it [00:57,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


362it [00:57,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


363it [00:57,  5.93it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


365it [00:58,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


367it [00:58,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


369it [00:58,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


371it [00:59,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


373it [00:59,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


375it [00:59,  6.33it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


377it [01:00,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


379it [01:00,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


381it [01:00,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


383it [01:00,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


385it [01:01,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


387it [01:01,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


389it [01:01,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


391it [01:02,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


393it [01:02,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


395it [01:02,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


397it [01:03,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


399it [01:03,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


401it [01:03,  6.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


403it [01:04,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


405it [01:04,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


407it [01:04,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


409it [01:04,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


411it [01:05,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


413it [01:05,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


415it [01:05,  6.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


417it [01:06,  6.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


419it [01:06,  6.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


421it [01:06,  6.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


423it [01:07,  6.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


425it [01:07,  6.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


427it [01:07,  6.70it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


429it [01:07,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


431it [01:08,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


433it [01:08,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


435it [01:08,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


437it [01:09,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


439it [01:09,  6.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


441it [01:09,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


443it [01:10,  6.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


445it [01:10,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


447it [01:10,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


449it [01:10,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


451it [01:11,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


453it [01:11,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


455it [01:11,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


457it [01:12,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


459it [01:12,  6.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


461it [01:12,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


463it [01:13,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


465it [01:13,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


467it [01:13,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


469it [01:14,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


471it [01:14,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


473it [01:14,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


475it [01:14,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


477it [01:15,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


479it [01:15,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


481it [01:15,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


483it [01:16,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


485it [01:16,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


487it [01:16,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


489it [01:17,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


491it [01:17,  6.22it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


493it [01:17,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


495it [01:18,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


497it [01:18,  6.28it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


499it [01:18,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


500it [01:18,  6.34it/s]


encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.

0it [00:00, ?it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.

1it [00:00,  6.22it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


3it [00:00,  6.10it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


5it [00:00,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


7it [00:01,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


9it [00:01,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


11it [00:01,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


13it [00:02,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


15it [00:02,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


17it [00:02,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


19it [00:03,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


21it [00:03,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


23it [00:03,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


25it [00:04,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


26it [00:04,  5.87it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


28it [00:04,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


30it [00:05,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


32it [00:05,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


34it [00:05,  5.95it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


36it [00:06,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


38it [00:06,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


40it [00:06,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


42it [00:07,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


44it [00:07,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


46it [00:07,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


48it [00:08,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


50it [00:08,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


52it [00:08,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


54it [00:09,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


56it [00:09,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


58it [00:09,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


60it [00:10,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


62it [00:10,  6.21it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


64it [00:10,  6.31it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


66it [00:10,  6.30it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


68it [00:11,  6.31it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


70it [00:11,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


72it [00:11,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


74it [00:12,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


76it [00:12,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


78it [00:12,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


80it [00:13,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


82it [00:13,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


84it [00:13,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


86it [00:14,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


88it [00:14,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


90it [00:14,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


92it [00:14,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


94it [00:15,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


96it [00:15,  6.33it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


98it [00:15,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


100it [00:16,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


102it [00:16,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


104it [00:16,  6.37it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


106it [00:17,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


108it [00:17,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


110it [00:17,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


112it [00:18,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


114it [00:18,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


116it [00:18,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


118it [00:19,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


120it [00:19,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


122it [00:19,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


124it [00:19,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


126it [00:20,  6.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


128it [00:20,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


130it [00:20,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


132it [00:21,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


134it [00:21,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


136it [00:21,  6.70it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


138it [00:22,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


140it [00:22,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


141it [00:22,  5.93it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


143it [00:22,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


145it [00:23,  6.25it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


147it [00:23,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


149it [00:23,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


151it [00:24,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


153it [00:24,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


155it [00:24,  6.37it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


157it [00:25,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


158it [00:25,  5.98it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


160it [00:25,  6.22it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


162it [00:25,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


164it [00:26,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


166it [00:26,  6.22it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


168it [00:26,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


170it [00:27,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


172it [00:27,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


174it [00:27,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


176it [00:28,  5.83it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


177it [00:28,  5.84it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


179it [00:28,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


181it [00:29,  5.83it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


182it [00:29,  5.86it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


184it [00:29,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


185it [00:29,  5.76it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


187it [00:30,  5.74it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


188it [00:30,  5.75it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


190it [00:30,  5.82it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


192it [00:30,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


194it [00:31,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


196it [00:31,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


198it [00:32,  5.86it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


200it [00:32,  5.86it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


202it [00:32,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


204it [00:33,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


205it [00:33,  5.72it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


207it [00:33,  5.86it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


209it [00:33,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


211it [00:34,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


213it [00:34,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


215it [00:34,  5.95it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


217it [00:35,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


219it [00:35,  6.10it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


221it [00:35,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


223it [00:36,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


225it [00:36,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


227it [00:36,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


229it [00:37,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


231it [00:37,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


233it [00:37,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


235it [00:38,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


237it [00:38,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


239it [00:38,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


241it [00:39,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


243it [00:39,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


245it [00:39,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


247it [00:40,  6.21it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


249it [00:40,  6.31it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


251it [00:40,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


253it [00:41,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


255it [00:41,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


257it [00:41,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


259it [00:42,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


261it [00:42,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


263it [00:42,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


265it [00:42,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


267it [00:43,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


269it [00:43,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


271it [00:43,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


273it [00:44,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


275it [00:44,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


277it [00:44,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


279it [00:45,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


281it [00:45,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


283it [00:45,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


285it [00:45,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


287it [00:46,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


289it [00:46,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


291it [00:46,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


293it [00:47,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


295it [00:47,  6.72it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


297it [00:47,  6.76it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


299it [00:48,  6.74it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


301it [00:48,  6.75it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


303it [00:48,  6.76it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


305it [00:48,  6.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


307it [00:49,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


309it [00:49,  6.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


311it [00:49,  6.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


313it [00:50,  6.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


315it [00:50,  6.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


317it [00:50,  6.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


319it [00:51,  6.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


321it [00:51,  6.70it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


323it [00:51,  6.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


325it [00:51,  6.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


327it [00:52,  6.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


329it [00:52,  6.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


331it [00:52,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


333it [00:53,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


335it [00:53,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


337it [00:53,  6.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


339it [00:54,  6.60it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


341it [00:54,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


343it [00:54,  6.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


345it [00:55,  6.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


347it [00:55,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


349it [00:55,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


351it [00:55,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


353it [00:56,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


355it [00:56,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


357it [00:56,  6.14it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


359it [00:57,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


361it [00:57,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


363it [00:57,  5.76it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


364it [00:58,  5.81it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


366it [00:58,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


368it [00:58,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


370it [00:59,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


371it [00:59,  5.83it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


373it [00:59,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


375it [00:59,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


377it [01:00,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


379it [01:00,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


381it [01:00,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


383it [01:01,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


385it [01:01,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


387it [01:01,  5.86it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


388it [01:02,  5.90it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


390it [01:02,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


392it [01:02,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


394it [01:03,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


396it [01:03,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


398it [01:03,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


400it [01:04,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


402it [01:04,  6.10it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


404it [01:04,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


406it [01:05,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


408it [01:05,  6.14it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


410it [01:05,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


412it [01:06,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


414it [01:06,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


416it [01:06,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


418it [01:07,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


420it [01:07,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


422it [01:07,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


424it [01:08,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


426it [01:08,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


428it [01:08,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


430it [01:09,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


432it [01:09,  6.25it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


434it [01:09,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


436it [01:09,  6.31it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


438it [01:10,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


440it [01:10,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


442it [01:10,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


444it [01:11,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


446it [01:11,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


448it [01:11,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


450it [01:12,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


452it [01:12,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


454it [01:12,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


456it [01:13,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


458it [01:13,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


460it [01:13,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


462it [01:13,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


464it [01:14,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


466it [01:14,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


468it [01:14,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


470it [01:15,  6.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


472it [01:15,  6.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


474it [01:15,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


476it [01:16,  6.76it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


478it [01:16,  6.76it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


480it [01:16,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


482it [01:16,  6.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


484it [01:17,  6.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


486it [01:17,  6.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


488it [01:17,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


490it [01:18,  6.70it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


492it [01:18,  6.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


494it [01:18,  6.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


496it [01:19,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


498it [01:19,  6.70it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


500it [01:19,  6.28it/s]


src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attent

1it [00:00,  6.80it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.

2it [00:00,  6.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


4it [00:00,  6.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


6it [00:00,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


8it [00:01,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


10it [00:01,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


12it [00:01,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


14it [00:02,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


16it [00:02,  6.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


18it [00:02,  6.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


20it [00:03,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


22it [00:03,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


24it [00:03,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


26it [00:03,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


28it [00:04,  6.70it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


30it [00:04,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


32it [00:04,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


34it [00:05,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


36it [00:05,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


38it [00:05,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


40it [00:06,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


42it [00:06,  6.37it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


44it [00:06,  6.27it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


46it [00:07,  6.23it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


48it [00:07,  6.33it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


50it [00:07,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


52it [00:08,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


54it [00:08,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


56it [00:08,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


58it [00:09,  5.92it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


60it [00:09,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


62it [00:09,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


64it [00:10,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


66it [00:10,  6.21it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


68it [00:10,  6.22it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


70it [00:10,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


72it [00:11,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


74it [00:11,  6.14it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


76it [00:11,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


78it [00:12,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


80it [00:12,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


82it [00:12,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


84it [00:13,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


86it [00:13,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


88it [00:13,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


90it [00:14,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


92it [00:14,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


94it [00:14,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


96it [00:15,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


98it [00:15,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


100it [00:16,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


102it [00:16,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


103it [00:16,  5.80it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


105it [00:16,  5.82it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])

106it [00:17,  5.88it/s]


src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


108it [00:17,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


110it [00:17,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


112it [00:18,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


114it [00:18,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


116it [00:18,  6.10it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


118it [00:19,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


120it [00:19,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


122it [00:19,  6.14it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


124it [00:19,  6.14it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


126it [00:20,  6.28it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


128it [00:20,  6.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


130it [00:20,  6.37it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


132it [00:21,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


134it [00:21,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


136it [00:21,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


138it [00:22,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


140it [00:22,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


142it [00:22,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


144it [00:23,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


146it [00:23,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


148it [00:23,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


150it [00:23,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


152it [00:24,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


154it [00:24,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


156it [00:24,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


158it [00:25,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


160it [00:25,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


162it [00:25,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


164it [00:26,  6.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


166it [00:26,  6.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


168it [00:26,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


170it [00:27,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


172it [00:27,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


174it [00:27,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


176it [00:27,  6.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


178it [00:28,  6.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


180it [00:28,  6.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


182it [00:28,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


184it [00:29,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


186it [00:29,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


188it [00:29,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


190it [00:30,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


192it [00:30,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


194it [00:30,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


196it [00:31,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


198it [00:31,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


200it [00:31,  6.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


202it [00:31,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


204it [00:32,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


206it [00:32,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


208it [00:32,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


210it [00:33,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


212it [00:33,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


214it [00:33,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


216it [00:34,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


218it [00:34,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


220it [00:34,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


222it [00:35,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


224it [00:35,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


226it [00:35,  6.31it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


228it [00:36,  6.22it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


230it [00:36,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


232it [00:36,  6.22it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


234it [00:37,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


236it [00:37,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


238it [00:37,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


240it [00:38,  5.92it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


242it [00:38,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


244it [00:38,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


246it [00:39,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


248it [00:39,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


250it [00:39,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


252it [00:40,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


254it [00:40,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


256it [00:40,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


258it [00:41,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


260it [00:41,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


261it [00:41,  5.78it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


263it [00:41,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


265it [00:42,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


266it [00:42,  5.74it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


268it [00:42,  5.72it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


269it [00:42,  5.64it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


271it [00:43,  5.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


273it [00:43,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


275it [00:44,  5.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


276it [00:44,  5.78it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


278it [00:44,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


280it [00:44,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


281it [00:45,  5.73it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


283it [00:45,  5.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


284it [00:45,  5.74it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


286it [00:45,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


287it [00:46,  5.71it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


289it [00:46,  5.74it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


290it [00:46,  5.71it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


292it [00:46,  5.72it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


293it [00:47,  5.82it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


295it [00:47,  5.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


296it [00:47,  5.80it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


298it [00:47,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


300it [00:48,  5.83it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


301it [00:48,  5.72it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


303it [00:48,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


305it [00:49,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


306it [00:49,  5.84it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


308it [00:49,  5.81it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


310it [00:50,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


312it [00:50,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


314it [00:50,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


316it [00:50,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


318it [00:51,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


320it [00:51,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


322it [00:51,  6.28it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


324it [00:52,  6.22it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


326it [00:52,  6.27it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


328it [00:52,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


330it [00:53,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


332it [00:53,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


334it [00:53,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


336it [00:54,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


338it [00:54,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


340it [00:54,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


342it [00:55,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


344it [00:55,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


346it [00:55,  6.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


348it [00:56,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


350it [00:56,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


352it [00:56,  6.27it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


354it [00:56,  6.29it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


356it [00:57,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


358it [00:57,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


360it [00:57,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


362it [00:58,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


364it [00:58,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


366it [00:58,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


368it [00:59,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


370it [00:59,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


372it [00:59,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


374it [01:00,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


376it [01:00,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


378it [01:00,  6.30it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


380it [01:01,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


382it [01:01,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


384it [01:01,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


386it [01:01,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


388it [01:02,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


390it [01:02,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


392it [01:02,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


394it [01:03,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


396it [01:03,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


398it [01:03,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


400it [01:04,  6.36it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


402it [01:04,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


404it [01:04,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


406it [01:05,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


408it [01:05,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


410it [01:05,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


412it [01:05,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


414it [01:06,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


416it [01:06,  6.37it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


418it [01:06,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


420it [01:07,  6.22it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


422it [01:07,  6.10it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


424it [01:07,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


426it [01:08,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


427it [01:08,  5.83it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


429it [01:08,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


430it [01:08,  5.68it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


432it [01:09,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


433it [01:09,  5.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


435it [01:09,  5.21it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


436it [01:10,  5.33it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


438it [01:10,  5.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


439it [01:10,  5.30it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


441it [01:11,  5.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


442it [01:11,  5.30it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


444it [01:11,  5.23it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


445it [01:11,  5.11it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


446it [01:12,  5.22it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


447it [01:12,  5.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


449it [01:12,  4.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


450it [01:12,  4.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


451it [01:13,  4.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


453it [01:13,  4.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


454it [01:13,  4.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


455it [01:13,  5.03it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


457it [01:14,  5.30it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


458it [01:14,  5.43it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


460it [01:14,  5.31it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


461it [01:15,  5.24it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


462it [01:15,  5.22it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


463it [01:15,  5.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


465it [01:15,  5.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


466it [01:16,  5.22it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


468it [01:16,  5.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


469it [01:16,  5.14it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


470it [01:16,  4.88it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


471it [01:17,  5.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


472it [01:17,  5.11it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


474it [01:17,  5.23it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


475it [01:17,  5.26it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


477it [01:18,  5.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


478it [01:18,  5.37it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


480it [01:18,  5.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


481it [01:18,  5.53it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


483it [01:19,  5.59it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


484it [01:19,  5.63it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


486it [01:19,  5.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


487it [01:19,  5.55it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


489it [01:20,  5.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


491it [01:20,  5.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


492it [01:20,  5.68it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


494it [01:21,  5.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


495it [01:21,  5.50it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


497it [01:21,  5.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


498it [01:21,  5.64it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


500it [01:22,  6.08it/s]


src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attent

0it [00:00, ?it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.

1it [00:00,  4.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


2it [00:00,  4.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


3it [00:00,  4.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


5it [00:01,  4.72it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


6it [00:01,  4.85it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


7it [00:01,  4.89it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


9it [00:01,  5.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


11it [00:02,  5.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


13it [00:02,  5.72it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


14it [00:02,  5.49it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


15it [00:02,  5.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


17it [00:03,  5.25it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


19it [00:03,  5.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


20it [00:03,  5.33it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


22it [00:04,  5.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


24it [00:04,  5.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


25it [00:04,  5.53it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


26it [00:04,  5.22it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


28it [00:05,  4.95it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


29it [00:05,  5.08it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


30it [00:05,  5.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


32it [00:06,  5.10it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


33it [00:06,  5.00it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


34it [00:06,  4.93it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


35it [00:06,  4.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


37it [00:07,  4.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


38it [00:07,  5.05it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


40it [00:07,  5.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


42it [00:08,  5.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


44it [00:08,  5.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


45it [00:08,  5.63it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


47it [00:08,  5.87it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


48it [00:09,  5.48it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


50it [00:09,  5.87it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


52it [00:09,  5.74it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


53it [00:10,  5.61it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


55it [00:10,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


57it [00:10,  5.96it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


59it [00:11,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


61it [00:11,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


62it [00:11,  5.85it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


64it [00:11,  5.92it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


65it [00:12,  5.86it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


67it [00:12,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


69it [00:12,  5.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


70it [00:12,  5.20it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


71it [00:13,  4.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


73it [00:13,  4.74it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


74it [00:13,  4.91it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


76it [00:14,  5.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


77it [00:14,  5.22it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


79it [00:14,  5.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


80it [00:14,  5.42it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


82it [00:15,  5.25it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


83it [00:15,  5.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


84it [00:15,  5.44it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


86it [00:16,  5.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


87it [00:16,  5.58it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


89it [00:16,  5.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


90it [00:16,  5.64it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


92it [00:17,  5.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


93it [00:17,  5.55it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


95it [00:17,  5.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


96it [00:17,  5.62it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


98it [00:18,  5.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


99it [00:18,  5.52it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


101it [00:18,  5.61it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


102it [00:18,  5.68it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


104it [00:19,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


106it [00:19,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


107it [00:19,  5.66it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


109it [00:20,  5.72it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


110it [00:20,  5.77it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


112it [00:20,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


114it [00:21,  5.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


115it [00:21,  5.53it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


117it [00:21,  5.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


118it [00:21,  5.65it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


120it [00:22,  5.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


121it [00:22,  5.70it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


123it [00:22,  5.74it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


124it [00:22,  5.69it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


126it [00:23,  5.86it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


128it [00:23,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


130it [00:23,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


132it [00:24,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


134it [00:24,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


136it [00:24,  5.87it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


138it [00:25,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


140it [00:25,  5.87it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


142it [00:25,  5.82it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


144it [00:26,  5.92it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


146it [00:26,  5.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


147it [00:26,  5.71it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


149it [00:27,  5.74it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


151it [00:27,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


153it [00:27,  5.92it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


155it [00:28,  5.79it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


157it [00:28,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


158it [00:28,  5.47it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


159it [00:28,  5.56it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


161it [00:29,  5.87it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


163it [00:29,  5.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


164it [00:29,  5.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


165it [00:29,  5.63it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


167it [00:30,  5.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


169it [00:30,  5.87it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


171it [00:30,  5.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


172it [00:31,  5.68it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


173it [00:31,  5.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


174it [00:31,  4.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


175it [00:31,  4.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


176it [00:32,  4.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


177it [00:32,  4.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


178it [00:32,  3.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


179it [00:32,  3.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


181it [00:33,  4.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


182it [00:33,  4.72it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


184it [00:33,  5.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


185it [00:33,  5.46it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


187it [00:34,  5.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


188it [00:34,  5.64it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


190it [00:34,  5.74it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


192it [00:35,  5.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


194it [00:35,  5.75it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


196it [00:35,  5.82it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


198it [00:36,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


200it [00:36,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


202it [00:36,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


204it [00:37,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


206it [00:37,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


208it [00:37,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


210it [00:38,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


212it [00:38,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


214it [00:38,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


216it [00:38,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


218it [00:39,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


220it [00:39,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


222it [00:39,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


223it [00:40,  6.11it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


225it [00:40,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


227it [00:40,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


229it [00:41,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


230it [00:41,  5.89it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


232it [00:41,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


234it [00:41,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


236it [00:42,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


238it [00:42,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


240it [00:42,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


242it [00:43,  6.07it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


244it [00:43,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


246it [00:43,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


248it [00:44,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])

249it [00:44,  5.92it/s]


The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


251it [00:44,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


253it [00:45,  5.99it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


255it [00:45,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


257it [00:45,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


259it [00:46,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


261it [00:46,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


263it [00:46,  5.87it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


264it [00:46,  5.80it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


266it [00:47,  5.86it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


268it [00:47,  5.97it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


270it [00:47,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


272it [00:48,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


273it [00:48,  5.76it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


275it [00:48,  5.76it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


277it [00:49,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


279it [00:49,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


281it [00:49,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


283it [00:50,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


285it [00:50,  6.04it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


287it [00:50,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


289it [00:51,  6.14it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


291it [00:51,  6.10it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


293it [00:51,  5.92it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


294it [00:52,  5.91it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


296it [00:52,  5.82it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


298it [00:52,  5.81it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


300it [00:53,  5.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


301it [00:53,  5.67it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


303it [00:53,  5.70it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


305it [00:53,  5.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


307it [00:54,  5.81it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


309it [00:54,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


310it [00:54,  5.78it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


312it [00:55,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


314it [00:55,  5.72it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


315it [00:55,  5.66it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


317it [00:56,  5.79it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


319it [00:56,  5.92it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


321it [00:56,  5.92it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


323it [00:57,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


325it [00:57,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


327it [00:57,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


329it [00:57,  6.27it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


331it [00:58,  6.36it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


333it [00:58,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


335it [00:58,  6.30it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


337it [00:59,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


339it [00:59,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


341it [00:59,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


343it [01:00,  6.31it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


345it [01:00,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


347it [01:00,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


349it [01:01,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


351it [01:01,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


353it [01:01,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


355it [01:02,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


357it [01:02,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


359it [01:02,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


361it [01:02,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


363it [01:03,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


365it [01:03,  6.27it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


367it [01:03,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


369it [01:04,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


371it [01:04,  6.49it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


373it [01:04,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


375it [01:05,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


377it [01:05,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


379it [01:05,  6.36it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


381it [01:06,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


383it [01:06,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


385it [01:06,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


387it [01:07,  6.25it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


389it [01:07,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


391it [01:07,  6.29it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


393it [01:07,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


395it [01:08,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


397it [01:08,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


399it [01:08,  6.37it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


401it [01:09,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


403it [01:09,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


405it [01:09,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


407it [01:10,  6.37it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


409it [01:10,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


411it [01:10,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


413it [01:11,  6.37it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


415it [01:11,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


417it [01:11,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


419it [01:12,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


421it [01:12,  6.37it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


423it [01:12,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


425it [01:13,  6.23it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


427it [01:13,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


429it [01:13,  6.10it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


431it [01:13,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


433it [01:14,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


435it [01:14,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


437it [01:14,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


439it [01:15,  6.13it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


441it [01:15,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


443it [01:15,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


444it [01:16,  5.93it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


446it [01:16,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


448it [01:16,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


450it [01:17,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


452it [01:17,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


454it [01:17,  6.08it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


456it [01:18,  5.83it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


457it [01:18,  5.86it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


459it [01:18,  5.81it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


461it [01:19,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


463it [01:19,  5.82it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


465it [01:19,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


466it [01:19,  5.74it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


468it [01:20,  5.83it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


470it [01:20,  5.83it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


472it [01:20,  5.82it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


473it [01:21,  5.75it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


475it [01:21,  5.79it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


477it [01:21,  5.75it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


479it [01:22,  5.76it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


480it [01:22,  5.76it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


482it [01:22,  5.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


483it [01:22,  5.69it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


485it [01:23,  5.72it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


486it [01:23,  5.72it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


488it [01:23,  5.65it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


489it [01:23,  5.66it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


491it [01:24,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


493it [01:24,  5.83it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


495it [01:24,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


497it [01:25,  5.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


498it [01:25,  5.76it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


500it [01:25,  5.83it/s]


src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attent

0it [00:00, ?it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.

1it [00:00,  6.20it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


3it [00:00,  6.21it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


5it [00:00,  6.33it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


7it [00:01,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


9it [00:01,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


11it [00:01,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


13it [00:02,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


15it [00:02,  6.37it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


17it [00:02,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


19it [00:02,  6.37it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


21it [00:03,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


23it [00:03,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


25it [00:03,  6.36it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


27it [00:04,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


29it [00:04,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


31it [00:04,  6.36it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


33it [00:05,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


35it [00:05,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


37it [00:05,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


39it [00:06,  6.19it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


41it [00:06,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


43it [00:06,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


45it [00:07,  6.31it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


47it [00:07,  6.28it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


49it [00:07,  6.25it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


51it [00:08,  6.31it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


53it [00:08,  6.29it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


55it [00:08,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


57it [00:09,  6.29it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


59it [00:09,  6.36it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


61it [00:09,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


63it [00:09,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


65it [00:10,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


67it [00:10,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


69it [00:10,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


71it [00:11,  6.32it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


73it [00:11,  6.46it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


75it [00:11,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


77it [00:12,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


79it [00:12,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


81it [00:12,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


83it [00:13,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


85it [00:13,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


87it [00:13,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


89it [00:13,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


91it [00:14,  6.62it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


93it [00:14,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


95it [00:14,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


97it [00:15,  6.56it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


99it [00:15,  6.53it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


101it [00:15,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


103it [00:16,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


105it [00:16,  6.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


107it [00:16,  6.28it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


109it [00:17,  6.28it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


111it [00:17,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


113it [00:17,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


115it [00:18,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


117it [00:18,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


119it [00:18,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


121it [00:19,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


123it [00:19,  5.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


124it [00:19,  5.66it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


126it [00:19,  5.70it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


128it [00:20,  5.88it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


130it [00:20,  5.92it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


132it [00:20,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


134it [00:21,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


136it [00:21,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


138it [00:22,  5.90it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


140it [00:22,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


142it [00:22,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


143it [00:22,  5.66it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


145it [00:23,  5.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


146it [00:23,  5.76it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


148it [00:23,  5.83it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


149it [00:23,  5.90it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


151it [00:24,  5.81it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


152it [00:24,  5.76it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


154it [00:24,  5.86it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


156it [00:25,  5.81it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


158it [00:25,  5.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


160it [00:25,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


162it [00:26,  5.93it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


164it [00:26,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


165it [00:26,  5.75it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


167it [00:27,  5.68it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


168it [00:27,  5.69it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


170it [00:27,  5.71it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


172it [00:27,  5.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


173it [00:28,  5.69it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


175it [00:28,  5.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


176it [00:28,  5.73it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


178it [00:28,  5.67it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


179it [00:29,  5.64it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


181it [00:29,  5.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


183it [00:29,  5.87it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


185it [00:30,  5.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


186it [00:30,  5.81it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


188it [00:30,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


190it [00:30,  6.16it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


192it [00:31,  6.25it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


194it [00:31,  6.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


196it [00:31,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


198it [00:32,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


200it [00:32,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


202it [00:32,  6.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


204it [00:33,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


206it [00:33,  6.21it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


208it [00:33,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


210it [00:34,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


212it [00:34,  6.24it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


214it [00:34,  6.22it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


216it [00:35,  6.33it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


218it [00:35,  6.33it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


220it [00:35,  6.31it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


222it [00:36,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


224it [00:36,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


226it [00:36,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


228it [00:37,  6.25it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


230it [00:37,  6.18it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


232it [00:37,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


234it [00:37,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


236it [00:38,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


238it [00:38,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


240it [00:38,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


242it [00:39,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


244it [00:39,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


246it [00:39,  6.29it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


248it [00:40,  6.27it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


250it [00:40,  6.27it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


252it [00:40,  6.29it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


254it [00:41,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


256it [00:41,  6.23it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


258it [00:41,  6.30it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


260it [00:42,  6.21it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


262it [00:42,  6.28it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


264it [00:42,  6.31it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


266it [00:43,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


268it [00:43,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


270it [00:43,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


272it [00:43,  6.39it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


274it [00:44,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


276it [00:44,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


278it [00:44,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


280it [00:45,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


282it [00:45,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


284it [00:45,  6.64it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


286it [00:46,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


287it [00:46,  6.03it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


289it [00:46,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


291it [00:46,  6.03it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


293it [00:47,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


295it [00:47,  5.85it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


296it [00:47,  5.81it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


298it [00:48,  5.81it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


300it [00:48,  5.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])


301it [00:48,  5.74it/s]

The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


303it [00:49,  5.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


304it [00:49,  5.44it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


306it [00:49,  5.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


308it [00:49,  5.70it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


310it [00:50,  5.80it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


312it [00:50,  5.81it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


314it [00:50,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


315it [00:51,  5.79it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


317it [00:51,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


319it [00:51,  5.86it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


321it [00:52,  5.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


322it [00:52,  5.69it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


324it [00:52,  5.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


326it [00:53,  5.69it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


327it [00:53,  5.66it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


329it [00:53,  5.78it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


331it [00:53,  5.83it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


333it [00:54,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


335it [00:54,  5.89it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


337it [00:54,  5.81it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


338it [00:55,  5.83it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


340it [00:55,  5.77it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


341it [00:55,  5.83it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


343it [00:55,  5.92it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


345it [00:56,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


347it [00:56,  5.92it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


348it [00:56,  5.81it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


350it [00:57,  5.73it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


351it [00:57,  5.74it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


353it [00:57,  5.66it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])


354it [00:57,  5.66it/s]

encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


356it [00:58,  5.81it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


358it [00:58,  5.76it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


360it [00:58,  5.83it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


362it [00:59,  5.91it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


364it [00:59,  5.84it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


365it [00:59,  5.83it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


367it [01:00,  5.98it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


369it [01:00,  5.94it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


371it [01:00,  5.95it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


373it [01:01,  6.11it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


375it [01:01,  6.01it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])


376it [01:01,  6.08it/s]

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


378it [01:01,  6.02it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


380it [01:02,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


382it [01:02,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


384it [01:02,  6.17it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


386it [01:03,  6.26it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


388it [01:03,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


390it [01:03,  6.34it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


392it [01:04,  6.25it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


394it [01:04,  6.22it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


396it [01:04,  6.30it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


398it [01:05,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


400it [01:05,  6.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


402it [01:05,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


404it [01:06,  6.30it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


406it [01:06,  6.29it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


408it [01:06,  6.33it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


410it [01:07,  6.36it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


412it [01:07,  6.40it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


414it [01:07,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


416it [01:07,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


418it [01:08,  6.38it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


420it [01:08,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


422it [01:08,  6.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


424it [01:09,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


426it [01:09,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


428it [01:09,  6.51it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


430it [01:10,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


432it [01:10,  6.52it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


434it [01:10,  6.50it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


436it [01:11,  6.57it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


438it [01:11,  6.55it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


440it [01:11,  6.54it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


442it [01:11,  6.33it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


444it [01:12,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


446it [01:12,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


448it [01:12,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


450it [01:13,  6.58it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


452it [01:13,  6.63it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


454it [01:13,  6.41it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


456it [01:14,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


458it [01:14,  6.48it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


460it [01:14,  6.42it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


462it [01:15,  6.45it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


464it [01:15,  6.47it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


466it [01:15,  6.43it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


468it [01:15,  6.44it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


470it [01:16,  6.35it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


472it [01:16,  6.30it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


474it [01:16,  6.31it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


476it [01:17,  6.27it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


478it [01:17,  6.30it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


480it [01:17,  6.36it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


482it [01:18,  6.10it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


484it [01:18,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


486it [01:18,  6.15it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


488it [01:19,  6.20it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


490it [01:19,  6.00it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


492it [01:19,  6.09it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


494it [01:20,  6.05it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


496it [01:20,  6.06it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


498it [01:20,  6.12it/s]

src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])


500it [01:21,  6.16it/s]


src_que is:torch.Size([2, 100])
src_ans is:torch.Size([2, 100])
trg_que is:torch.Size([2, 1])
trg_ans is:torch.Size([2, 1])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
encoder_output shape is:torch.Size([2, 100, 256])
trg_linear shape is:torch.Size([2, 1, 256])
attent

attention output shape is:torch.Size([2, 100, 256])
The model output shape is:torch.Size([2, 1, 2])
At epoch-9	The training loss is:0.5565123121366308
Train accuracy is:0.87
At epoch-9	The test loss is:0.5565123121366308
Test accuracy is:0.87
