In [3]:
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

In [6]:
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [10]:
def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings (tokens) and reverses it
    """
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [11]:
SRC = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

In [74]:
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))

In [13]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [19]:
print(vars(train_data.examples[3]))

{'src': ['.', 'fenster', 'ein', 'putzt', 'und', 'leiter', 'einer', 'auf', 'steht', 'hemd', 'blauen', 'einem', 'in', 'mann', 'ein'], 'trg': ['a', 'man', 'in', 'a', 'blue', 'shirt', 'is', 'standing', 'on', 'a', 'ladder', 'cleaning', 'a', 'window', '.']}


In [26]:
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [32]:
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (de) vocabulary: 7855
Unique tokens in target (en) vocabulary: 5893


In [123]:
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = 64)

In [261]:
for i,t in enumerate(train_iterator):
    print(t.trg.shape)
    for i, s in enumerate(t.trg):
        #print(s[0])
        print(TRG.vocab.itos[s[0].item()], str(s[0].item()))

torch.Size([29, 64])
<sos> 2
a 4
crowd 87
of 12
boys 127
riding 78
a 4
vehicle 529
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([30, 64])
<sos> 2
a 4
brown 61
dog 35
wades 3787
through 60
water 47
and 11
walks 125
toward 458
a 4
rock 166
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([38, 64])
<sos> 2
a 4
man 9
walking 41
along 124
the 7
street 39
carrying 151
a 4
garbage 1120
bag 265
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([26, 64])
<sos> 2
elderly 233
man 9
with 13
a 4
cane 865
bends 2097
over 76
near 80
a 4
man 9
and 11
woman 14
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<

<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([26, 64])
<sos> 2
a 4
pitcher 939
throws 782
the 7
ball 68
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([36, 64])
<sos> 2
the 7
<unk> 0
is 10
<unk> 0
his 27
boat 180
in 6
the 7
canal 2264
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([29, 64])
<sos> 2
a 4
woman 14
sits 91
against 236
a 4
wall 108
in 6
a 4
fancy 1322
building 77
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([29, 64])
<sos> 2
a 4
man 9
holding 45
onto 336
a 4
small 70
girl 33
on 8
a 4
bike 99
. 5

torch.Size([34, 64])
<sos> 2
a 4
little 53
girl 33
is 10
walking 41
by 49
the 7
beach 88
in 6
blue 29
and 11
red 31
rain 559
boots 515
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([29, 64])
<sos> 2
a 4
man 9
vacuuming 2628
near 80
a 4
toddler 357
in 6
a 4
soft 3200
green 52
painted 526
room 187
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([22, 64])
<sos> 2
a 4
little 53
girl 33
and 11
boy 34
on 8
the 7
back 182
of 12
a 4
bicycle 157
waiting 254
for 54
dad 1699
to 18
pedal 2800
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
torch.Size([27, 64])
<sos> 2
the 7
woman 14
crouches 1225
against 236
a 4
brick 291
wall 108
, 15
speaking 633
on 8
her 44
cellphone 286
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([37, 64])
<sos> 2
a 4


torch.Size([35, 64])
<sos> 2
a 4
dancer 987
on 8
a 4
stage 149
is 10
performing 186
a 4
move 838
that 114
involves 5169
her 44
fellow 2493
dancers 747
standing 36
behind 93
her 44
and 11
sticking 1251
out 75
their 66
arms 364
so 1945
that 114
it 141
looks 107
like 340
she 313
has 136
multiple 1129
arms 364
. 5
<eos> 3
<pad> 1
torch.Size([28, 64])
<sos> 2
a 4
friend 851
photo 342
with 13
two 16
boys 127
looking 56
at 20
each 167
other 72
while 28
the 7
girl 33
looks 107
at 20
the 7
camera 116
smiling 133
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([28, 64])
<sos> 2
asian 106
mother 496
and 11
child 55
are 17
looking 56
at 20
the 7
aquarium 2248
tank 314
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([31, 64])
<sos> 2
a 4
person 64
with 13
glasses 146
and 11
a 4
black 26
cap 283
bending 1183
over 76
equipment 421
in 6
the 7
dark 178
. 5
<eos> 3
<pad> 1
<pad> 

torch.Size([33, 64])
<sos> 2
3 691
male 161
bikers 1073
staring 585
at 20
the 7
left 463
side 156
of 12
the 7
picture 134
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([28, 64])
<sos> 2
cheerleaders 927
in 6
blue 29
performing 186
on 8
a 4
football 192
field 85
underneath 863
a 4
yellow 62
football 192
goal 767
post 1134
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([27, 64])
<sos> 2
a 4
dog 35
drops 3396
a 4
red 31
disc 2473
on 8
a 4
beach 88
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([27, 64])
<sos> 2
a 4
girl 33
in 6
a 4
blue 29
dress 117
standing 36
next 71
to 18
a 4
wooden 248
post 1134
on 8
sidewalk 84
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 

<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([31, 64])
<sos> 2
a 4
large 59
group 38
of 12
individuals 835
are 17
gathered 359
around 83
a 4
bright 278
light 299
at 20
night 305
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([36, 64])
<sos> 2
three 48
dogs 112
race 164
in 6
the 7
snow 95
together 129
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([28, 64])
<sos> 2
a 4
dog 35
is 10
about 246
to 18
get 462
a 4
ball 68
that 114
is 10
on 8
orange 86
carpet 1264
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([24, 64])
<sos> 2
a 4
little 53
boy 34
in 6
a 4
red 31
shirt 23
talking 119
on 8
a 4
cellphon

asian 106
woman 14
in 6
an 21
orange 86
dress 117
speaking 633
to 18
someone 290
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([31, 64])
<sos> 2
a 4
group 38
of 12
people 19
gather 569
and 11
take 389
pictures 426
in 6
an 21
inner 4128
city 101
common 4812
grounds 5067
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([27, 64])
<sos> 2
a 4
woman 14
in 6
black 26
and 11
high 266
heels 1232
with 13
a 4
red 31
bag 265
is 10
walking 41
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([24, 64])
<sos> 2
black 26
dog 35
jumping 92
into 69
the 7
air 103
to 18
catch 366
a 4
frisbee 409
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([30, 64])
<sos> 2
a 4
man 9
preparing 370
a 4

torch.Size([36, 64])
<sos> 2
a 4
smiling 133
artist 697
is 10
kneeling 671
on 8
the 7
sidewalk 84
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([42, 64])
<sos> 2
downhill 2286
skaters 1824
wear 948
padding 3549
to 18
protect 2358
themselves 877
when 2414
they 244
fall 1118
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([31, 64])
<sos> 2
older 115
white 25
male 161
smoking 539
a 4
cigar 2270
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([28, 64])
<sos> 2


torch.Size([29, 64])
<sos> 2
a 4
bicycler 4662
balances 1465
his 27
bike 99
on 8
a 4
rail 661
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([27, 64])
<sos> 2
a 4
fat 2296
police 362
officer 705
and 11
a 4
fat 2296
lady 120
sitting 32
together 129
while 28
the 7
police 362
officer 705
flips 1708
through 60
paperwork 1805
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([40, 64])
<sos> 2
a 4
little 53
boy 34
trying 260
to 18
climb 813
up 51
. 5
<eos> 3
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
<pad> 1
torch.Size([30, 64])
<sos> 2
a 4
boy 34
wearing 22
dark 178
clothes 294
is 10
lying 547
on 8
a 4
red 31
blanket 580
at 20
the 7
opening 914
of 12
a 4
tent

In [238]:
TRG.vocab.stoi['bye']

0

In [273]:
a = 'hello my name is'

In [274]:
list(a)

['h',
 'e',
 'l',
 'l',
 'o',
 ' ',
 'i',
 ' ',
 'a',
 'm',
 ' ',
 's',
 'h',
 'e',
 'l',
 'd',
 'o',
 'n']