In [1]:
# from http://www.statmt.org/europarl/

def _read_lines(path):
    """Read the UTF-8 text file at `path` and return its content split on newline characters.

    The file is opened in a `with` block so the handle is closed as soon as
    the content has been read, instead of being left open for the lifetime
    of the kernel.
    """
    with open(path, encoding='utf-8') as handle:
        # Split on '\n' only, so both corpus files are split by the same rule
        # and line i of one file still pairs with line i of the other.
        return handle.read().split('\n')


europarl_en = _read_lines(r'D:\BernieData\ML4D_copy\data/europarl-v7.fr-en.en')
europarl_fr = _read_lines(r'D:\BernieData\ML4D_copy\data/europarl-v7.fr-en.fr')

In [3]:
# tokenize the text using Torchtext and Spacy together
import spacy
import torchtext
from torchtext.data import Field, BucketIterator, TabularDataset

# Load the spaCy pipelines by their full package names; only their tokenizers
# are used below. An earlier revision loaded them through the short aliases
# 'en' / 'fr'.
# NOTE(review): presumably those aliases are not available in the installed
# spaCy version — confirm before reverting.
en = spacy.load('en_core_web_sm')
fr = spacy.load('fr_core_news_sm')

def tokenize_en(sentence):
    """Return the text of every token the `en` spaCy tokenizer produces for `sentence`."""
    pieces = []
    for token in en.tokenizer(sentence):
        pieces.append(token.text)
    return pieces
def tokenize_fr(sentence):
    """Return the text of every token the `fr` spaCy tokenizer produces for `sentence`."""
    pieces = []
    for token in fr.tokenizer(sentence):
        pieces.append(token.text)
    return pieces


# English field: tokenized with tokenize_en, no start/end markers configured.
EN_TEXT = Field(
    tokenize=tokenize_en,
)
# French field: tokenized with tokenize_fr and configured with explicit
# start-of-sentence / end-of-sentence tokens.
FR_TEXT = Field(
    tokenize=tokenize_fr,
    init_token="<sos>",
    eos_token="<eos>",
)


In [4]:
# turn our data into an appropriate CSV file
import pandas as pd

# Pair the two corpora line by line in a two-column frame.
raw_data = {'English': list(europarl_en), 'French': list(europarl_fr)}
df = pd.DataFrame(raw_data, columns=["English", "French"])

# Use the number of spaces as a cheap proxy for sentence length.
df['eng_len'] = df['English'].str.count(' ')
df['fr_len'] = df['French'].str.count(' ')

# Drop very long sentences (80 or more spaces on either side) ...
short_enough = (df['fr_len'] < 80) & (df['eng_len'] < 80)
df = df[short_enough]

# ... and pairs whose two sides differ in length by a factor of 1.5 or more.
balanced = (df['fr_len'] < df['eng_len'] * 1.5) & (df['fr_len'] * 1.5 > df['eng_len'])
df = df[balanced]

In [5]:
# have to create a validation set
from sklearn.model_selection import train_test_split

# create train and validation set
# Hold out 10% of the sentence pairs for validation. The fixed random_state
# makes the split, and therefore the CSV files and every vocabulary index
# derived from them, identical on every run of the notebook.
train, val = train_test_split(df, test_size=0.1, random_state=42)

# Write only the two text columns: the eng_len / fr_len helper columns were
# needed for filtering only, and the field list used to load these files
# declares just 'English' and 'French'.
text_columns = ["English", "French"]
train.to_csv("train.csv", index=False, columns=text_columns)
val.to_csv("val.csv", index=False, columns=text_columns)

In [6]:
# Calling the magic TabularDataset.splits then returns a train and validation dataset with
# the respective data loaded into them, processed(/tokenized) according to the fields we
# defined earlier.

# associate the text in the 'English' column with the EN_TEXT field,
# and 'French' with FR_TEXT
# Column order matters: the first CSV column is processed by EN_TEXT and the
# second by FR_TEXT.
data_fields = [('English', EN_TEXT), ('French', FR_TEXT)]

# train.csv / val.csv were written by pandas with a header row. Without
# skip_header=True that row would be loaded as an ordinary sentence pair
# ("English" -> "French") in both splits and leak into the vocabularies.
train, val = TabularDataset.splits(
    path='./',
    train='train.csv',
    validation='val.csv',
    format='csv',
    fields=data_fields,
    skip_header=True,
)

In [7]:
# index all the tokens

# Build the token <-> index tables for both languages (French first, then
# English) from the training and validation datasets together.
for text_field in (FR_TEXT, EN_TEXT):
    text_field.build_vocab(train, val)

In [10]:
# Show one lookup in each direction: token -> index, then index -> token.
the_index = EN_TEXT.vocab.stoi['the']
token_at_11 = EN_TEXT.vocab.itos[11]
print(the_index, token_at_11, sep='\n')

2
a


In [11]:
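# Vocabulary lookups work in both directions; the exact indices depend on the
# vocabulary that was built, so they can differ between runs.
#example input: print(EN_TEXT.vocab.stoi['the'])
#example_output: the integer index of the token 'the' (2 in the output recorded above)
#example input: print(EN_TEXT.vocab.itos[11])
#example_output: the token stored at index 11 ('a' in the output recorded above)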
    
# Plain BucketIterator over the training set: 20 examples per batch, keyed
# on the length of the French side, reshuffled between epochs.
train_iter = BucketIterator(
    train,
    batch_size=20,
    sort_key=lambda x: len(x.French),
    shuffle=True,
)

# Take a single batch and show its English tensor of vocabulary indices.
batch = next(iter(train_iter))
print(batch.English)

tensor([[20984,  5961,    27,  ...,    68,  8574,    50],
        [   86,  7561,    32,  ...,   328,   393,    30],
        [   14,   168,    28,  ...,     3,    30,     6],
        ...,
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1]])


In [13]:
# hacking TorchText
# code from http://nlp.seas.harvard.edu/2018/04/03/attention.html
# read text after for description of what it does
from torchtext import data
global max_src_in_batch, max_tgt_in_batch

def batch_size_fn(new, count, sofar):
    """Return the padded size of the current batch once `new` has been added.

    `new` is the example being added (its `English` and `French` attributes
    are measured with `len`), `count` is the number of examples in the batch
    including `new`, and `sofar` is accepted but not used. The running
    maxima are kept in the module-level globals `max_src_in_batch` and
    `max_tgt_in_batch`, which are reset whenever `count` is 1.
    """
    global max_src_in_batch, max_tgt_in_batch
    starting_new_batch = count == 1
    if starting_new_batch:
        max_src_in_batch, max_tgt_in_batch = 0, 0

    # The target side is counted two tokens longer than the raw French text.
    widest_src = max(max_src_in_batch, len(new.English))
    widest_tgt = max(max_tgt_in_batch, len(new.French) + 2)
    max_src_in_batch, max_tgt_in_batch = widest_src, widest_tgt

    # Every example is padded to the widest one, so size = count * width.
    padded_sizes = (count * widest_src, count * widest_tgt)
    return max(padded_sizes)

class MyIterator(data.Iterator):
    """Iterator that groups examples of similar length into batches.

    Overrides `create_batches` so that, in training mode, the data is read
    in large chunks, each chunk is sorted by `sort_key` and cut into batches
    with `batch_size_fn`, and the batches of the chunk are then shuffled.
    """

    def create_batches(self):
        """Populate `self.batches` for the current pass over the data.

        Training mode: `self.batches` is a generator that walks the data in
        chunks of `self.batch_size * 100` items, sorts each chunk with
        `self.sort_key`, splits it into batches and yields those batches in
        the order returned by `self.random_shuffler`.

        Otherwise: `self.batches` is a list of batches taken in data order,
        each batch sorted by `self.sort_key`.
        """
        if self.train:
            def pool(d, random_shuffler):
                for p in data.batch(d, self.batch_size * 100):
                    p_batch = data.batch(sorted(p, key=self.sort_key), self.batch_size, self.batch_size_fn)
                    # This loop must sit INSIDE the chunk loop. When it is
                    # placed after the loop, only the batches of the last
                    # chunk are yielded and every earlier chunk is dropped
                    # (and empty data raises UnboundLocalError on p_batch).
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)
        else:
            self.batches = []
            for b in data.batch(self.data(), self.batch_size, self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))

In [17]:
# more efficient iterator
import torch
# Pick the compute device WITHOUT assigning to `torch.device`: that name is
# the torch device class, and overwriting it with a string would break every
# later `torch.device(...)` call while moving nothing to the GPU.
# Falls back to the CPU when CUDA is not available.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# batch_size is compared against the value returned by batch_size_fn (a
# padded token count), not against a number of sentences.
# NOTE(review): `device=` is expected to accept a torch.device in the
# installed torchtext version — confirm.
train_iter = MyIterator(
    train,
    batch_size=1300,
    device=device,
    repeat=False,
    sort_key=lambda x: (len(x.English), len(x.French)),
    batch_size_fn=batch_size_fn,
    train=True,
    shuffle=True,
)

In [18]:
# Pull one batch from the iterator. In the printed tensor every column is one
# sentence and every entry is the vocabulary index of one token (word).
batch = next(iter(train_iter))
print(batch.English)

tensor([[ 2622,    27,    43,  ...,  5175, 18817,    43],
        [  522,  1060,     8,  ...,     3,     6,   880],
        [59264,     5,    15,  ...,   938,  1460,     2],
        ...,
        [    6, 15133,     5,  ...,    19,    49,  1575],
        [29028, 16011,  1772,  ...,    55,   370,  3917],
        [    4,     4,     4,  ...,     4,     4,     4]])


In [19]:
# Decode the first column of the batch back into words, one token per line.
first_sentence_ids = batch.English[:, 0]
for token_id in first_sentence_ids:
    print(EN_TEXT.vocab.itos[token_id])

Its
Article
I-41
,
paragraph
3
states
that
‘
the
Member
States
are
obliged
to
progressively
improve
their
military
capacities
’
,
which
means
that
the
EU
’s
Member
States
are
obliged
to
rearm
.
