# Learnings

`torchtext.data.Field`

`torchtext.data.TabularDataset`

`torchtext.data.BucketIterator`

# Example 1

Machine translation English <--> German.
Original dataset is a parallel corpus of sentences in English and German.
One file contains the sorted sentences for each language.

File processing is necessary to have the raw input data stored in a single JSON file.

## Save data as JSON

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

english_txt = open('train.en', encoding='utf8').read().split('\n')
german_txt = open('train.de', encoding='utf8').read().split('\n')

raw_data = {
    'english': [line for line in english_txt[:1000]],
    'german': [line for line in german_txt[:1000]],
}

df = pd.DataFrame(raw_data, columns=['english', 'german'])
train, test = train_test_split(df, test_size=0.2)

train.to_json('train.json', orient='records', lines=True)
test.to_json('test.json', orient='records', lines=True)

## Load data from JSON using torchtext declarative API

## Define tokenization functions

In [None]:
import spacy

# Before run this in terminal: python -m spacy download en
spacy_eng = spacy.load('en')
spacy_ger = spacy.load('de')

def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]

def tokenize_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]

In [16]:
from torchtext.data import Field, BucketIterator, TabularDataset

# Fields define a datatype together with the instructions for transforming to tensor
english = Field(sequential=True, use_vocab=True, tokenize=tokenize_eng, lower=True)
german = Field(sequential=True, use_vocab=True, tokenize=tokenize_ger, lower=True)

fields = {'english': ('eng', english), 'german': ('ger', german)}

train_data, test_data = TabularDataset.splits(
    path='',
    train='train.json',
    test='test.json',
    format='json',
    fields=fields,
)

english.build_vocab(train_data, max_size=10000, min_freq=2)
german.build_vocab(train_data, max_size=10000, min_freq=2)

train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=32,
#     device='cuda'
)

for batch in train_iterator:
    print(batch)
    break


[torchtext.data.batch.Batch of size 32]
	[.eng]:[torch.LongTensor of size 97x32]
	[.ger]:[torch.LongTensor of size 83x32]


In [12]:
print(type(train_data))
print(train_data[0])
print(len(train_data))
print(len(test_data))

<class 'torchtext.data.dataset.TabularDataset'>
<torchtext.data.example.Example object at 0x1991e0750>
800
200


------

# Example 2