<a href="https://colab.research.google.com/github/Syilun/Machine-Learning-Collections/blob/main/ML/Pytorch/More_advanced/Torchtext/torchtext_tutorial3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import spacy
import torch
from sklearn.model_selection import train_test_split
from torchtext.legacy.data import Field, BucketIterator, TabularDataset


In [4]:
### Load data from two text files where each row is a sentence ###
english_txt = open("train_WMT_english.txt", encoding="utf8").read().split("\n")
german_txt = open("train_WMT_german.txt", encoding="utf8").read().split("\n")

raw_data = {
    "English": [line for line in english_txt[0:1000]],
    "German": [line for line in german_txt[0:1000]],
}

df = pd.DataFrame(raw_data, columns=["English", "German"])

# create train and test set
train, test = train_test_split(df, test_size=0.1)

# Get train, test data to json and csv format which can be read by torchtext
train.to_json("train.json", orient="records", lines=True)
test.to_json("test.json", orient="records", lines=True)

train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)


In [8]:
### Now we're back to where we were in previous Tutorials ###

"""
To install spacy languages use:
python -m spacy download en
python -m spacy download de
"""
!python -m spacy download en
!python -m spacy download de

spacy_eng = spacy.load('en')
spacy_ger = spacy.load('de')


def tokenize_eng(text):
    """Split ``text`` into a list of token strings using the ``spacy_eng`` tokenizer."""
    token_strings = []
    for token in spacy_eng.tokenizer(text):
        token_strings.append(token.text)
    return token_strings


def tokenize_ger(text):
    """Split ``text`` into a list of token strings using the ``spacy_ger`` tokenizer."""
    token_strings = []
    for token in spacy_ger.tokenizer(text):
        token_strings.append(token.text)
    return token_strings


# One Field per language: each uses its own spaCy-based tokenizer and
# lower-cases the text.
english = Field(sequential=True, use_vocab=True, tokenize=tokenize_eng, lower=True)
german = Field(sequential=True, use_vocab=True, tokenize=tokenize_ger, lower=True)

# JSON key -> (attribute name on each example/batch, Field that processes it)
fields = {"English": ("eng", english), "German": ("ger", german)}

train_data, test_data = TabularDataset.splits(
    path="", train="train.json", test="test.json", format="json", fields=fields
)

# Vocabularies are built from the training split only.
english.build_vocab(train_data, max_size=10000, min_freq=2)
german.build_vocab(train_data, max_size=10000, min_freq=2)

# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook also runs on machines / Colab runtimes without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), batch_size=32, device=device
)

# Print every training batch to inspect what the iterator yields.
for batch in train_iterator:
    print(batch)

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 2.7 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
Collecting de_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9 MB)
[K     |████████████████████████████████| 14.9 MB 2.8 MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-py3-none-any.whl size=14907055 sha256=8d423f8