In [2]:
pip install transformers xmltodict

Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.13.0
Note: you may need to restart the kernel to use updated packages.


# Natural Data

In [3]:
from transformers import GPT2Tokenizer
import xmltodict
from gzip import GzipFile
import pandas as pd

# Pretrained GPT-2 tokenizer; the cells below use it only to count tokens per
# sentence (len of 'input_ids') when sizing the train/val/test splits.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

Downloading vocab.json:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Galician

In [52]:
pairs_gl = []

# import tmx to array
def get_gl_pair(_, tree):
    """Collect one translation unit as a ``{language code: segment}`` dict.

    Each entry of ``tree['tuv']`` contributes its ``@xml:lang`` attribute as
    the key and its ``seg`` content as the value. The resulting dict is
    appended to the module-level ``pairs_gl`` list.

    The first argument (the item path supplied by the caller) is ignored.
    Always returns True; xmltodict's streaming mode aborts parsing when the
    callback returns a falsy value.
    """
    pairs_gl.append({tuv['@xml:lang']: tuv['seg'] for tuv in tree['tuv']})
    return True

# Stream-parse the gzipped TMX file: with item_depth=3 xmltodict hands every
# item at that depth to get_gl_pair instead of returning one big document.
# The context manager guarantees the gzip handle is closed when parsing
# finishes or raises (the handle was previously left open).
with GzipFile('natural_data/raw/en-gl.tmx.gz') as tmx_file:
    xmltodict.parse(
        tmx_file,
        item_depth=3, item_callback=get_gl_pair,
    )

In [53]:
# Shuffle the set
# NOTE(review): sample() is called without random_state, so every run yields a
# different split; seed it if the split has to be reproducible.
pairs_gl = pd.DataFrame(pairs_gl).sample(frac=1)

# Check original data: total GPT-2 token count of the Galician side.
# Falsy segments (None / empty string) are skipped.
tc_gl = 0
for sentence in pairs_gl['gl']:
  if sentence:
    tc_gl += len(tokenizer(sentence)['input_ids'])
print('Original sentence count: ', len(pairs_gl))
print('Original data token count: ', tc_gl)

# Splitting the data
# `index` is the position of the next unassigned row of the shuffled frame, so
# the three splits are consecutive, non-overlapping slices. Each loop stops as
# soon as its token budget is reached, i.e. a split may exceed the budget by
# at most one sentence.
tc_train, tc_val, tc_test, index = 0, 0, 0, 0
index_train, index_val, index_test = 0, 0, 0

# Training set 900000 tokens
while tc_train < 900000:
  sentence = pairs_gl.iloc[index]['gl']
  if sentence:
    tc_train += len(tokenizer(sentence)['input_ids'])
  index += 1
index_train = index
print('Train sentence count: ', index)
print('Train token count: ', tc_train)
train_gl = pairs_gl[:index]
train_gl[['gl']].to_csv('natural_data/train/gl-en.gl', header=False, index=False)
train_gl[['en']].to_csv('natural_data/train/gl-en.en', header=False, index=False)

# Validation set 100000 tokens
while tc_val < 100000:
  sentence = pairs_gl.iloc[index]['gl']
  if sentence:
    tc_val += len(tokenizer(sentence)['input_ids'])
  index += 1
index_val = index
print('Validation sentence count: ', index - index_train)
print('Validation token count: ', tc_val)
val_gl = pairs_gl[index_train:index]
val_gl[['gl']].to_csv('natural_data/val/gl-en.gl', header=False, index=False)
val_gl[['en']].to_csv('natural_data/val/gl-en.en', header=False, index=False)

# Test set 100000 tokens
while tc_test < 100000:
  sentence = pairs_gl.iloc[index]['gl']
  if sentence:
    tc_test += len(tokenizer(sentence)['input_ids'])
  index += 1
print('Test sentence count: ', index - index_val)
print('Test token count: ', tc_test)
test_gl = pairs_gl[index_val:index]
# Write the test rows themselves: these two files used to be written from
# val_gl, which made the on-disk test set a copy of the validation set.
test_gl[['gl']].to_csv('test_data/gl-en.gl', header=False, index=False)
test_gl[['en']].to_csv('test_data/gl-en.en', header=False, index=False)

Original sentence count:  33574
Original data token count:  1132429
Train sentence count:  26650
Train token count:  900045
Validation sentence count:  2957
Validation token count:  100099
Test sentence count:  2997
Test token count:  100017


German

In [4]:
pairs_de = []

# import tmx to array
def get_de_pair(_, tree):
    """Append one translation unit to ``pairs_de`` as a language -> text dict.

    For every variant in ``tree['tuv']`` the ``@xml:lang`` attribute becomes
    the key and the ``seg`` content the value. The path argument passed by
    the caller is not used.

    Returns True on every call; xmltodict's streaming mode stops parsing if
    the callback returns a falsy value.
    """
    pair = dict((variant['@xml:lang'], variant['seg']) for variant in tree['tuv'])
    pairs_de.append(pair)
    return True

# Stream-parse the gzipped TMX file: with item_depth=3 xmltodict passes every
# item at that depth to get_de_pair rather than returning the whole document.
# Opening the archive in a `with` block closes the gzip handle once parsing
# ends or fails (it was previously never closed).
with GzipFile('natural_data/raw/de-en.tmx.gz') as tmx_file:
    xmltodict.parse(
        tmx_file,
        item_depth=3, item_callback=get_de_pair,
    )

In [51]:
# Randomly permute all sentence pairs (frac=1 keeps every row)
pairs_de = pd.DataFrame(pairs_de).sample(frac=1)

# Corpus statistics: GPT-2 token count of the German side, skipping falsy segments
tc_de = 0
for i in range(len(pairs_de)):
    sentence = pairs_de.iloc[i]['de']
    if not sentence:
        continue
    tc_de += len(tokenizer(sentence)['input_ids'])
print('Original sentence count: ', len(pairs_de))
print('Original data token count: ', tc_de)

# Reset the counters for the split. `index` points at the next unassigned row,
# so train / validation / test are consecutive slices of the shuffled frame.
index = 0
tc_train = tc_val = tc_test = 0
index_train = index_val = index_test = 0

# --- Training split: take rows until at least 900000 tokens are collected ---
while tc_train < 900000:
    sentence = pairs_de.iloc[index]['de']
    if sentence:
        tc_train += len(tokenizer(sentence)['input_ids'])
    index += 1
index_train = index
print('Train sentence count: ', index)
print('Train token count: ', tc_train)
train_de = pairs_de[:index]
train_de[['de']].to_csv('natural_data/train/de-en.de', header=False, index=False)
train_de[['en']].to_csv('natural_data/train/de-en.en', header=False, index=False)

# --- Validation split: the next rows worth at least 100000 tokens ---
while tc_val < 100000:
    sentence = pairs_de.iloc[index]['de']
    if sentence:
        tc_val += len(tokenizer(sentence)['input_ids'])
    index += 1
index_val = index
print('Validation sentence count: ', index - index_train)
print('Validation token count: ', tc_val)
val_de = pairs_de[index_train:index]
val_de[['de']].to_csv('natural_data/val/de-en.de', header=False, index=False)
val_de[['en']].to_csv('natural_data/val/de-en.en', header=False, index=False)

# --- Test split: the following rows worth at least 100000 tokens ---
while tc_test < 100000:
    sentence = pairs_de.iloc[index]['de']
    if sentence:
        tc_test += len(tokenizer(sentence)['input_ids'])
    index += 1
print('Test sentence count: ', index - index_val)
print('Test token count: ', tc_test)
test_de = pairs_de[index_val:index]
test_de[['de']].to_csv('test_data/de-en.de', header=False, index=False)
test_de[['en']].to_csv('test_data/de-en.en', header=False, index=False)

Original sentence count:  289374
Original data token count:  11438471
Train sentence count:  22802
Train token count:  900004
Validation sentence count:  2489
Validation token count:  100026
Test sentence count:  2485
Test token count:  100017


# Synthetic Data

Galician

In [9]:
synth_gl = pd.read_csv('synthetic_data/raw/gl-en.gl', index_col=0)

# Shuffle, then keep each distinct sentence once; unique() returns an array,
# so the rebuilt frame has a single column labelled 0.
synth_gl = pd.DataFrame(synth_gl.sample(frac=1)['0'].unique())

# Corpus statistics: GPT-2 token count, skipping empty and missing (NaN) entries
tc_gl = 0
for i in range(len(synth_gl)):
    sentence = synth_gl.iloc[i][0]
    if not sentence or pd.isna(sentence):
        continue
    tc_gl += len(tokenizer(sentence)['input_ids'])
print('Original sentence count: ', len(synth_gl))
print('Original data token count: ', tc_gl)

# Reset the counters for the split. `index` points at the next unassigned row,
# so the train and validation splits are consecutive slices of the frame.
index = 0
tc_train = tc_val = tc_test = 0
index_train = index_val = index_test = 0

# --- Training split: take rows until at least 900000 tokens are collected ---
while tc_train < 900000:
    sentence = synth_gl.iloc[index][0]
    if sentence and not pd.isna(sentence):
        tc_train += len(tokenizer(sentence)['input_ids'])
    index += 1
index_train = index
print('Train sentence count: ', index)
print('Train token count: ', tc_train)
train_gl = synth_gl[:index]
train_gl.to_csv('synthetic_data/train/gl-en.gl', header=False, index=False)

# --- Validation split: the next rows worth at least 100000 tokens ---
while tc_val < 100000:
    sentence = synth_gl.iloc[index][0]
    if sentence and not pd.isna(sentence):
        tc_val += len(tokenizer(sentence)['input_ids'])
    index += 1
index_val = index
print('Validation sentence count: ', index - index_train)
print('Validation token count: ', tc_val)
val_gl = synth_gl[index_train:index]
val_gl.to_csv('synthetic_data/val/gl-en.gl', header=False, index=False)

Original sentence count:  133692
Original data token count:  2609402
Train sentence count:  46545
Train token count:  900019
Validation sentence count:  5138
Validation token count:  100009


German

In [11]:
synth_de = pd.read_csv('synthetic_data/raw/de-en.de', header=None)

# Shuffle, then keep each distinct sentence once; unique() returns an array,
# so the rebuilt frame has a single column labelled 0.
synth_de = pd.DataFrame(synth_de.sample(frac=1)[0].unique())

# Corpus statistics: GPT-2 token count, skipping empty and missing (NaN) entries
tc_de = 0
for i in range(len(synth_de)):
    sentence = synth_de.iloc[i][0]
    if not sentence or pd.isna(sentence):
        continue
    tc_de += len(tokenizer(sentence)['input_ids'])
print('Original sentence count: ', len(synth_de))
print('Original data token count: ', tc_de)

# Reset the counters for the split. `index` points at the next unassigned row,
# so the train and validation splits are consecutive slices of the frame.
index = 0
tc_train = tc_val = tc_test = 0
index_train = index_val = index_test = 0

# --- Training split: take rows until at least 900000 tokens are collected ---
while tc_train < 900000:
    sentence = synth_de.iloc[index][0]
    if sentence and not pd.isna(sentence):
        tc_train += len(tokenizer(sentence)['input_ids'])
    index += 1
index_train = index
print('Train sentence count: ', index)
print('Train token count: ', tc_train)
train_de = synth_de[:index]
train_de.to_csv('synthetic_data/train/de-en.de', header=False, index=False)

# --- Validation split: the next rows worth at least 100000 tokens ---
while tc_val < 100000:
    sentence = synth_de.iloc[index][0]
    if sentence and not pd.isna(sentence):
        tc_val += len(tokenizer(sentence)['input_ids'])
    index += 1
index_val = index
print('Validation sentence count: ', index - index_train)
print('Validation token count: ', tc_val)
val_de = synth_de[index_train:index]
val_de.to_csv('synthetic_data/val/de-en.de', header=False, index=False)

Original sentence count:  71834
Original data token count:  1166447
Train sentence count:  55745
Train token count:  900002
Validation sentence count:  6112
Validation token count:  100002
