In [9]:
%%bash
# Preprocessing pipeline for the fa-tr OpenSubtitles2018 corpus:
#   1. normalize punctuation and tokenize both sides with the Moses scripts,
#   2. learn a separate BPE model and vocabulary for each side,
#   3. apply each BPE model to its tokenized text.

# Abort on the first failing command, unset variable or broken pipeline stage,
# so that later steps never run on missing or truncated inputs.
set -euo pipefail

home=".."
data="$home/data"
data_generated="$data/generated"
threads=4
mosesdecoder="$home/ext-libs/mosesdecoder"
subword_nmt="$home/ext-libs/subword-nmt"

src_raw="$data/OpenSubtitles2018.fa-tr.fa"
trg_raw="$data/OpenSubtitles2018.fa-tr.tr"

# Here we'll keep all our files.
# -p keeps the cell re-runnable: a plain mkdir fails once the directory exists.
mkdir -p "$data_generated"

# Normalize punctuation and tokenize one raw text file.
#   $1 - path of the raw input file
#   $2 - path of the tokenized output file
# No -l option is passed, so tokenizer.perl runs with its default language
# (the recorded cell output reports "Language: en").
tokenize() {
    "$mosesdecoder/scripts/tokenizer/normalize-punctuation.perl" < "$1" | \
        "$mosesdecoder/scripts/tokenizer/tokenizer.perl" -threads "$threads" \
        > "$2"
}

# First things first: tokenization
tokenize "$src_raw" "$data_generated/src.tok"
tokenize "$trg_raw" "$data_generated/trg.tok"

# Second things second: learning BPEs
num_tokens=8000

for domain in src trg
do
    "$subword_nmt/learn_joint_bpe_and_vocab.py" \
        -i "$data_generated/$domain.tok" -s "$num_tokens" \
        -o "$data_generated/$domain.bpe" \
        --write-vocabulary "$data_generated/vocab.$domain"
done

# Third things third: applying BPEs
for domain in src trg
do
    "$subword_nmt/apply_bpe.py" -c "$data_generated/$domain.bpe" \
        --vocabulary "$data_generated/vocab.$domain" --vocabulary-threshold 0 \
        -o "$data_generated/$domain.tok.bpe" \
        < "$data_generated/$domain.tok"
done

	LANGUAGE = (unset),
	LC_ALL = (unset),
	LC_CTYPE = "UTF-8",
	LANG = "en_US.UTF-8"
    are supported and installed on your system.
	LANGUAGE = (unset),
	LC_ALL = (unset),
	LC_CTYPE = "UTF-8",
	LANG = "en_US.UTF-8"
    are supported and installed on your system.
	LANGUAGE = (unset),
	LC_ALL = (unset),
	LC_CTYPE = "UTF-8",
	LANG = "en_US.UTF-8"
    are supported and installed on your system.
Tokenizer Version 1.1
Language: en
Number of threads: 4
	LANGUAGE = (unset),
	LC_ALL = (unset),
	LC_CTYPE = "UTF-8",
	LANG = "en_US.UTF-8"
    are supported and installed on your system.
	LANGUAGE = (unset),
	LC_ALL = (unset),
	LC_CTYPE = "UTF-8",
	LANG = "en_US.UTF-8"
    are supported and installed on your system.
	LANGUAGE = (unset),
	LC_ALL = (unset),
	LC_CTYPE = "UTF-8",
	LANG = "en_US.UTF-8"
    are supported and installed on your system.
Tokenizer Version 1.1
Language: en
Number of threads: 4


In [None]:
import fasttext

# Train skip-gram embeddings on the BPE-segmented text of each side.
# NOTE(review): the output prefixes end in "_cbow" although skipgram is used;
# they are kept unchanged because other code may load these exact paths.
model_src = fasttext.skipgram('../data/generated/src.tok.bpe',
                              '../trained_models/src.tok.bpe_cbow',
                              dim=512, min_count=1, silent=0, thread=4)

# The target-side model gets its own name; it used to be assigned to
# `model_src` as well, which discarded the source-side model.
model_trg = fasttext.skipgram('../data/generated/trg.tok.bpe',
                              '../trained_models/trg.tok.bpe_cbow',
                              dim=512, min_count=1, silent=0, thread=4)

Let's split our datasets into parallel and non-parallel (monolingual) corpora.

In [None]:
# NOTE(review): this whole cell is commented out, yet it is the only visible
# place where `src` and `trg` are defined; the cells below (train_test_split
# and the length statistics) read both names, so on a fresh kernel they fail
# unless this cell is re-enabled or `src`/`trg` are loaded some other way.
# from tqdm import tqdm

# print('Reading datasets')
# src_raw = open('../data/OpenSubtitles2018.fa-tr.fa', encoding='utf-8').read().splitlines()
# trg_raw = open('../data/OpenSubtitles2018.fa-tr.tr', encoding='utf-8').read().splitlines()

# # Let's remove sentences which are suspiciously long
# # and which have too much length ratio between source/target
# # NOTE(review): only an upper length bound (max_len) is checked below;
# # there is no minimum-length check.
# max_len = 200
# max_ratio = 3

# lines_to_remove = set()
# NOTE(review): the next three sets are initialized but never used in this cell.
# bad_ratio_lines = set()
# bad_min_len_lines = set()
# bad_max_len_lines = set()

# print('Searching for bad lines')
# for i in tqdm(range(len(src_raw))):
#     src_len = len(src_raw[i].split())
#     trg_len = len(trg_raw[i].split())
    
#     lens_are_ok = src_len <= max_len and trg_len <= max_len
#     NOTE(review): if enabled as is, a target line without tokens gives
#     trg_len == 0 and the division below raises ZeroDivisionError.
#     ratios_are_ok = (1 / max_ratio) <= src_len / trg_len <= max_ratio
    
#     if not lens_are_ok or not ratios_are_ok: lines_to_remove.add(i)
        
# print('Num lines to remove:', len(lines_to_remove))
# src = [line for i, line in enumerate(src_raw) if not i in lines_to_remove]
# trg = [line for i, line in enumerate(trg_raw) if not i in lines_to_remove]
# print('Num lines left:', len(src))

In [22]:
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Sizes, in sentence pairs, of the parallel subset and of the test and
# validation parts that are carved out of it.
num_parallel = 10000
num_test = 1000
num_val = 1000

data = {}

# Stage 1: hold out `num_parallel` aligned pairs as the parallel corpus;
# everything else becomes the monolingual data of each side.
(data['src_mono'], data['src_parallel'],
 data['trg_mono'], data['trg_parallel']) = train_test_split(
    src, trg, test_size=num_parallel, random_state=42)

# Stage 2: take the test set out of the parallel corpus.
(data['src_train'], data['src_test'],
 data['trg_train'], data['trg_test']) = train_test_split(
    data['src_parallel'], data['trg_parallel'],
    test_size=num_test, random_state=42)

# Stage 3: take the validation set out of what is left for training.
(data['src_train'], data['src_val'],
 data['trg_train'], data['trg_val']) = train_test_split(
    data['src_train'], data['trg_train'],
    test_size=num_val, random_state=42)

# Write every subset to its own file, one sentence per line.
for subset_name, sentences in data.items():
    out_path = '../data/generated/{}.tok.bpe'.format(subset_name)
    with open(out_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(sentence + '\n' for sentence in sentences)

In [31]:
import matplotlib.pyplot as plt
%matplotlib inline

# Number of whitespace-separated tokens in every sentence of each side.
lens_src = [len(tokens) for tokens in map(str.split, src)]
lens_trg = [len(tokens) for tokens in map(str.split, trg)]