# Data Gathering and Processing




In [None]:
# Create a working directory and clone the GitHub MT-Preparation repository.
# `mkdir -p` and the directory test keep this cell from failing when the
# directory or the clone already exists (e.g. after Restart & Run All).
!mkdir -p nmt
%cd nmt
!test -d MT-Preparation || git clone https://github.com/ymoslem/MT-Preparation.git

mkdir: cannot create directory ‘nmt’: File exists
/content/nmt
fatal: destination path 'MT-Preparation' already exists and is not an empty directory.


In [None]:
# Install the requirements.
# `%pip` (rather than `!pip3`) installs into the environment of the running
# kernel, so the packages are importable from this notebook.
%pip install -r MT-Preparation/requirements.txt



# Datasets

Used datasets:

* EN-VI: https://object.pouta.csc.fi/OPUS-TED2020/v1/moses/en-vi.txt.zip
* FR-VI: https://object.pouta.csc.fi/OPUS-TED2020/v1/moses/fr-vi.txt.zip
* DE-VI: https://object.pouta.csc.fi/OPUS-TED2020/v1/moses/de-vi.txt.zip

In [None]:
# Download and unzip each TED2020 dataset (EN-VI, FR-VI, DE-VI).
#   wget -nc : skip the download when the archive is already present, so a
#              re-run does not create `*.zip.1` duplicates
#   unzip -o : overwrite existing files without prompting; the en-vi archive
#              contains README and LICENSE, and the other archives presumably
#              do too, which would make a plain `unzip` stop and ask
for pair in ("en-vi", "fr-vi", "de-vi"):
    !wget -nc https://object.pouta.csc.fi/OPUS-TED2020/v1/moses/{pair}.txt.zip
    !unzip -o {pair}.txt.zip

--2023-12-01 05:31:18--  https://object.pouta.csc.fi/OPUS-TED2020/v1/moses/en-vi.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24205565 (23M) [application/zip]
Saving to: ‘en-vi.txt.zip’


2023-12-01 05:31:19 (40.2 MB/s) - ‘en-vi.txt.zip’ saved [24205565/24205565]

Archive:  en-vi.txt.zip
  inflating: README                  
  inflating: LICENSE                 
  inflating: TED2020.en-vi.en        
  inflating: TED2020.en-vi.vi        
  inflating: TED2020.en-vi.xml       
--2023-12-01 05:31:20--  https://object.pouta.csc.fi/OPUS-TED2020/v1/moses/fr-vi.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 25231926 (24M) [app

# Merging the datasets into a single aligned dataset


In [None]:
def merge(*language_pairs):
    """Interleave several parallel corpora into one source and one target file.

    Each positional argument is a ``(lang, source_path, target_path)`` tuple
    naming two line-aligned UTF-8 text files. The segments are written
    round-robin (line 0 of every pair, then line 1 of every pair, ...) to
    ``merged.source`` and ``merged.vi`` in the current working directory.

    Corpora of different lengths are handled without data loss: once a
    shorter corpus is exhausted, the remaining segments of the longer ones
    are still written. Source lines are stripped of surrounding whitespace;
    target lines keep their content but are guaranteed to end with exactly
    one newline, so source and target stay aligned line by line even when an
    input file lacks a trailing newline.

    Args:
        *language_pairs: ``(lang, source_path, target_path)`` tuples. ``lang``
            is only used as a label in the error message.

    Returns:
        tuple[str, str]: ``("merged.source", "merged.vi")``.

    Raises:
        AssertionError: if the two files of a pair have different line counts.
        OSError: if an input file cannot be read or an output file written.
    """
    # Local import: the notebook has no shared import cell to extend.
    from itertools import zip_longest

    # One list of (source_line, target_line) tuples per language pair. A list
    # (not a dict keyed by lang) so a repeated label cannot overwrite a corpus.
    corpora = []
    for lang, source_path, target_path in language_pairs:
        with open(source_path, 'r', encoding='utf-8') as source_file:
            source_lines = source_file.readlines()

        with open(target_path, 'r', encoding='utf-8') as target_file:
            target_lines = target_file.readlines()

        # Ensure both source and target have the same number of lines
        assert len(source_lines) == len(target_lines), f"Mismatched number of lines in {lang} source and target files."

        # Normalise line endings on BOTH sides: a final line without '\n'
        # would otherwise be glued to the next segment in the merged file.
        source_lines = [line.strip() + '\n' for line in source_lines]
        target_lines = [line.rstrip('\n') + '\n' for line in target_lines]

        corpora.append(list(zip(source_lines, target_lines)))

    output_path_source = "merged.source"
    output_path_target = "merged.vi"

    with open(output_path_source, 'w', encoding='utf-8') as output_file_source, \
            open(output_path_target, 'w', encoding='utf-8') as output_file_target:
        # zip_longest (not zip) so the longer corpora are not truncated to
        # the length of the shortest one; exhausted corpora yield None.
        for row in zip_longest(*corpora):
            for segment in row:
                if segment is None:
                    continue
                source_line, target_line = segment
                output_file_source.write(source_line)
                output_file_target.write(target_line)

    return output_path_source, output_path_target

# Language pairs to merge. Each entry is
# (label, source-language file, Vietnamese file); the file names are the
# ones extracted from the TED2020 archives downloaded above.
lang_pairs = [
    ("en", "TED2020.en-vi.en", "TED2020.en-vi.vi"),
    ("fr", "TED2020.fr-vi.fr", "TED2020.fr-vi.vi"),
    ("de", "TED2020.de-vi.de", "TED2020.de-vi.vi"),
    # Add more language pairs as needed
]

# Merge all pairs into merged.source / merged.vi.
# No shuffling happens here; merge() only combines the files.
merged_source_path, merged_target_path = merge(*lang_pairs)

# Data Filtering

Filtering out low-quality segments can help improve the translation quality of the output MT model. This might include misalignments, empty segments, duplicates, among other issues.

In [None]:
# Filter the dataset
# Arguments: source file, target file, source language, target language
# The merged source side mixes several languages, so the generic label
# "source" is passed instead of a language code.
# NOTE(review): the labels appear to be used as output suffixes (this run
# saved merged.source-filtered.source and merged.vi-filtered.vi) -- confirm
# against filter.py whether they affect anything else.
!python3 MT-Preparation/filtering/filter.py merged.source merged.vi source vi

Dataframe shape (rows, columns): (794115, 2)
--- Rows with Empty Cells Deleted	--> Rows: 786386
--- Duplicates Deleted			--> Rows: 776629
--- Source-Copied Rows Deleted		--> Rows: 776629
--- Too Long Source/Target Deleted	--> Rows: 728325
--- HTML Removed			--> Rows: 728325
--- Rows will remain in true-cased	--> Rows: 728325
--- Rows with Empty Cells Deleted	--> Rows: 728325
--- Rows Shuffled			--> Rows: 728325
--- Source Saved: merged.source-filtered.source
--- Target Saved: merged.vi-filtered.vi


# Tokenization / Sub-wording



In [None]:
# List the sub-wording scripts available in MT-Preparation
!ls MT-Preparation/subwording/

1-train_bpe.py	1-train_unigram.py  2-subword.py  3-desubword.py


In [None]:
# Train SentencePiece unigram models for subword tokenization.
# Arguments: filtered source file, filtered target file.
# In this run the working directory afterwards contains source.model /
# source.vocab and target.model / target.vocab.
!python MT-Preparation/subwording/1-train_unigram.py merged.source-filtered.source merged.vi-filtered.vi

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=merged.source-filtered.source --model_prefix=source --vocab_size=150000 --hard_vocab_limit=false --split_digits=true
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: merged.source-filtered.source
  input_format: 
  model_prefix: source
  model_type: UNIGRAM
  vocab_size: 150000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 1
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 0
  use_all_vocab: 0
  unk_id: 0

In [None]:
# List the working directory to confirm the model and vocab files were written
!ls

de-vi.txt.zip		       merged.vi	      target.model	 TED2020.en-vi.vi
en-vi.txt.zip		       merged.vi-filtered.vi  target.vocab	 TED2020.en-vi.xml
fr-vi.txt.zip		       MT-Preparation	      TED2020.de-vi.de	 TED2020.fr-vi.fr
LICENSE			       README		      TED2020.de-vi.vi	 TED2020.fr-vi.vi
merged.source		       source.model	      TED2020.de-vi.xml  TED2020.fr-vi.xml
merged.source-filtered.source  source.vocab	      TED2020.en-vi.en


In [None]:
# Subword the dataset
# Arguments: source model, target model, source dataset, target dataset
# Outputs are written next to the inputs with a ".subword" suffix.
!python3 MT-Preparation/subwording/2-subword.py source.model target.model merged.source-filtered.source merged.vi-filtered.vi

Source Model: source.model
Target Model: target.model
Source Dataset: merged.source-filtered.source
Target Dataset: merged.vi-filtered.vi
Done subwording the source file! Output: merged.source-filtered.source.subword
Done subwording the target file! Output: merged.vi-filtered.vi.subword


In [None]:
# First 3 lines before subwording
# (source side first, then a separator, then the Vietnamese side)
!head -n 3 merged.source-filtered.source && echo "-----" && head -n 3 merged.vi-filtered.vi

Ich weiß nicht, wie Sie es sehen, aber ich stelle mir dabei eine comicreife Szene vor. Etwa einen Mann, der den Gehweg entlanggeht, versehentlich in einen offenen Gullyschacht tritt und in den darunterliegenden Abwasserkanal fällt.
You go straight home to Jesus."
Das ist sinnlos." Du musst einen Weg zu finden, um schneller an Geld zu kommen.
-----
Không bạn nghĩ thế nào nhưng khi tôi dùng ẩn dụ này, cái mà tôi hình dung ra là một bộ phim hoạt hình kiểu có 1 người đàn ông, anh ta đang đi bộ trên vỉa hè, mà không nhận ra rằng, anh ta đang băng qua 1 hố ga mở nắp và anh ta rơi xuống cái cống bên dưới 
Bác hãy đi đến ngôi nhà của Chúa." 
Không thể hiểu nổi." Bạn có thể tìm ra 1 cách kiếm tiềm nhanh hơn. 


In [None]:
# First 3 lines after subwording
# (source side first, then a separator, then the Vietnamese side)
!head -n 3 merged.source-filtered.source.subword && echo "---" && head -n 3 merged.vi-filtered.vi.subword

▁Ich ▁weiß ▁nicht , ▁wie ▁Sie ▁es ▁sehen , ▁aber ▁ich ▁stelle ▁mir ▁dabei ▁eine ▁comic reife ▁Szene ▁vor . ▁Etwa ▁einen ▁Mann , ▁der ▁den ▁Gehweg ▁entlanggeht , ▁versehentlich ▁in ▁einen ▁offenen ▁Gully schacht ▁tritt ▁und ▁in ▁den ▁darunter liegenden ▁Abwasserkan al ▁fällt .
▁You ▁go ▁straight ▁home ▁to ▁Jesus ."
▁Das ▁ist ▁sinnlos ." ▁Du ▁musst ▁einen ▁Weg ▁zu ▁finden , ▁um ▁schneller ▁an ▁Geld ▁zu ▁kommen .
---
▁Không ▁bạn ▁nghĩ ▁thế ▁nào ▁nhưng ▁khi ▁tôi ▁dùng ▁ẩn ▁dụ ▁này , ▁cái ▁mà ▁tôi ▁hình ▁dung ▁ra ▁là ▁một ▁bộ ▁phim ▁hoạt ▁hình ▁kiểu ▁có ▁ 1 ▁người ▁đàn ▁ông , ▁anh ▁ta ▁đang ▁đi ▁bộ ▁trên ▁v ỉa ▁hè , ▁mà ▁không ▁nhận ▁ra ▁rằng , ▁anh ▁ta ▁đang ▁băng ▁qua ▁ 1 ▁hố ▁ga ▁mở ▁nắp ▁và ▁anh ▁ta ▁rơi ▁xuống ▁cái ▁cống ▁bên ▁dưới
▁Bác ▁hãy ▁đi ▁đến ▁ngôi ▁nhà ▁của ▁Chúa ."
▁Không ▁thể ▁hiểu ▁nổi ." ▁Bạn ▁có ▁thể ▁tìm ▁ra ▁ 1 ▁cách ▁kiếm ▁tiềm ▁nhanh ▁hơn .


# Data Splitting

Split our dataset into 3 portions:

1. training dataset - used for training the model;
2. development dataset - used to run regular validations during the training to help improve the model parameters;
3. testing dataset - a holdout dataset used after the model finishes training to finally evaluate the model on unseen data.

In [None]:
# Split the dataset into training set, development set, and test set
# Development and test sets should be between 1000 and 5000 segments (here we chose 2000)
# Arguments: two segment counts (both 2000 here), subworded source file,
# subworded target file. Outputs get ".train", ".dev" and ".test" suffixes.
!python MT-Preparation/train_dev_split/train_dev_test_split.py 2000 2000 merged.source-filtered.source.subword merged.vi-filtered.vi.subword

Dataframe shape: (728325, 2)
--- Empty Cells Deleted --> Rows: 728325
--- Wrote Files
Done!
Output files
merged.source-filtered.source.subword.train
merged.vi-filtered.vi.subword.train
merged.source-filtered.source.subword.dev
merged.vi-filtered.vi.subword.dev
merged.source-filtered.source.subword.test
merged.vi-filtered.vi.subword.test


In [None]:
# Line count for the subworded train, dev, and test datasets
!wc -l *.subword.*

     2000 merged.source-filtered.source.subword.dev
     2000 merged.source-filtered.source.subword.test
   724325 merged.source-filtered.source.subword.train
     2000 merged.vi-filtered.vi.subword.dev
     2000 merged.vi-filtered.vi.subword.test
   724325 merged.vi-filtered.vi.subword.train
  1456650 total


In [None]:
# Check the first and last line from each dataset.
# The globs are spelled out instead of using `*.{train,dev,test}`: inside an
# IPython `!` command, `{...}` is first tried as a Python expression, so the
# brace form only reaches the shell unchanged while no kernel variables named
# train, dev and test exist. The file order is the same as with the braces.

!echo "---First line---"
!head -n 1 *.train *.dev *.test

!echo -e "\n---Last line---"
!tail -n 1 *.train *.dev *.test

My name is: DIO 

---First line---
==> merged.source-filtered.source.subword.train <==
▁Ich ▁weiß ▁nicht , ▁wie ▁Sie ▁es ▁sehen , ▁aber ▁ich ▁stelle ▁mir ▁dabei ▁eine ▁comic reife ▁Szene ▁vor . ▁Etwa ▁einen ▁Mann , ▁der ▁den ▁Gehweg ▁entlanggeht , ▁versehentlich ▁in ▁einen ▁offenen ▁Gully schacht ▁tritt ▁und ▁in ▁den ▁darunter liegenden ▁Abwasserkan al ▁fällt .

==> merged.vi-filtered.vi.subword.train <==
▁Không ▁bạn ▁nghĩ ▁thế ▁nào ▁nhưng ▁khi ▁tôi ▁dùng ▁ẩn ▁dụ ▁này , ▁cái ▁mà ▁tôi ▁hình ▁dung ▁ra ▁là ▁một ▁bộ ▁phim ▁hoạt ▁hình ▁kiểu ▁có ▁ 1 ▁người ▁đàn ▁ông , ▁anh ▁ta ▁đang ▁đi ▁bộ ▁trên ▁v ỉa ▁hè , ▁mà ▁không ▁nhận ▁ra ▁rằng , ▁anh ▁ta ▁đang ▁băng ▁qua ▁ 1 ▁hố ▁ga ▁mở ▁nắp ▁và ▁anh ▁ta ▁rơi ▁xuống ▁cái ▁cống ▁bên ▁dưới

==> merged.source-filtered.source.subword.dev <==
▁And ▁some ▁of ▁them ▁are ▁asked ▁to ▁do ▁the ▁task ▁right ▁away .

==> merged.vi-filtered.vi.subword.dev <==
▁Và ▁một ▁số ▁người ▁được ▁yêu ▁cầu ▁thực ▁hiện ▁công ▁việc ▁luôn .

==> merged.source-filtered.source.sub

# Mount your drive to save your data



In [None]:
# Google Colab only: mount Google Drive at /content/drive so the prepared
# data can be stored there by the next cell.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Move your data to your Google Drive
# `%cd ..` steps out of the nmt directory first (to /content in the recorded
# run) so that the directory itself can be moved.
# NOTE(review): if /content/drive/MyDrive/nmt already exists, `mv` will
# presumably nest the directory inside it or fail -- confirm before re-running.
%cd ..
!mv nmt /content/drive/MyDrive/

/content
