# Machine Translation

NLP - Spring Semester of 2024 at University of Tehran - CA5

In [None]:
!pip install fairseq
!pip install sentencepiece
!pip install sacremoses
!pip install tensorboardX

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from urllib.request import urlretrieve
import zipfile
import os
import shutil

import sentencepiece as spm

## Dataset

In [None]:
dataset_url = "https://object.pouta.csc.fi/OPUS-MIZAN/v1/moses/en-fa.txt.zip"
dataset_file_name = "moses_en-fa.zip"

# Download the archive only when it is not already on disk, so re-running the
# notebook does not fetch it again. The download goes to a temporary name first
# and is renamed on success, so an interrupted download never leaves a partial
# file under the final name.
if not os.path.exists(dataset_file_name):
    partial_download_name = dataset_file_name + ".part"
    urlretrieve(dataset_url, partial_download_name)
    os.replace(partial_download_name, dataset_file_name)

TEMP_DIR = 'temp'
# exist_ok=True replaces the separate "exists?" check followed by mkdir.
os.makedirs(TEMP_DIR, exist_ok=True)

with zipfile.ZipFile(dataset_file_name, 'r') as zip_file:
    zip_file.extractall(TEMP_DIR)

Now that we have downloaded the file, we'll take a peek into it. Let's count the lines and look at the first 3 lines of each file.

In [None]:
ENGLISH_FILE = os.path.join(TEMP_DIR, "MIZAN.en-fa.en")
PERSIAN_FILE = os.path.join(TEMP_DIR, "MIZAN.en-fa.fa")


def read_corpus_lines(path):
    """Return the lines of the text file at `path`, without their trailing newline.

    The file is decoded explicitly as UTF-8 so that the result does not depend
    on the platform's default encoding.
    """
    with open(path, 'r', encoding='utf-8') as corpus_file:
        return [line.rstrip('\n') for line in corpus_file]


english_lines = read_corpus_lines(ENGLISH_FILE)
persian_lines = read_corpus_lines(PERSIAN_FILE)

# Line i of one file is paired with line i of the other in the DataFrame below,
# so a length mismatch means the pairs cannot be built.
if len(english_lines) != len(persian_lines):
    raise ValueError(
        f'Line count mismatch: {len(english_lines)} English lines vs '
        f'{len(persian_lines)} Persian lines'
    )

# The extracted files are no longer needed once they are in memory.
shutil.rmtree(TEMP_DIR, ignore_errors=True)

df = pd.DataFrame(
    {
        'english': english_lines,
        'persian': persian_lines
    }
)

print(f'num of lines: {len(df)}')
print(df.head(3))

Now let's tokenize each line based on white spaces.

In [None]:
# str.split() with no separator splits each sentence on runs of whitespace.
for language in ('english', 'persian'):
    df[f'{language}_tokenized'] = df[language].str.split()

In [None]:
# Number of whitespace tokens per sentence, for each language.
df['english_token_count'] = df['english_tokenized'].map(len)
df['persian_token_count'] = df['persian_tokenized'].map(len)

# One histogram panel per language, side by side, with one bin per token count.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
panels = (('English', 'english_token_count'), ('Persian', 'persian_token_count'))
for ax, (language_label, count_column) in zip(axes, panels):
    longest = max(df[count_column])
    ax.hist(df[count_column], bins=range(1, longest + 2), edgecolor='black')
    ax.set_xlabel('Token Count')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Histogram of Token Count for {language_label}')
    ax.set_xticks(range(1, longest + 2, 10))

fig.tight_layout()
plt.show()

We have about 1M lines, which is a lot. We'll keep only the pairs whose Persian sentence has between 10 and 50 tokens (inclusive), dropping the ones with fewer than 10 or more than 50 tokens.

In [None]:
# Keep the rows whose Persian token count lies in [10, 50]; `between` is
# inclusive on both ends by default. Only the Persian side is checked.
persian_length_ok = df['persian_token_count'].between(10, 50)
df = df[persian_length_ok]

Create the train, test, eval splits and store them in separate files.

In [None]:
N_TRAIN = 500000
N_TEST = 10000
N_EVAL = 5000
SPLIT_SEED = 42  # fixed seed so a fresh run of the notebook yields the same splits

if len(df) < N_TRAIN + N_TEST + N_EVAL:
    # Slicing past the end silently yields shorter (possibly empty) splits.
    print(f'warning: only {len(df)} rows available for {N_TRAIN + N_TEST + N_EVAL} requested')

# Shuffle all rows, then cut consecutive, non-overlapping ranges.
df = df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
# .copy() makes each split an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
train_df = df.iloc[:N_TRAIN].copy()
test_df = df.iloc[N_TRAIN:N_TRAIN + N_TEST].copy()
eval_df = df.iloc[N_TRAIN + N_TEST:N_TRAIN + N_TEST + N_EVAL].copy()

SAVE_DIR = 'raw_data'
os.makedirs(SAVE_DIR, exist_ok=True)


def write_sentences(sentences, path):
    """Write `sentences` to `path` as UTF-8 plain text, one sentence per line.

    Plain writes are used instead of `Series.to_csv`, which would add an index
    column, a header row and CSV quoting to the corpus files.
    """
    with open(path, 'w', encoding='utf-8') as out_file:
        for sentence in sentences:
            # Normalise to exactly one trailing newline per sentence.
            out_file.write(sentence.rstrip('\n') + '\n')


for split_name, split_df in (('train', train_df), ('test', test_df), ('valid', eval_df)):
    for language_code, column in (('fa', 'persian'), ('en', 'english')):
        write_sentences(split_df[column], os.path.join(SAVE_DIR, f'{split_name}.{language_code}'))

## Training the Tokenizer and Preprocessing

Let's use the sentencepiece Python API (`SentencePieceTrainer`) to train a BPE model for each language.

In [None]:
PERSIAN_TOKENIZER_NAME = 'persian_bpe'
persian_model_path = f'{PERSIAN_TOKENIZER_NAME}.model'

# Training is skipped when a model file with this name is already on disk.
if not os.path.exists(persian_model_path):
    persian_trainer_options = dict(
        input=os.path.join(SAVE_DIR, 'train.fa'),
        model_prefix=PERSIAN_TOKENIZER_NAME,
        vocab_size=10000,
        model_type='bpe',
    )
    spm.SentencePieceTrainer.train(**persian_trainer_options)

In [None]:
ENGLISH_TOKENIZER_NAME = 'english_bpe'
english_model_path = f'{ENGLISH_TOKENIZER_NAME}.model'

# Training is skipped when a model file with this name is already on disk.
if not os.path.exists(english_model_path):
    english_trainer_options = dict(
        input=os.path.join(SAVE_DIR, 'train.en'),
        model_prefix=ENGLISH_TOKENIZER_NAME,
        vocab_size=10000,
        model_type='bpe',
    )
    spm.SentencePieceTrainer.train(**english_trainer_options)

Use the trained model to tokenize the files.

In [None]:
# Load the Persian BPE model. The file name is built from PERSIAN_TOKENIZER_NAME
# (the prefix used for training) instead of repeating the literal, so the two
# cannot drift apart.
persian_tokenizer = spm.SentencePieceProcessor()
persian_tokenizer.load(f'{PERSIAN_TOKENIZER_NAME}.model')

# Sanity check: show how a sample sentence is split into pieces.
print(persian_tokenizer.encode_as_pieces('سلام به دنیای پردازش زبان طبیعی!'))

In [None]:
# Load the English BPE model. The file name is built from ENGLISH_TOKENIZER_NAME
# (the prefix used for training) instead of repeating the literal, so the two
# cannot drift apart.
english_tokenizer = spm.SentencePieceProcessor()
english_tokenizer.load(f'{ENGLISH_TOKENIZER_NAME}.model')

# Sanity check: show how a sample sentence is split into pieces.
print(english_tokenizer.encode_as_pieces('Hello Natural Language Processing world!'))

In [None]:
def add_bpe_columns(split_df):
    """Return a new frame with `persian_tokenized` / `english_tokenized` set to BPE pieces.

    Each language is encoded with its own tokenizer. `assign` builds a new
    DataFrame, so nothing is written into a slice of another frame.
    """
    return split_df.assign(
        persian_tokenized=split_df['persian'].apply(persian_tokenizer.encode_as_pieces),
        english_tokenized=split_df['english'].apply(english_tokenizer.encode_as_pieces),
    )


def write_token_lines(token_lists, path):
    """Write one line per sentence to `path` (UTF-8): its pieces joined by single spaces.

    Plain writes are used instead of `Series.to_csv`, which would emit an index
    column, a header row and the Python repr of each list.
    """
    with open(path, 'w', encoding='utf-8') as out_file:
        for pieces in token_lists:
            out_file.write(' '.join(pieces) + '\n')


train_df = add_bpe_columns(train_df)
test_df = add_bpe_columns(test_df)
eval_df = add_bpe_columns(eval_df)

TOKENIZED_DIR = 'tokenized_data'
os.makedirs(TOKENIZED_DIR, exist_ok=True)

for split_name, split_df in (('train', train_df), ('test', test_df), ('valid', eval_df)):
    for language_code, column in (('fa', 'persian_tokenized'), ('en', 'english_tokenized')):
        write_token_lines(split_df[column], os.path.join(TOKENIZED_DIR, f'{split_name}.{language_code}'))

In [None]:
# Binarize the tokenized train/valid/test files (en -> fa) into data-bin.
!fairseq-preprocess \
  --source-lang en \
  --target-lang fa \
  --trainpref tokenized_data/train \
  --validpref tokenized_data/valid \
  --testpref tokenized_data/test \
  --destdir data-bin \
  --workers 20 \
  --nwordssrc 10000 \
  --nwordstgt 10000

## Training LSTM Encoder-Decoder model

Now that we have our data preprocessed and ready, let's train our LSTM encoder-decoder model.

In [None]:
# Train the LSTM model on data-bin; checkpoints go to checkpoints/lstm and
# TensorBoard logs to logs/lstm.
!fairseq-train data-bin \
  --arch lstm \
  --encoder-bidirectional \
  --encoder-layers 6 \
  --decoder-layers 6 \
  --optimizer adam \
  --adam-betas '(0.9, 0.98)' \
  --lr 0.001 \
  --max-tokens 4000 \
  --criterion label_smoothed_cross_entropy \
  --label-smoothing 0.2 \
  --save-dir checkpoints/lstm \
  --tensorboard-logdir logs/lstm \
  --max-epoch 5

In [None]:
# Train the transformer model on data-bin; checkpoints go to
# checkpoints/transformer and TensorBoard logs to logs/transformer.
# Nesterov momentum is selected with fairseq's `nag` optimizer; the `sgd`
# optimizer does not define a `--nesterov` option.
!fairseq-train data-bin \
  --arch transformer --encoder-layers 6 --decoder-layers 6 \
  --optimizer nag --momentum 0.99 --lr 0.001 \
  --max-tokens 4000 \
  --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
  --save-dir checkpoints/transformer \
  --tensorboard-logdir logs/transformer \
  --max-epoch 5

## Qualifying and Testing the Models

Use `fairseq-generate` to get test results of our models.

In [None]:
# Translate with the best LSTM checkpoint and save the log.
# The data was segmented with sentencepiece, so the post-processing mode must be
# named explicitly: a bare `--remove-bpe` strips the "@@ " continuation marker
# instead of the sentencepiece "▁" word-boundary symbol.
!fairseq-generate data-bin \
  --path checkpoints/lstm/checkpoint_best.pt \
  --beam 5 \
  --batch-size 32 \
  --remove-bpe sentencepiece > lstm-result.txt

In [None]:
# Translate with the best transformer checkpoint and save the log.
# The data was segmented with sentencepiece, so the post-processing mode must be
# named explicitly: a bare `--remove-bpe` strips the "@@ " continuation marker
# instead of the sentencepiece "▁" word-boundary symbol.
!fairseq-generate data-bin \
  --path checkpoints/transformer/checkpoint_best.pt \
  --beam 5 \
  --batch-size 32 \
  --remove-bpe sentencepiece > transformer-result.txt

We'll also use `unbabel-comet` to compute the COMET metric for our trained models.

In [None]:
!pip install unbabel-comet

In [None]:
def extract_hypotheses(fairseq_output_file, hypothesis_file, source_file, target_file):
    """Split a `fairseq-generate` log into hypothesis, source and target text files.

    Lines are selected by prefix and written in the order they appear:
      * ``H-…`` lines: everything after the second tab goes to `hypothesis_file`;
      * ``S-…`` lines: everything after the first tab goes to `source_file`;
      * ``T-…`` lines: everything after the first tab goes to `target_file`.
    All other lines are ignored.

    All four files are handled as UTF-8 regardless of the platform default, and
    every written line ends with exactly one newline, even if the last line of
    the input has none.

    Args:
        fairseq_output_file: path of the generation log to read.
        hypothesis_file: path to write the hypotheses to.
        source_file: path to write the source sentences to.
        target_file: path to write the reference sentences to.

    Raises:
        IndexError: if a selected line has fewer tab-separated fields than expected.
    """
    with open(fairseq_output_file, 'r', encoding='utf-8') as infile, \
            open(hypothesis_file, 'w', encoding='utf-8') as hypothesis, \
            open(source_file, 'w', encoding='utf-8') as source, \
            open(target_file, 'w', encoding='utf-8') as target:
        for line in infile:
            line = line.rstrip('\n')
            if line.startswith('H-'):
                # maxsplit keeps any further tabs as part of the text field.
                hypothesis.write(line.split('\t', 2)[2] + '\n')
            elif line.startswith('S-'):
                source.write(line.split('\t', 1)[1] + '\n')
            elif line.startswith('T-'):
                target.write(line.split('\t', 1)[1] + '\n')

In [None]:
# Split each model's generation log into its hypothesis / source / target files.
for model_name in ('lstm', 'transformer'):
    extract_hypotheses(
        f'{model_name}-result.txt',
        f'{model_name}-hypothesis.txt',
        f'{model_name}-source.txt',
        f'{model_name}-target.txt',
    )

In [None]:
!comet-score -s lstm-source.txt -t lstm-hypothesis.txt -r lstm-target.txt
!comet-score -s transformer-source.txt -t transformer-hypothesis.txt -r transformer-target.txt