In [None]:
!pip install sentencepiece sacremoses -q

In [None]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

In [None]:
def load_and_preview(file_path):
    """Read a UTF-8 text file and print a short preview of it.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file content as a list of lines; line terminators are kept.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        lines = list(handle)

    line_total = len(lines)
    print(f"Total lines in {file_path}: {line_total}")

    preview = lines[:3]
    print("First three lines:", preview)
    return lines

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive inside the Colab runtime.
drive.mount('/content/drive')


In [None]:
import zipfile

# Location of the zipped corpus archive.
file_path = '/content/NLP_CA5_raw.data/en-fa.txt (1).zip'


# Unpack every member of the archive into /content/.
with zipfile.ZipFile(file_path, 'r') as zip_ref:
    zip_ref.extractall('/content/')


In [None]:
# NOTE: this re-defines the load_and_preview helper from an earlier cell with
# the same behavior; the definition executed last is the one in effect.
def load_and_preview(file_path):
    """Load a UTF-8 text file as a list of lines and print a brief summary.

    Args:
        file_path: Path of the text file to read.

    Returns:
        List of the file's lines, each still carrying its line terminator.
    """
    with open(file_path, 'r', encoding='utf-8') as text_file:
        lines = [row for row in text_file]

    print(f"Total lines in {file_path}: {len(lines)}")
    head = lines[:3]
    print("First three lines:", head)
    return lines

In [None]:
# Paths of the English and Farsi sides of the extracted corpus.
file_path_en = '/content/MIZAN.en-fa.en'
file_path_fa = '/content/MIZAN.en-fa.fa'
# Read both sides into memory (each call also prints a line count and preview).
english_lines = load_and_preview(file_path_en)
farsi_lines = load_and_preview(file_path_fa)

In [None]:
def tokenize_and_histogram(lines, language):
    """Count whitespace-separated tokens per line and plot their distribution.

    Args:
        lines: Iterable of text lines.
        language: Label inserted into the plot title.

    Returns:
        List with the token count of every line, in input order.
    """
    token_counts = []
    for sentence in lines:
        token_counts.append(len(sentence.split()))

    plt.hist(token_counts, bins=30, alpha=0.7)
    plt.title(f'Token count histogram for {language}')
    plt.xlabel('Number of tokens')
    plt.ylabel('Number of lines')
    plt.show()

    return token_counts

# Per-line token counts for each language (a histogram is shown for each).
en_token_counts = tokenize_and_histogram(english_lines, 'English')
fa_token_counts = tokenize_and_histogram(farsi_lines, 'Farsi')

In [None]:
def filter_data(en_lines, fa_lines, en_token_counts, fa_token_counts,
                min_tokens=10, max_tokens=50):
    """Keep the sentence pairs whose Farsi side has an acceptable token count.

    Args:
        en_lines: English lines, aligned with ``fa_lines``.
        fa_lines: Farsi lines.
        en_token_counts: Token count of every English line.
        fa_token_counts: Token count of every Farsi line.
        min_tokens: Smallest Farsi token count that is kept (inclusive).
        max_tokens: Largest Farsi token count that is kept (inclusive).

    Returns:
        Tuple ``(filtered_en, filtered_fa)`` of aligned lists.
    """
    filtered_en = []
    filtered_fa = []
    # NOTE(review): only the Farsi count decides whether a pair is kept; the
    # English count is iterated but never consulted — confirm this is intended.
    for en, fa, _en_count, fa_count in zip(en_lines, fa_lines, en_token_counts, fa_token_counts):
        if min_tokens <= fa_count <= max_tokens:
            filtered_en.append(en)
            filtered_fa.append(fa)
    print(f"Number of rows after filtering: {len(filtered_en)}")
    return filtered_en, filtered_fa

# Drop the sentence pairs rejected by filter_data.
filtered_english, filtered_farsi = filter_data(english_lines, farsi_lines, en_token_counts, fa_token_counts)

In [None]:
def shuffle_and_split(en_data, fa_data, seed=42,
                      train_size=500000, valid_size=5000, test_size=10000):
    """Shuffle two aligned corpora together and cut train/valid/test splits.

    Args:
        en_data: English lines.
        fa_data: Farsi lines, aligned with ``en_data``.
        seed: Random state passed to the shuffle, for reproducibility.
        train_size: Number of pairs in the training split.
        valid_size: Number of pairs in the validation split.
        test_size: Number of pairs in the test split.

    Returns:
        ``(train_en, valid_en, test_en, train_fa, valid_fa, test_fa)``.
    """
    import warnings

    # Shuffling both lists in one call applies the same permutation to each.
    en_data, fa_data = shuffle(en_data, fa_data, random_state=seed)

    valid_end = train_size + valid_size
    test_end = valid_end + test_size
    if len(en_data) < test_end:
        # Slicing past the end is silent, so make short/empty splits visible.
        warnings.warn(
            f"Only {len(en_data)} sentence pairs available but {test_end} were "
            "requested; some splits will be shorter than requested or empty."
        )

    train_en, valid_en, test_en = en_data[:train_size], en_data[train_size:valid_end], en_data[valid_end:test_end]
    train_fa, valid_fa, test_fa = fa_data[:train_size], fa_data[train_size:valid_end], fa_data[valid_end:test_end]
    return train_en, valid_en, test_en, train_fa, valid_fa, test_fa

# Shuffle the filtered pairs and cut them into train/validation/test splits.
train_en, valid_en, test_en, train_fa, valid_fa, test_fa = shuffle_and_split(filtered_english, filtered_farsi)

In [None]:
def write_to_files(data, file_name):
    """Write the given lines to a UTF-8 text file, one entry per line.

    Args:
        data: Iterable of strings, with or without a trailing newline.
        file_name: Destination path; an existing file is overwritten.
    """
    with open(file_name, 'w', encoding='utf-8') as file:
        # writelines() adds no separators. A line that lacks its newline (e.g.
        # the last line of the source file, moved mid-list by shuffling) would
        # fuse with the next one and shift the parallel files out of alignment.
        file.writelines(line if line.endswith('\n') else line + '\n' for line in data)

output_folder = '/content/'

# Each split paired with the file it is saved to.
split_outputs = [
    (train_en, f'{output_folder}train.en'),
    (valid_en, f'{output_folder}valid.en'),
    (test_en, f'{output_folder}test.en'),
    (train_fa, f'{output_folder}train.fa'),
    (valid_fa, f'{output_folder}valid.fa'),
    (test_fa, f'{output_folder}test.fa'),
]
for split_lines, split_path in split_outputs:
    write_to_files(split_lines, split_path)

print("Data processing complete and files saved.")

Second

In [None]:
# Clone the fairseq repository and install it in editable mode from the checkout.
!git clone https://github.com/pytorch/fairseq
!cd fairseq && pip install --editable ./

In [None]:
import sentencepiece as spm

In [None]:
def bpe_tokenizer(input, model_prefix, vocab = 10000):
  """Train a SentencePiece BPE model on a text file.

  Args:
      input: Path of the training text file.
      model_prefix: Prefix of the files produced by the trainer.
      vocab: Vocabulary size of the model.
  """
  # Keyword arguments instead of one space-separated flag string, so that
  # paths containing spaces reach the trainer intact.
  spm.SentencePieceTrainer.Train(
      input=input,
      model_prefix=model_prefix,
      vocab_size=vocab,
      model_type='bpe',
  )

# Train one BPE model per language, each on its own training split.
bpe_tokenizer('/content/train.fa', 'bpe_persian')
bpe_tokenizer('/content/train.en', 'bpe_english')

In [None]:
import sentencepiece as spm

def encode_file(sp_model, input_file, output_file):
    """Encode a text file line by line with a SentencePiece model.

    Args:
        sp_model: Path of the SentencePiece model file.
        input_file: Path of the UTF-8 text file to encode.
        output_file: Path that receives the space-joined pieces, one input
            line per output line.
    """
    processor = spm.SentencePieceProcessor(model_file=sp_model)

    with open(input_file, 'r', encoding='utf-8') as reader:
        with open(output_file, 'w', encoding='utf-8') as writer:
            for raw_line in reader:
                pieces = processor.encode(raw_line, out_type=str)
                print(' '.join(pieces), file=writer)

# Encode every split: (model file, plain-text input, encoded output).
encoding_jobs = [
    ('/content/bpe_persian.model', '/content/train.fa', '/content/train.bpe.fa'),
    ('/content/bpe_persian.model', '/content/valid.fa', '/content/valid.bpe.fa'),
    ('/content/bpe_persian.model', '/content/test.fa', '/content/test.bpe.fa'),
    ('/content/bpe_english.model', '/content/train.en', '/content/train.bpe.en'),
    ('/content/bpe_english.model', '/content/valid.en', '/content/valid.bpe.en'),
    ('/content/bpe_english.model', '/content/test.en', '/content/test.bpe.en'),
]
for model_file, plain_path, encoded_path in encoding_jobs:
    encode_file(model_file, plain_path, encoded_path)


#### Preprocess Data Using Fairseq


In [None]:
# Preprocess the BPE-encoded train/valid/test files for the fa -> en direction
# and write the result to /content/data-bin/.
!fairseq-preprocess --source-lang fa --target-lang en \
    --trainpref /content/train.bpe --validpref /content/valid.bpe --testpref /content/test.bpe \
    --destdir /content/data-bin/ \
    --nwordssrc 10000 --nwordstgt 10000

#### Model Training LSTM Encoder-Decoder

In [None]:
# Train the LSTM encoder-decoder (6 layers on each side) on /content/data-bin/
# for at most 5 epochs. Checkpoints are saved to /content/checkpoints/ and
# TensorBoard logs to /content/tensorboard_logs/.
!CUDA_VISIBLE_DEVICES=0 fairseq-train /content/data-bin/ \
    --arch lstm --encoder-layers 6 --decoder-layers 6 \
    --optimizer adam --adam-betas '(0.9, 0.98)' \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
    --lr 0.001 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-7 \
    --dropout 0.1 --weight-decay 0.0001 \
    --max-tokens 1000 --batch-size 32 \
    --save-dir /content/checkpoints/ \
    --max-epoch 5 --save-interval 1 --keep-best-checkpoints 1 \
    --no-epoch-checkpoints \
    --tensorboard-logdir /content/tensorboard_logs/ \
    --fp16

In [None]:
# Open TensorBoard on the log directory written by the LSTM training run.
%load_ext tensorboard
%tensorboard --logdir '/content/tensorboard_logs'

Third

In [None]:
# Train the Transformer (6 encoder and 6 decoder layers) on /content/data-bin/
# for at most 5 epochs. Checkpoints are saved to /content/checkpoints_transformer/
# and TensorBoard logs to /content/tensorboard_logs_transformer/.
!CUDA_VISIBLE_DEVICES=0 fairseq-train /content/data-bin/ \
    --arch transformer --encoder-layers 6 --decoder-layers 6 \
    --share-decoder-input-output-embed \
    --optimizer adam --adam-betas '(0.9, 0.98)' \
    --lr 0.001 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-7 \
    --dropout 0.1 --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
    --max-tokens 1000 \
    --save-dir /content/checkpoints_transformer/ \
    --max-epoch 5 --save-interval 1 --keep-best-checkpoints 1 \
    --tensorboard-logdir /content/tensorboard_logs_transformer/ \
    --fp16


In [None]:
# Open TensorBoard on the log directory written by the Transformer training run.
%load_ext tensorboard
%tensorboard --logdir /content/tensorboard_logs_transformer/

Fourth

In [None]:
!pip install unbabel-comet

In [None]:
!fairseq-generate /content/data-bin/ \
    --path /content/checkpoints/checkpoint_best.pt \
    --batch-size 64 --beam 5 \
    --remove-bpe \
    > lstm_output.txt

!fairseq-generate /content/data-bin/ \
    --path /content/checkpoints_transformer/checkpoint_best.pt \
    --batch-size 64 --beam 5 \
    --remove-bpe \
    > transformer_output.txt

In [None]:
from comet import download_model, load_from_checkpoint

# Download the "wmt20-comet-da" checkpoint and load it as the scoring model.
# NOTE(review): the accepted model identifier may depend on the installed
# unbabel-comet release — confirm this name resolves.
model_path = download_model("wmt20-comet-da")
comet_model = load_from_checkpoint(model_path)

def _extract_fairseq_hypotheses(lines):
    """Pull the hypotheses out of a fairseq-generate log, in sample-id order.

    Hypothesis records look like ``H-<id>\\t<score>\\t<text>``. When no such
    record is found the input is assumed to be a plain file with one
    hypothesis per line and is returned unchanged.
    """
    indexed = []
    for line in lines:
        if not line.startswith('H-'):
            continue
        fields = line.rstrip('\n').split('\t', 2)
        if len(fields) < 2:
            continue
        try:
            sample_id = int(fields[0][2:])
        except ValueError:
            continue
        # An empty translation leaves no text field after the score.
        text = fields[2] if len(fields) == 3 else ''
        indexed.append((sample_id, text))

    if not indexed:
        return lines

    # The log is not written in test-set order; restore it via the sample id.
    indexed.sort(key=lambda item: item[0])
    return [text for _, text in indexed]


def evaluate_with_comet(source_file, target_file, hypothesis_file, comet_model):
    """Score machine translations with a COMET model.

    Args:
        source_file: File with the source sentences, one per line.
        target_file: File with the reference translations, one per line.
        hypothesis_file: Either a plain file with one translation per line or
            the captured output of ``fairseq-generate`` (its ``H-`` records are
            extracted and re-ordered by sample id).
        comet_model: Object exposing ``predict(data, batch_size=..., gpus=...)``.

    Returns:
        Whatever ``comet_model.predict`` returns.

    Raises:
        ValueError: If the three inputs do not contain the same number of
            sentences, since line-by-line pairing would then be misaligned.
    """
    with open(source_file, 'r', encoding='utf-8') as src_handle, \
            open(target_file, 'r', encoding='utf-8') as tgt_handle, \
            open(hypothesis_file, 'r', encoding='utf-8') as hyp_handle:
        sources = src_handle.readlines()
        targets = tgt_handle.readlines()
        hypotheses = _extract_fairseq_hypotheses(hyp_handle.readlines())

    if not (len(sources) == len(targets) == len(hypotheses)):
        raise ValueError(
            f"Sentence count mismatch: {len(sources)} sources, "
            f"{len(targets)} references, {len(hypotheses)} hypotheses"
        )

    data = [{"src": source.strip(), "mt": hypothesis.strip(), "ref": reference.strip()}
            for source, hypothesis, reference in zip(sources, hypotheses, targets)]

    scores = comet_model.predict(data, batch_size=32, gpus=1)
    return scores

# Score both systems on the test set. The data was preprocessed with
# --source-lang fa --target-lang en, so the Farsi file is the COMET source and
# the English file is the reference.
comet_scores_lstm = evaluate_with_comet('/content/test.fa', '/content/test.en', 'lstm_output.txt', comet_model)
comet_scores_transformer = evaluate_with_comet('/content/test.fa', '/content/test.en', 'transformer_output.txt', comet_model)

print("COMET scores for LSTM model:", comet_scores_lstm)
print("COMET scores for Transformer model:", comet_scores_transformer)

In [None]:
# Display the 'system_score' entry of the LSTM COMET result.
comet_scores_lstm['system_score']

In [None]:
# Display the 'system_score' entry of the Transformer COMET result.
comet_scores_transformer['system_score']