# Enhancing Machine Translation of News: Japanese to English Translation

## Experimental part - Not really meant to be run

### Imports

In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, MBartForConditionalGeneration, MBart50TokenizerFast
import torch
import evaluate
from tqdm import tqdm
import pandas as pd
from datasets import Dataset, load_from_disk

In [None]:
### ======================================================================
### OPTIONAL IMPORTS, SEE REPORT FOR DETAILS
### ======================================================================
# from googletrans import Translator
#
# import spacy
# import Mykytea
# import MeCab
#
# from sumy import Summarizer
# from sumy.parsers.plaintext import PlaintextParser
# from sumy.nlp.tokenizers import Tokenizer
# from sumy.nlp.stemmers import Stemmer
# from sumy.utils import get_stop_words
# from sumy.summarizers.lsa import LsaSummarizer
### ======================================================================

### Configuration

In [13]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Let pandas display up to 1000 characters per column so long sentences stay readable
pd.set_option('display.max_colwidth', 1000)

# NOTE(review): the paths below are absolute and machine-specific; adjust them before running elsewhere
# Cache directory passed to the model, tokenizer and metric loaders (the main disk lacked space)
cache_dir = '/work/victota/.cache'
# Directory where the dataset is stored, as a datasets.Dataset object
dataset_dir = '/work/victota/traintmp'
# Directory where the dataframe is stored, as a pandas.DataFrame object
dataframe_dir = '/work/victota/dataframe'
# Maximum length of the input sequence and the output sequence for the tokenizer and model generation
max_length = 256
# Number of beams for beam search during model generation, higher is slower but sometimes better in quality
num_beams = 4

# Will be MarianTokenizer
tokenizer = AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-ja-en', cache_dir=cache_dir)
# Will be MarianMTModel, moved to the device selected above
model = AutoModelForSeq2SeqLM.from_pretrained('Helsinki-NLP/opus-mt-ja-en', cache_dir=cache_dir).to(device)


### ======================================================================
### OPTIONAL, SEE REPORT FOR DETAILS
### ======================================================================
# tokenizer2 = MBart50TokenizerFast.from_pretrained('facebook/mbart-large-50', cache_dir=cache_dir, src_lang="ja_XX", tgt_lang="en_XX")               
# model2 = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-50', cache_dir=cache_dir).to(device)
# mk = Mykytea.Mykytea("-model /work/victota/TDT4310/Helsinki/jp-0.4.7-1.mod")
# nlp = spacy.load("ja_core_news_md")
# mecab = MeCab.Tagger("-Owakati")
# translator = Translator()
### ======================================================================

### Functions

In [11]:
def translate_text(model, tokenizer, input_text, is_split_into_words: bool):
    """
    Translate a single input text using model and tokenizer

    Uses the notebook-level settings `max_length` (truncation limit and generation limit)
    and `num_beams` (beam search width).

    Parameters:
        model: Model to use for translation
        tokenizer: Tokenizer to use for translation
        input_text: Text to translate
        is_split_into_words: If input text is already segmented

    Returns:
        Translated text
    """
    # Tokenize the input text and convert it to tensors (IDs)
    #     Will both SEGMENT and ENCODE if is_split_into_words is False
    #     Will only ENCODE if is_split_into_words is True
    # Only one sequence is encoded per call, so there is nothing to align it with:
    # padding it up to max_length would only add padded positions for the model to process.
    # The tensors are moved to the device of the model that was passed in, so the function
    # also works for a model that does not live on the notebook's default device.
    encoded = tokenizer(
        input_text,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=max_length,
        is_split_into_words=is_split_into_words,
    ).to(model.device)

    # Generate translation using model (beam search)
    generated = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )

    # Decode the first (only) output sequence to text
    return tokenizer.decode(generated[0], skip_special_tokens=True)


def calculate_bleu_score(predictions, references, max_order=4):
    """
    Evaluate predicted translations against their references with the BLEU metric

    Parameters:
        predictions: List of predicted translations
        references: List of reference translations
        max_order: Largest n-gram order taken into account, default is 4

    Returns:
        Result of the "bleu" metric computed on the predictions and references
    """
    metric = evaluate.load("bleu", cache_dir=cache_dir)
    result = metric.compute(
        predictions=predictions,
        references=references,
        max_order=max_order,
    )
    return result

def calculate_rouge_score(predictions, references):
    """
    Evaluate predicted translations against their references with the ROUGE metric

    Parameters:
        predictions: List of predicted translations
        references: List of reference translations

    Returns:
        Result of the "rouge" metric computed on the predictions and references
    """
    metric = evaluate.load("rouge", cache_dir=cache_dir)
    result = metric.compute(
        predictions=predictions,
        references=references,
    )
    return result

def calculate_chrf_score(predictions, references):
    """
    Evaluate predicted translations against their references with the chrF metric

    Parameters:
        predictions: List of predicted translations
        references: List of reference translations

    Returns:
        Result of the "chrf" metric computed on the predictions and references
    """
    metric = evaluate.load("chrf", cache_dir=cache_dir)
    result = metric.compute(
        predictions=predictions,
        references=references,
    )
    return result

def calculate_bleurt_score(predictions, references):
    """
    Evaluate predicted translations against their references with the Bleurt metric
    Bleurt could be used for evaluating translations, but we didn't find the Bleurt
    scores of WMT23, so no comparison was possible

    Parameters:
        predictions: List of predicted translations
        references: List of reference translations

    Returns:
        Result of the "bleurt" metric computed on the predictions and references
    """
    metric = evaluate.load("bleurt", cache_dir=cache_dir)
    result = metric.compute(
        predictions=predictions,
        references=references,
    )
    return result

def calculate_comet_score(sources, predictions, references):
    """
    Evaluate predicted translations with the COMET metric, which also takes the sources

    Parameters:
        sources: List of source sentences
        predictions: List of predicted translations
        references: List of reference translations

    Returns:
        Result of the "comet" metric computed on the sources, predictions and references
    """
    metric = evaluate.load("comet", cache_dir=cache_dir)
    result = metric.compute(
        sources=sources,
        predictions=predictions,
        references=references,
    )
    return result


# Inclusive code point ranges treated as Chinese/Japanese script
_CHINESE_JAPANESE_RANGES = (
    (0x3040, 0x309F),    # Hiragana
    (0x30A0, 0x30FF),    # Katakana
    (0x31F0, 0x31FF),    # Katakana Phonetic Extensions
    (0x3400, 0x4DFF),    # CJK Unified Ideographs Extension A (upper bound kept from the original check)
    (0x4E00, 0x9FFF),    # CJK Unified Ideographs
    (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
    (0xFF66, 0xFF9F),    # Halfwidth Katakana
    (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
    (0x2A700, 0x2B73F),  # CJK Unified Ideographs Extension C
    (0x2B740, 0x2B81F),  # CJK Unified Ideographs Extension D
    (0x2B820, 0x2CEAF),  # CJK Unified Ideographs Extension E
)


def contains_chinese_jap(sentence):
    """
    Determine if the sentence contains Chinese or Japanese characters

    A character counts as Chinese/Japanese when its code point lies in one of the
    Han ideograph ranges or in one of the kana ranges (hiragana, katakana and
    halfwidth katakana), so Japanese text written without any kanji is detected too.

    Parameters:
        sentence: Sentence

    Returns:
        Bool, True as soon as one such character is found, False otherwise
        (including for an empty sentence)
    """
    for char in sentence:
        code_point = ord(char)
        if any(low <= code_point <= high for low, high in _CHINESE_JAPANESE_RANGES):
            return True
    return False

### Load data

Note: The dataset was originally preprocessed for Mistral 7B fine-tuning, with a "text" column holding the prompt. That column is not needed here, so it is dropped.

In [14]:
# data = load_from_disk(dataset_dir)
# df = data.to_pandas()
# df.drop(["text"], axis=1, inplace=True)
# df.dropna(inplace=True)
# data = df.sample(150000, random_state=42)
# data.reset_index(drop=True, inplace=True)

In [15]:
# data.drop(data[data['jp'].str.contains('[a-zA-Z]')].index, inplace=True)        # Remove lines with English characters in Japanese sentences
# data = data[data.apply(lambda x: not contains_chinese_jap(x['en']), axis=1)]    # Remove lines with Chinese or Japanese characters in English sentences

In [16]:
# data.head()

Unnamed: 0,en,jp
0,The company is targeting to increase the number of female staff in managerial positions in various business areas in the medium- to long-term.,今後、あらゆる領域で更に女性が活 躍し、管理職を担う女性が中長期的に増加することをめざしています。
1,Today there are an overwhelming number of payment methods available.,銀行送金に対応している全体的なブックメーカーの数はまだまだ少ないのが現状です。
2,"There are many pieces that are simple, yet having a fairy-tale like ambience, so I think that anyone will be able to easily appreciate the works.",シンプルながらも童話のような感じを与える作品が多く、誰でも簡単に共感できると感じます。
3,Another area of private standard-setting to emerge in the late 1990’s concerned social or sustainability reporting.,1990 年代後半に民間の規格設定が出現した別の領域は、社会的又は持続的発展報告 に係わるものであった。
4,As the last bit of light fades enjoy the scenery as you sail past ancient communities of fishermen that have lived for generations in floating villages on the water.,光の最後のビットは、あなたが水上の浮遊村で世代のために住んでいる古代の漁師のコミュニティを越えて航海するとき、景色を楽しむようになります。


In [17]:
def _summarize_column(data, lang, source_column, target_column):
    """
    Summarize every text of one column with sumy's LSA summarizer

    Needs the optional sumy imports (Stemmer, Tokenizer, LsaSummarizer, get_stop_words,
    PlaintextParser) from the optional imports cell to be enabled.

    Parameters:
        data: Dataframe holding the texts, modified in place
        lang: Language name handed to the sumy stemmer, tokenizer and stop word list
        source_column: Name of the column to summarize
        target_column: Name of the column receiving the summaries

    Returns:
        The same dataframe, with the target column added (or overwritten)
    """
    stemmer = Stemmer(lang)
    sentence_tokenizer = Tokenizer(lang)
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(lang)

    summaries = []
    for text in tqdm(data[source_column], total=len(data)):
        parser = PlaintextParser.from_string(text, sentence_tokenizer)
        summary_sentences = summarizer(parser.document, 1)
        # Joined with a space in case more than 1 sentence comes back
        summaries.append(" ".join(str(sentence) for sentence in summary_sentences))

    # Assigned by position in one go, so rows sharing an index label cannot overwrite each other
    data[target_column] = summaries
    return data


def summarize_dataset_jp(data):
    """
    Summarize the Japanese sentences of the dataset

    Parameters:
        data: Dataframe with a 'jp' column, modified in place

    Returns:
        The same dataframe with a 'summarized_jp' column
    """
    return _summarize_column(data, "japanese", "jp", "summarized_jp")


def summarize_dataset_en(data):
    """
    Summarize the English sentences of the dataset

    Parameters:
        data: Dataframe with an 'en' column, modified in place

    Returns:
        The same dataframe with a 'summarized_en' column
    """
    return _summarize_column(data, "english", "en", "summarized_en")

In [11]:
# summarize = summarize_dataset_jp(data)
# summarize.head()

Unnamed: 0,en,jp,summarized_jp
0,The company is targeting to increase the number of female staff in managerial positions in various business areas in the medium- to long-term.,今後、あらゆる領域で更に女性が活 躍し、管理職を担う女性が中長期的に増加することをめざしています。,今後、あらゆる領域で更に女性が活 躍し、管理職を担う女性が中長期的に増加することをめざしています。
1,Today there are an overwhelming number of payment methods available.,銀行送金に対応している全体的なブックメーカーの数はまだまだ少ないのが現状です。,銀行送金に対応している全体的なブックメーカーの数はまだまだ少ないのが現状です。
2,"There are many pieces that are simple, yet having a fairy-tale like ambience, so I think that anyone will be able to easily appreciate the works.",シンプルながらも童話のような感じを与える作品が多く、誰でも簡単に共感できると感じます。,シンプルながらも童話のような感じを与える作品が多く、誰でも簡単に共感できると感じます。
3,Another area of private standard-setting to emerge in the late 1990’s concerned social or sustainability reporting.,1990 年代後半に民間の規格設定が出現した別の領域は、社会的又は持続的発展報告 に係わるものであった。,1990 年代後半に民間の規格設定が出現した別の領域は、社会的又は持続的発展報告 に係わるものであった。
4,As the last bit of light fades enjoy the scenery as you sail past ancient communities of fishermen that have lived for generations in floating villages on the water.,光の最後のビットは、あなたが水上の浮遊村で世代のために住んでいる古代の漁師のコミュニティを越えて航海するとき、景色を楽しむようになります。,光の最後のビットは、あなたが水上の浮遊村で世代のために住んでいる古代の漁師のコミュニティを越えて航海するとき、景色を楽しむようになります。


## Main Function

In [18]:
# Evaluation set read from CSV (first line is the header); the cells below use its
# 'jp', 'en' and 'google_translation' columns
test = pd.read_csv("/work/victota/TDT4310/Helsinki/translated_no_eng_nocnjp_google.csv", header=0)

In [None]:
def _segment_text(text, segmentation):
    """
    Pre-segment a Japanese text with the requested external tool

    The tool objects (`mecab`, `mk`, `nlp`) are the optional ones from the configuration
    cell and are only looked up when the matching segmentation is requested.

    Parameters:
        text: Text to segment
        segmentation: One of "mecab", "mykytea", "spacy"; any other value leaves the text as is

    Returns:
        Segmented text (tokens followed by a space for mykytea/spacy, raw MeCab output
        for mecab), or the unchanged text
    """
    if segmentation == "mecab":
        return mecab.parse(text)
    if segmentation == "mykytea":
        return "".join(f"{token} " for token in mk.getWS(text))
    if segmentation == "spacy":
        return "".join(f"{token.text} " for token in nlp(text))
    return text


def translation(model, tokenizer, data, segmentation= None):
    """
    Translate the data using the model and tokenizer, then print the evaluation scores

    Parameters:
        model: Model to use for translation
        tokenizer: Tokenizer to use for translation
        data: Data to translate, needs the 'jp' (source) and 'en' (reference) columns;
              modified in place, its 'prediction' column is (re)written
        segmentation: Segmentation method to use for translation, can be [None, "mecab", "mykytea", "spacy"]
                      (the three tools need the optional objects of the configuration cell)

    Returns:
        Data with translated text
    """
    # The tokenizer is only told that the input is pre-segmented when it really has been
    is_segmented = segmentation in ["mecab", "mykytea", "spacy"]

    data["prediction"] = ""
    print("Len: ", len(data))
    for row_label, row in tqdm(data.iterrows(), total=len(data)):
        input_text = _segment_text(row['jp'], segmentation)
        # Stored row by row so that already translated rows survive an interrupted run
        data.at[row_label, 'prediction'] = translate_text(model, tokenizer, input_text, is_split_into_words=is_segmented)

    if len(data) == 0:
        # Nothing to score: the metrics (and the Bleurt average below) need at least one row
        print("No rows to evaluate, skipping the metrics")
        return data

    predictions = data['prediction'].tolist()
    references = data['en'].tolist()
    sources = data['jp'].tolist()

    bleu_score = calculate_bleu_score(predictions, references)
    rouge_score = calculate_rouge_score(predictions, references)
    chrf_score = calculate_chrf_score(predictions, references)
    bleurt_score = calculate_bleurt_score(predictions, references)
    comet_score = calculate_comet_score(sources, predictions, references)
    print(f"BLEU score: {bleu_score}")
    print(f"ROUGE score: {rouge_score}")
    print(f"CHRF score: {chrf_score}")
    # Bleurt gives one score per sentence, report their mean
    print(f"Bleurt score: {sum(bleurt_score['scores']) / len(bleurt_score['scores'])}")
    print(f"COMET score: {comet_score['mean_score']}")
    return data


# Translate every row of `test` and print the metric scores.
# `translation` writes the 'prediction' column into `test` itself and returns that same
# object, so `translated` and `test` refer to the same dataframe.
translated = translation(model, tokenizer, test)

In [None]:
# translated.to_csv("XXXXXXX.csv", index=False, header=True)

In [8]:
def translation_google(data):
    """
    Translate the data using Google Translate

    Only the rows without a Google translation yet are sent to the translator, so the
    function can be called again to resume an interrupted run. Uses the optional
    `translator` object of the configuration cell.

    Parameters:
        data: Dataframe to translate, needs the 'jp' column; modified in place, the
              'google_translation' column is created when it does not exist yet

    Returns:
        Dataframe with translated text
    """
    if "google_translation" not in data.columns:
        # First run on this dataframe: every row still has to be translated
        data["google_translation"] = None

    for row_label, row in tqdm(data.iterrows(), total=len(data)):
        # The check is made on the row itself: iterrows yields index labels, which are
        # not positions as soon as the index is not 0..n-1
        if not pd.isna(row["google_translation"]):             # Skip if already translated
            continue
        result = translator.translate(row['jp'], src='ja', dest='en')

        data.at[row_label, 'google_translation'] = result.text
    return data

In [20]:
# Optional: fill in the missing Google translations (needs the optional `translator`
# of the configuration cell, which is disabled by default)
# test = translation_google(test)
# Preview of the first rows of the evaluation set
test.head()

Unnamed: 0,en,jp,google_translation,prediction
0,The company is targeting to increase the number of female staff in managerial positions in various business areas in the medium- to long-term.,今後、あらゆる領域で更に女性が活 躍し、管理職を担う女性が中長期的に増加することをめざしています。,"Going forward, we aim to encourage more women to play an active role in all fields, and to increase the number of women in managerial positions over the medium to long term.","From now on, women in all walks of life and in all walks of life will be able to increase in the middle-term women who are more active and more active and more executive."
1,Today there are an overwhelming number of payment methods available.,銀行送金に対応している全体的なブックメーカーの数はまだまだ少ないのが現状です。,The current situation is that the overall number of bookmakers that support bank transfers is still small.,The total number of bookmakers corresponding to bank transfers is still small.
2,"There are many pieces that are simple, yet having a fairy-tale like ambience, so I think that anyone will be able to easily appreciate the works.",シンプルながらも童話のような感じを与える作品が多く、誰でも簡単に共感できると感じます。,"Although many of the works are simple, they have a fairy tale-like feel, and I feel that anyone can easily relate to them.","There are many simple works that give you the feeling of being a fairy tale, and I find it easy to sympathize with anyone."
3,Another area of private standard-setting to emerge in the late 1990’s concerned social or sustainability reporting.,1990 年代後半に民間の規格設定が出現した別の領域は、社会的又は持続的発展報告 に係わるものであった。,Another area where private standard-setting emerged in the late 1990s was in relation to social or sustainable development reporting.,Another area where civilian standards began in the late 1990s was involved in social or sustainable progress reports.
4,As the last bit of light fades enjoy the scenery as you sail past ancient communities of fishermen that have lived for generations in floating villages on the water.,光の最後のビットは、あなたが水上の浮遊村で世代のために住んでいる古代の漁師のコミュニティを越えて航海するとき、景色を楽しむようになります。,The last bit of light will make you enjoy the scenery as you sail past communities of ancient fishermen who have lived for generations in floating villages on the water.,The last bit of light comes to enjoy the scenery when you sail across the ancient fishing community where you live for generations in floating villages on water.
