In [1]:
import torch
# Prefer the GPU when PyTorch reports one; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using", device, "device")


Using cuda device


## Load Dataset 

In [13]:
from datasets import load_dataset
import pandas as pd

# Load the 'iwslt2017-zh-en' configuration of the "iwslt2017" dataset.
dataset = load_dataset("iwslt2017", name='iwslt2017-zh-en')

In [14]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, pipeline

# Collect the parallel sentences of the test split, one list per language,
# keeping both lists in the same row order.
en_texts, zh_texts = [], []
for entry in dataset['test']:
    pair = entry['translation']
    en_texts.append(pair['en'])
    zh_texts.append(pair['zh'])

## Model for Translation

In [16]:
# Load the checkpoint once and hand the loaded objects to the pipeline, so the
# weights are not loaded a second time from the model name. The pipeline runs
# on `device` (selected in the first cell) instead of a hard-coded GPU index,
# which keeps the CPU fallback working.
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
m2m100_en_zh = pipeline(
    'translation',
    model=model,
    tokenizer=tokenizer,
    device=device,
    src_lang='en',
    tgt_lang="zh",
)

## Translate to m2m_translated.txt (for reference only)

In [19]:
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor, as_completed

# Initialize the translation pipeline on the device selected in the first cell.
# A hard-coded device=0 requires a GPU and ignores the CPU fallback.
m2m100_en_zh = pipeline('translation', model='facebook/m2m100_418M', device=device, src_lang='en', tgt_lang='zh')

def process_batches(input_list, batch_size):
    """Lazily yield consecutive slices of ``input_list``.

    Each slice holds up to ``batch_size`` items; the last one may be shorter.
    """
    yield from (
        input_list[start:start + batch_size]
        for start in range(0, len(input_list), batch_size)
    )

import threading

# translate_and_write is submitted to a thread pool, so several threads append
# to the same file; the lock keeps each batch's lines contiguous and intact.
_write_lock = threading.Lock()

def translate_and_write(batch):
    """Translate ``batch`` with ``m2m100_en_zh`` and append the pairs to disk.

    Each input sentence is written to ``m2m_translated.txt`` as one line of
    the form ``<original> , <TAB><translation>``.

    Args:
        batch: list of source sentences passed to the translation pipeline.
    """
    batch_translations = m2m100_en_zh(batch)

    # Build the lines outside the lock so only the file append is serialized.
    lines = [
        f"{original} , \t{translated['translation_text']}\n"
        for original, translated in zip(batch, batch_translations)
    ]

    # Write both original and translated sentences to the file
    with _write_lock:
        with open("m2m_translated.txt", "a", encoding='utf-8') as fp:
            fp.writelines(lines)

# Number of worker threads translating batches concurrently
num_workers = 4

# Submit one task per batch of 200 sentences, then wait for all of them.
with ThreadPoolExecutor(max_workers=num_workers) as pool:
    futures = []
    for batch in process_batches(en_texts, 200):
        futures.append(pool.submit(translate_and_write, batch))
    # .result() re-raises any exception that occurred inside a worker.
    for finished in as_completed(futures):
        finished.result()

print("All translations are completed and written to the file.")




All translations are completed and written to the file.


## Process the txt file (for reference only)

In [35]:
# Accumulators filled while parsing m2m_translated.txt further down this cell:
# `en` collects the source sentences and `hash_map` maps each source sentence
# to its translation. `m2m_zh` is initialised here but not filled in this cell.
en, m2m_zh, hash_map = [], [], {}
import re

def split_eng_chinese(text):
    """Split ``text`` into its English and its Chinese segments.

    Args:
        text: string that may mix English and Chinese.

    Returns:
        Tuple ``(eng_segments, chi_segments)``: two lists of stripped,
        non-empty substrings in order of appearance.
    """
    # English letters, digits, whitespace and common punctuation. re.ASCII is
    # required: for str patterns \w and \s are Unicode-aware by default, so
    # without it \w would also match Chinese characters.
    eng_pattern = re.compile(r'[\w\s,.\'?!;:"]+', re.ASCII)  # Adjust the pattern as needed
    # Chinese characters in U+4E00-U+9FFF plus common full-width punctuation.
    chi_pattern = re.compile(r'[\u4e00-\u9fff，。？！；：“”（）]+')  # Add more as needed

    # Find all English and Chinese segments
    eng_found = eng_pattern.findall(text)
    chi_found = chi_pattern.findall(text)

    # Strip surrounding whitespace and drop segments that end up empty.
    eng_segments = [segment.strip() for segment in eng_found if segment.strip() != '']
    chi_segments = [segment.strip() for segment in chi_found if segment.strip() != '']

    return eng_segments, chi_segments

# Parse m2m_translated.txt back into `en` and `hash_map`.
# The file is written as UTF-8 by translate_and_write, so it must be read as
# UTF-8 too instead of with the platform default encoding.
FIELD_SEPARATOR = " , \t"  # text translate_and_write puts between source and translation

with open('m2m_translated.txt', 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        # Remove only the line terminator: stripping the whole line would drop
        # leading whitespace from the source sentence, and the lookup in
        # find_translated needs the exact original text as key.
        line = line.rstrip('\n')
        if not line.strip():
            continue  # tolerate blank lines
        source, separator, translation = line.partition(FIELD_SEPARATOR)
        if not separator:
            raise ValueError(
                f"m2m_translated.txt line {line_number}: missing source/translation separator"
            )
        en.append(source)
        hash_map[source] = translation.rstrip()

hash_map

{'And we know from that study  that when you want reality, you go to the needle.': '我們從這項研究中知道,當你想要現實時,你會去針子。',
 "This is a seven-inch hatpin. It's very, very sharp,  and I'm going to just sterilize it a tiny bit.": '这是一个七英寸的帽子,它非常,非常尖锐,我只会精化它一点。',
 "This is really my flesh. This is not  Damian's special-grown flesh.": '這實際上是我的肉,這不是Damian的特別生肉。',
 "That's my skin right there. This is not a Hollywood special effect.": '这就是我的皮肤,这不是好莱坞的特殊效果。',
 "I'm going to pierce my skin  and run this needle through to the other side.": '我要把我的皮肤擦去,把这条针推到另一边。',
 "If you're queasy --   if you faint easily -- I was doing this for some friends  in the hotel room last night, and some people that I didn't know,  and one woman almost passed out.": '如果你是可爱的 - 如果你轻易失败 - 我昨晚在酒店房间里做了一些朋友,有些人我不知道,一个女人几乎离开了。',
 "So, I suggest if you get queasy easy  that you look away for about the next 30 --  in fact, you know what, I'll do the first bad part behind it.": '因此,我建议,如果你得到一个轻松的,你会远离下一个30 - 事实上,你知道什么,我会做的第一个坏部分背后。',
 "Y

## Combine (for reference only)

In [36]:
# Put the English test sentences and their Chinese references side by side.
benchmark_columns = {'en_test': en_texts, 'zh_test': zh_texts}
df = pd.DataFrame(benchmark_columns)

df

Unnamed: 0,en_test,zh_test
0,"Several years ago here at TED, Peter Skillman ...",几年前，在TED大会上， Peter Skillman 介绍了一个设计挑战 叫做“棉花糖挑战”
1,And the idea's pretty simple: Teams of four h...,是个非常简单的主意 要求一组四人的团队搭建一个独立的最高建筑 材料是20根意大利面条 一段胶...
2,The marshmallow has to be on top.,棉花糖必须放在最上面
3,"And, though it seems really simple, it's actua...",这虽然看似简单，其实并不容易 因为它要求人们 迅速地合作
4,"And so, I thought this was an interesting idea...",我觉得这是个有趣的主意 我把它放到了设计专题讨论会上
...,...,...
8544,"And the answer is, the ability to seek goals ...",答案是，寻找目标的能力 将直接遵循它， 道理是这样的： 就像你要穿过一条隧道， 你未来道路空...
8545,from a long-term drive to increase future fre...,长期的驱动， 为了增加未来的行动自由。
8546,"Finally, Richard Feynman, famous physicist, o...",最后，理查德 · 费曼， 这位著名的物理学家， 曾经写道， 如果人类文明被摧毁 并且你只能将...
8547,My equivalent of that statement to pass on to...,我与这句话等同的声明， 来传递给后代， 帮助他们建立人工智能 或是帮助他们理解 人类的智慧，...


## Match correct translation 

In [37]:
def find_translated(text):
    """Return the translation stored in ``hash_map`` under the key ``text``.

    Raises:
        KeyError: if ``text`` is not a key of ``hash_map``.
    """
    translation = hash_map[text]
    return translation
# Attach each translation by looking up its source sentence rather than by
# position, since conversion was in batches and not in order.
m2m_column = df['en_test'].apply(find_translated)
df['M2M100'] = m2m_column
df.to_csv('translated_benchmarks.csv', index=False)

## Run from here if translated_benchmarks.csv is already present

In [4]:
import pandas as pd
from sacrebleu.metrics import BLEU, CHRF, TER
from rouge_chinese import Rouge
import jieba # you can use any other word cutting library
import torch
from bert_score import score
# Reload the saved benchmark table (source, reference and M2M100 columns).
current = pd.read_csv(filepath_or_buffer='translated_benchmarks.csv')
current

Unnamed: 0,en_test,zh_test,M2M100
0,"Several years ago here at TED, Peter Skillman ...",几年前，在TED大会上， Peter Skillman 介绍了一个设计挑战 叫做“棉花糖挑战”,"几年前,在这里在TED,彼得·斯基尔曼介绍了一个设计挑战称为马斯马洛挑战。"
1,And the idea's pretty simple: Teams of four h...,是个非常简单的主意 要求一组四人的团队搭建一个独立的最高建筑 材料是20根意大利面条 一段胶...,这个想法很简单:四个团队必须从20块蜘蛛、一块绳子、一块绳子和一块火柴中建造最高的自由站式结构。
2,The marshmallow has to be on top.,棉花糖必须放在最上面,马歇尔必须在顶部。
3,"And, though it seems really simple, it's actua...",这虽然看似简单，其实并不容易 因为它要求人们 迅速地合作,"虽然它看起来很简单,但它实际上是相当困难的,因为它迫使人们非常快地合作。"
4,"And so, I thought this was an interesting idea...",我觉得这是个有趣的主意 我把它放到了设计专题讨论会上,"因此,我认为这是一个有趣的想法,我把它纳入了一个设计研讨会。"
...,...,...,...
8544,"And the answer is, the ability to seek goals ...",答案是，寻找目标的能力 将直接遵循它， 道理是这样的： 就像你要穿过一条隧道， 你未来道路空...,"答案是,追求目标的能力将直接从这个意义上跟随:就像你在未来的路线空间中穿过隧道,一个瓶子,以..."
8545,from a long-term drive to increase future fre...,长期的驱动， 为了增加未来的行动自由。,"从长期的努力,以提高未来的行动自由。"
8546,"Finally, Richard Feynman, famous physicist, o...",最后，理查德 · 费曼， 这位著名的物理学家， 曾经写道， 如果人类文明被摧毁 并且你只能将...,"最后,著名的物理学家理查德·费恩曼(Richard Feynman)曾经写过,如果人类文明被..."
8547,My equivalent of that statement to pass on to...,我与这句话等同的声明， 来传递给后代， 帮助他们建立人工智能 或是帮助他们理解 人类的智慧，...,"我的同等于把这个声明传递给后代,帮助他们建立人工智能或帮助他们理解人类智能,是如下:智能应该..."


## Calculate Scores

In [81]:

_BERT_SCORER = None  # built on first use, then shared by every row


def _get_bert_scorer():
    """Return the shared Chinese BERTScorer, creating it on the first call.

    ``bert_score.score`` loads its model on every call; reusing one scorer
    object avoids reloading the model for each row.
    """
    global _BERT_SCORER
    if _BERT_SCORER is None:
        from bert_score import BERTScorer
        _BERT_SCORER = BERTScorer(lang='zh')
    return _BERT_SCORER


def _fmt(value):
    """Format a metric value as a string with three decimals."""
    return "{:.3f}".format(value)


def calc_bleu_scores(row):
    """Score one M2M100 translation against its reference.

    Args:
        row: mapping with string fields ``'zh_test'`` (reference) and
            ``'M2M100'`` (candidate translation).

    Returns:
        pd.Series of three-decimal strings, in insertion order: ``BLEU-1`` to
        ``BLEU-4``, ``CHRF``, ``CHRF++``, ``TER``, one ``<metric>-<stat>``
        entry for every ROUGE metric/stat returned by ``Rouge.get_scores``,
        then ``BERTSCORE-R``, ``BERTSCORE-P`` and ``BERTSCORE-F``.
    """
    r, c = row['zh_test'].rstrip(), row['M2M100'].rstrip()
    sacre_ref = [[r]]  # list of list of text
    sacre_c = [c]  # list of text
    scores = {}

    # BLEU with maximum n-gram order 1..4
    for i in range(1, 5):
        bleu = BLEU(smooth_method='exp', tokenize='zh', max_ngram_order=i)
        scores[f'BLEU-{i}'] = _fmt(bleu.corpus_score(sacre_c, sacre_ref).score)

    # chrf
    # NOTE(review): with beta=0 the F-beta formula reduces to precision only;
    # sacreBLEU's default is beta=2 — confirm this setting is intended.
    chrf = CHRF(word_order=0, beta=0, eps_smoothing=False)
    scores['CHRF'] = _fmt(chrf.corpus_score(sacre_c, sacre_ref).score)

    # chrf++ (same beta=0 caveat as above)
    chrf_pp = CHRF(word_order=2, beta=0, eps_smoothing=False)
    scores['CHRF++'] = _fmt(chrf_pp.corpus_score(sacre_c, sacre_ref).score)

    # ter
    ter = TER(asian_support=True, normalized=True)
    scores['TER'] = _fmt(ter.corpus_score(sacre_c, sacre_ref).score)

    # rouge w jieba: segment both texts into space-separated words first
    rouge_ref = ' '.join(jieba.cut(r))
    rouge_c = ' '.join(jieba.cut(c))
    rouge_result = Rouge().get_scores(rouge_c, rouge_ref)[0]
    for metric, stats in rouge_result.items():
        for stat, value in stats.items():
            scores[f'{metric}-{stat}'] = _fmt(value)

    # bert score, computed with the shared scorer instead of bert_score.score
    P, R, F = _get_bert_scorer().score(sacre_c, sacre_ref)
    scores['BERTSCORE-R'] = _fmt(R.item())
    scores['BERTSCORE-P'] = _fmt(P.item())
    scores['BERTSCORE-F'] = _fmt(F.item())

    return pd.Series(scores)


# Compute all metrics row by row, then store them under the benchmark's column
# names. The frame returned by apply is assigned to this list of columns in one
# step, so the order here has to follow the order in which calc_bleu_scores
# builds its Series.
score_columns = [
    'BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4',
    'CHRF', 'CHRF++', 'TER',
    'ROUGE1-R', 'ROUGE1-P', 'ROUGE1-F',
    'ROUGE2-R', 'ROUGE2-P', 'ROUGE2-F',
    'ROUGEL-R', 'ROUGEL-P', 'ROUGEL-F',
    'BERTSCORE-R', 'BERTSCORE-P', 'BERTSCORE-F',
]
row_scores = current.apply(calc_bleu_scores, axis=1)
current[score_columns] = row_scores
current.to_csv('Benchmark_M2M100_Evaluations.csv', index=False)

In [82]:
# Display the benchmark table, now including the per-row metric columns.
current

Unnamed: 0,en_test,zh_test,M2M100,BLEU-1,BLEU-2,BLEU-3,BLEU-4,CHRF,CHRF++,TER,...,ROUGE1-F,ROUGE2-R,ROUGE2-P,ROUGE2-F,ROUGEL-R,ROUGEL-P,ROUGEL-F,BERTSCORE-R,BERTSCORE-P,BERTSCORE-F
0,"Several years ago here at TED, Peter Skillman ...",几年前，在TED大会上， Peter Skillman 介绍了一个设计挑战 叫做“棉花糖挑战”,"几年前,在这里在TED,彼得·斯基尔曼介绍了一个设计挑战称为马斯马洛挑战。",45.714,40.168,33.945,29.264,27.391,23.478,63.333,...,0.462,0.300,0.273,0.286,0.476,0.435,0.455,0.831,0.770,0.800
1,And the idea's pretty simple: Teams of four h...,是个非常简单的主意 要求一组四人的团队搭建一个独立的最高建筑 材料是20根意大利面条 一段胶...,这个想法很简单:四个团队必须从20块蜘蛛、一块绳子、一块绳子和一块火柴中建造最高的自由站式结构。,33.708,17.958,6.807,3.544,8.725,7.478,84.906,...,0.235,0.000,0.000,0.000,0.133,0.138,0.136,0.700,0.718,0.709
2,The marshmallow has to be on top.,棉花糖必须放在最上面,马歇尔必须在顶部。,29.828,18.266,12.872,9.443,7.639,6.548,70.000,...,0.182,0.000,0.000,0.000,0.167,0.200,0.182,0.718,0.721,0.719
3,"And, though it seems really simple, it's actua...",这虽然看似简单，其实并不容易 因为它要求人们 迅速地合作,"虽然它看起来很简单,但它实际上是相当困难的,因为它迫使人们非常快地合作。",38.889,27.889,16.601,9.125,10.795,9.253,88.462,...,0.343,0.067,0.048,0.056,0.375,0.273,0.316,0.810,0.768,0.788
4,"And so, I thought this was an interesting idea...",我觉得这是个有趣的主意 我把它放到了设计专题讨论会上,"因此,我认为这是一个有趣的想法,我把它纳入了一个设计研讨会。",50.000,34.740,23.472,14.793,14.759,12.651,68.000,...,0.452,0.200,0.176,0.187,0.500,0.421,0.457,0.863,0.822,0.842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8544,"And the answer is, the ability to seek goals ...",答案是，寻找目标的能力 将直接遵循它， 道理是这样的： 就像你要穿过一条隧道， 你未来道路空...,"答案是,追求目标的能力将直接从这个意义上跟随:就像你在未来的路线空间中穿过隧道,一个瓶子,以...",64.105,51.488,40.693,32.915,30.541,26.178,47.321,...,0.596,0.254,0.270,0.262,0.486,0.507,0.496,0.833,0.844,0.838
8545,from a long-term drive to increase future fre...,长期的驱动， 为了增加未来的行动自由。,"从长期的努力,以提高未来的行动自由。",61.111,56.880,52.115,46.606,40.464,34.683,44.444,...,0.571,0.500,0.455,0.476,0.636,0.583,0.609,0.863,0.855,0.859
8546,"Finally, Richard Feynman, famous physicist, o...",最后，理查德 · 费曼， 这位著名的物理学家， 曾经写道， 如果人类文明被摧毁 并且你只能将...,"最后,著名的物理学家理查德·费恩曼(Richard Feynman)曾经写过,如果人类文明被...",67.126,55.713,46.144,38.640,30.832,23.124,36.885,...,0.606,0.274,0.294,0.284,0.514,0.535,0.524,0.882,0.892,0.887
8547,My equivalent of that statement to pass on to...,我与这句话等同的声明， 来传递给后代， 帮助他们建立人工智能 或是帮助他们理解 人类的智慧，...,"我的同等于把这个声明传递给后代,帮助他们建立人工智能或帮助他们理解人类智能,是如下:智能应该...",72.592,61.161,52.568,45.813,41.802,35.830,38.043,...,0.644,0.333,0.367,0.350,0.571,0.627,0.598,0.878,0.897,0.887
