In [33]:
from deep_translator import DeeplTranslator, MyMemoryTranslator, GoogleTranslator, MicrosoftTranslator
import pandas as pd
import re
import time
import random

In [40]:
from huggingface_hub import login
import importlib
import token_login
importlib.reload(token_login)
login(token= token_login.token)

In [4]:
from datasets import load_dataset

ds = load_dataset("msarmi9/korean-english-multitarget-ted-talks-task")

In [5]:
ds

DatasetDict({
    train: Dataset({
        features: ['korean', 'english'],
        num_rows: 166215
    })
    validation: Dataset({
        features: ['korean', 'english'],
        num_rows: 1958
    })
    test: Dataset({
        features: ['korean', 'english'],
        num_rows: 1982
    })
})

In [6]:
# If 'dataset' is a DatasetDict, select a split first
if isinstance(ds, dict): # Check if it's a DatasetDict
    train_df = ds['train'].to_pandas() # Example: convert the 'train' split
    test_df = ds['test'].to_pandas()
    val_df = ds['validation'].to_pandas()
else: # If 'dataset' is already a Dataset object
    df = ds.to_pandas()

In [7]:
test_df.head()

Unnamed: 0,korean,english
0,아직 3분 시작된 건 아니죠? 그렇죠?,Allison Hunt: My three minutes hasn't started ...
1,"크리스 앤더슨:네, 맘대로 시작하실 수 없습니다.","Chris Anderson: No, you can't start the three ..."
2,"3분 다시 설정해주세요, 이건 반칙입니다.","Reset the three minutes, that's just not fair."
3,"앨리슨 헌트 : 어머나, 여기 참 냉정하네요","AH: Oh my God, it's harsh up here."
4,정말이지 긴장되네요,I mean I'm nervous enough as it is.


In [10]:
total_characters = sum(len(s) for s in test_df['korean'])
print(total_characters)

91236


In [11]:
langs_list = GoogleTranslator().get_supported_languages()  # output: [arabic, french, english etc...]
print(langs_list)

['afrikaans', 'albanian', 'amharic', 'arabic', 'armenian', 'assamese', 'aymara', 'azerbaijani', 'bambara', 'basque', 'belarusian', 'bengali', 'bhojpuri', 'bosnian', 'bulgarian', 'catalan', 'cebuano', 'chichewa', 'chinese (simplified)', 'chinese (traditional)', 'corsican', 'croatian', 'czech', 'danish', 'dhivehi', 'dogri', 'dutch', 'english', 'esperanto', 'estonian', 'ewe', 'filipino', 'finnish', 'french', 'frisian', 'galician', 'georgian', 'german', 'greek', 'guarani', 'gujarati', 'haitian creole', 'hausa', 'hawaiian', 'hebrew', 'hindi', 'hmong', 'hungarian', 'icelandic', 'igbo', 'ilocano', 'indonesian', 'irish', 'italian', 'japanese', 'javanese', 'kannada', 'kazakh', 'khmer', 'kinyarwanda', 'konkani', 'korean', 'krio', 'kurdish (kurmanji)', 'kurdish (sorani)', 'kyrgyz', 'lao', 'latin', 'latvian', 'lingala', 'lithuanian', 'luganda', 'luxembourgish', 'macedonian', 'maithili', 'malagasy', 'malay', 'malayalam', 'maltese', 'maori', 'marathi', 'meiteilon (manipuri)', 'mizo', 'mongolian', 'm

In [12]:
# remove speaker identification (e.g. "Bill Lange: ...")
# remove aural indicators
#r'[\uAC00-\uD7A3]+\s?[\uAC00-\uD7A3]+:\s', r'\([\uAC00-\uD7A3]+\)\s?'
pattern = [r'\w+(\s\w+)?\:\s', r'\(\w+\)\s?', r'[\\p{L}]+(\s[\\p{L}]+)?:\s', r'\([\\p{L}]+\)\s?']
remove_str = "|".join(pattern)

train_df['clean english'] = train_df['english'].str.replace(remove_str, "",regex=True)
train_df['clean korean'] = train_df['korean'].str.replace(remove_str, "", regex=True)

test_df['clean english'] = test_df['english'].str.replace(remove_str, "",regex=True)
test_df['clean korean'] = test_df['korean'].str.replace(remove_str, "", regex=True)

val_df['clean english'] = val_df['english'].str.replace(remove_str, "",regex=True)
val_df['clean korean'] = val_df['korean'].str.replace(remove_str, "", regex=True)

In [13]:
test_df.tail()

Unnamed: 0,korean,english,clean english,clean korean
1977,W: 예. 에너지와 관련한 일할 생각을 갖고 있습니다.,WK: Yeah. I'm still thinking to work on energy.,Yeah. I'm still thinking to work on energy.,예. 에너지와 관련한 일할 생각을 갖고 있습니다.
1978,"C: 윌리엄, TED에 당신을 초대하게 되어 영광입니다.","CA: Wow. William, it's a real honor to have yo...","Wow. William, it's a real honor to have you at...","윌리엄, TED에 당신을 초대하게 되어 영광입니다."
1979,와줘서 정말 고마워요.,Thank you so much for coming.,Thank you so much for coming.,와줘서 정말 고마워요.
1980,W: 감사합니다.,WK: Thank you.,Thank you.,감사합니다.
1981,(박수),(Applause),,


In [14]:
# remove empty rows
clean_train_df = train_df[train_df['clean english']!= '']
clean_test_df = test_df[test_df['clean english']!= '']
clean_val_df = val_df[val_df['clean english']!= '']
print(len(val_df))
print(len(clean_val_df))
clean_val_df.tail()

1958
1957


Unnamed: 0,korean,english,clean english,clean korean
1952,"저희 회사는 사진 7억장 보유하고 있지만,",My company has 70 million images.,My company has 70 million images.,"저희 회사는 사진 7억장 보유하고 있지만,"
1953,제 사무실에는 한 장의 사진이 있습니다.,I have one image in my office.,I have one image in my office.,제 사무실에는 한 장의 사진이 있습니다.
1954,이 사진이죠.,Here it is.,Here it is.,이 사진이죠.
1955,전 다음번에는 여러분이 여러분을 행동하게 할 사진을 만나길 바랍니다. 사진으로 이유...,I hope that the next time you see an image tha...,I hope that the next time you see an image tha...,전 다음번에는 여러분이 여러분을 행동하게 할 사진을 만나길 바랍니다. 사진으로 이유...
1956,모든 사진가들에게 감사를 드립니다.,And thank you to all the photographers.,And thank you to all the photographers.,모든 사진가들에게 감사를 드립니다.


In [15]:
# Convert the Korean column to a list
korean_texts = clean_test_df['clean korean'].tolist()

In [None]:
result_df = pd.DataFrame(columns=['korean', 'actual translation', 'Google translation'])

# Batch size for GoogleTranslator (max = 40)
BATCH_SIZE = 30

translator = GoogleTranslator(source='ko', target='en')

google_translations = []

# Process in batches
for i in range(0, len(korean_texts), BATCH_SIZE):
    batch = korean_texts[i:i + BATCH_SIZE]
    print("batch: ", i)

    for attempt in range(3):
        try:
            translated_batch = translator.translate_batch(batch)
            google_translations.extend(translated_batch)
            break
        except Exception as e:
            print(f"Batch failed (attempt {attempt+1}): {e}")
            sleep_time = 2 ** attempt + random.random()  # exponential backoff
            print(f"Sleeping {sleep_time:.2f} sec...")
            time.sleep(sleep_time)

    time.sleep(0.3) 

# DeepL translation will stay empty unless you use DeepL Pro
# result_df['DeepL translation'] = ""

# for txt in range(0, len(clean_train_df['clean korean'])):
#     translated = GoogleTranslator(source='auto', target='en').translate(text=clean_train_df['clean korean'][txt]) 
    
#     new_row_df = pd.DataFrame({'korean': [clean_train_df['clean korean'][txt]], 'actual translation':[clean_train_df['clean english'][txt]], 'Google translation':[translated]})
#     result_df = pd.concat([result_df, new_row_df], ignore_index=True)

batch:  0
Batch failed (attempt 1): ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Sleeping 1.11 sec...
batch:  30
batch:  60
batch:  90
batch:  120
batch:  150
batch:  180
batch:  210
batch:  240
batch:  270
batch:  300
batch:  330
batch:  360
batch:  390
batch:  420
batch:  450
batch:  480
batch:  510
batch:  540
batch:  570
batch:  600
batch:  630
batch:  660
batch:  690
batch:  720
batch:  750
batch:  780
batch:  810
batch:  840
batch:  870
batch:  900
batch:  930
batch:  960
batch:  990
batch:  1020
batch:  1050
batch:  1080
Batch failed (attempt 1): ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Sleeping 1.98 sec...
batch:  1110
batch:  1140
batch:  1170
batch:  1200
batch:  1230
batch:  1260
batch:  1290
batch:  1320
batch:  1350
batch:  1380
batch:  1410
batch:  1440
batch:  1470
batch:  1500
batch:  

In [174]:
# Assemble the final DataFrame
result_df['korean'] = clean_test_df['clean korean']
result_df['actual translation'] = clean_test_df['clean english']
result_df['Google translation'] = google_translations

In [None]:
# result_df.to_csv('data/korean_translation.csv', index=False)


In [176]:
pd.set_option('display.max_colwidth', None)

result_df.head()

Unnamed: 0,korean,actual translation,Google translation,DeepL translation
0,아직 3분 시작된 건 아니죠? 그렇죠?,"My three minutes hasn't started yet, has it?","The 3 minutes haven't started yet, right? Right?",
1,"크리스 앤더슨:네, 맘대로 시작하실 수 없습니다.","No, you can't start the three minutes.","Chris Anderson: Yes, you can't just start.",
2,"3분 다시 설정해주세요, 이건 반칙입니다.","Reset the three minutes, that's just not fair.","Please reset the time to 3 minutes, this is a foul.",
3,"앨리슨 헌트 : 어머나, 여기 참 냉정하네요","Oh my God, it's harsh up here.","Allison Hunt: Oh my, it's so cold in here.",
4,정말이지 긴장되네요,I mean I'm nervous enough as it is.,I'm really nervous.,


In [37]:
langs_list = MicrosoftTranslator(api_key= token_login.microsoft_key, source='ko', target='en').get_supported_languages()
print("Supported languages (list):", langs_list)

Supported languages (list): ['afrikaans', 'amharic', 'arabic', 'assamese', 'azerbaijani', 'bashkir', 'belarusian', 'bulgarian', 'bhojpuri', 'bangla', 'tibetan', 'bodo', 'bosnian', 'catalan', 'czech', 'welsh', 'danish', 'german', 'dogri', 'lower sorbian', 'divehi', 'greek', 'english', 'spanish', 'estonian', 'basque', 'persian', 'finnish', 'filipino', 'fijian', 'faroese', 'french', 'french (canada)', 'irish', 'galician', 'konkani', 'gujarati', 'hausa', 'hebrew', 'hindi', 'chhattisgarhi', 'croatian', 'upper sorbian', 'haitian creole', 'hungarian', 'armenian', 'indonesian', 'igbo', 'inuinnaqtun', 'icelandic', 'italian', 'inuktitut', 'inuktitut (latin)', 'japanese', 'georgian', 'kazakh', 'khmer', 'kurdish (northern)', 'kannada', 'korean', 'kashmiri', 'kurdish (central)', 'kyrgyz', 'luxembourgish', 'lingala', 'lao', 'lithuanian', 'ganda', 'latvian', 'chinese (literary)', 'maithili', 'malagasy', 'māori', 'macedonian', 'malayalam', 'mongolian (cyrillic)', 'mongolian (traditional)', 'manipuri',

In [41]:
# Microsoft Translator
micro_translator = MicrosoftTranslator(api_key=token_login.microsoft_key, 
                                       endpoint = token_login.microsoft_endpoint,
                                       region= token_login.region, 
                                       source= 'ko', target='en')

In [42]:
micro_translator.translate(clean_test_df['clean korean'][0])


"It's not three minutes yet, right? Right?"

In [None]:
# Batch size for Microsoft (max = 50,000 character per request; 2mill per month)
BATCH_SIZE = 30000

microsoft_translations = []

# Process in batches
for i in range(0, len(korean_texts), BATCH_SIZE):
    batch = korean_texts[i:i + BATCH_SIZE]
    print("batch: ", i)

    for attempt in range(3):
        try:
            microsoft_translated_batch = micro_translator.translate_batch(batch)
            microsoft_translations.extend(microsoft_translated_batch)
            break
        except Exception as e:
            print(f"Batch failed (attempt {attempt+1}): {e}")
            sleep_time = 2 ** attempt + random.random()  # exponential backoff
            print(f"Sleeping {sleep_time:.2f} sec...")
            time.sleep(sleep_time)

    time.sleep(0.3) 

batch:  0
Batch failed (attempt 1): Expecting value: line 1 column 1 (char 0)
Sleeping 1.79 sec...


In [49]:
# Assemble the final DataFrame
microsoft_result_df = pd.DataFrame(columns=['korean', 'actual translation', 'Microsoft translation'])
microsoft_result_df['korean'] = clean_test_df['clean korean']
microsoft_result_df['actual translation'] = clean_test_df['clean english']
microsoft_result_df['Microsoft translation'] = microsoft_translations

In [52]:
pd.set_option('display.max_colwidth', None)

microsoft_result_df.tail()

Unnamed: 0,korean,actual translation,Microsoft translation
1976,"아직 19살에 불과하지만, 당신이 생각하는 좀 더 미래의 당신은… 당신은 계속 에너지와 관련된 일을 하고 있을까요?","And as you think of your life going forward, you're 19 now, do you picture continuing with this dream of working in energy?","You're only 19 years old, but what you think of as a more future you... Will you continue to work with energy?"
1977,예. 에너지와 관련한 일할 생각을 갖고 있습니다.,Yeah. I'm still thinking to work on energy.,"Yes, I have an idea of working in energy."
1978,"윌리엄, TED에 당신을 초대하게 되어 영광입니다.","Wow. William, it's a real honor to have you at the TED conference.","William, it's an honor to invite you to TED."
1979,와줘서 정말 고마워요.,Thank you so much for coming.,Thank you so much for coming.
1980,감사합니다.,Thank you.,I appreciate it.


In [57]:
# Add Microsoft results to data
df = pd.read_csv('data/korean_translation.csv')


In [58]:
df = pd.merge(df, microsoft_result_df, on=['korean', 'actual translation'], how='left')
df = df.drop('DeepL translation', axis=1)
df.head()

Unnamed: 0,korean,actual translation,Google translation,Microsoft translation
0,아직 3분 시작된 건 아니죠? 그렇죠?,"My three minutes hasn't started yet, has it?","The 3 minutes haven't started yet, right? Right?","It's not three minutes yet, right? Right?"
1,"크리스 앤더슨:네, 맘대로 시작하실 수 없습니다.","No, you can't start the three minutes.","Chris Anderson: Yes, you can't just start.","Chris Anderson: Yes, you can't start as you want."
2,"3분 다시 설정해주세요, 이건 반칙입니다.","Reset the three minutes, that's just not fair.","Please reset the time to 3 minutes, this is a foul.","Please set it again for 3 minutes, this is a foul."
3,"앨리슨 헌트 : 어머나, 여기 참 냉정하네요","Oh my God, it's harsh up here.","Allison Hunt: Oh my, it's so cold in here.","Allison Hunt: Oh my God, it's so cold here"
4,정말이지 긴장되네요,I mean I'm nervous enough as it is.,I'm really nervous.,I'm really nervous


In [59]:
df.to_csv('data/korean_translation.csv', index=False)