In [13]:
import requests
import re
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

In [14]:
# Define the DeepL API Key (you should replace it with your actual API key)
DEEP_L_API_KEY = '012cd87c-e430-40e9-a3cd-0db49dc41a50:fx'

# DeepL Translation API endpoint
DEEPL_URL = 'https://api-free.deepl.com/v2/translate'

In [15]:
# Define the backtranslation function
def backtranslate_single(dia, src_lang='KO', target_lang='EN'):
    try:
        # Step 1: Translate from the source language to the target language
        params = {
            'auth_key': DEEP_L_API_KEY,
            'text': dia,
            'source_lang': src_lang,
            'target_lang': target_lang
        }
        response = requests.post(DEEPL_URL, data=params)
        translate_text = response.json()['translations'][0]['text']

        # Step 2: Back-translate from target language to source language
        params['text'] = translate_text
        params['source_lang'] = target_lang
        params['target_lang'] = src_lang
        response = requests.post(DEEPL_URL, data=params)
        back_translate_text = response.json()['translations'][0]['text']

        return back_translate_text
    except Exception as e:
        print(f"An error occurred during translation: {e}")
        return None

In [16]:
def process_batch(batch, src_lang='KO', target_lang='EN'):
    translated_batch = []
    for text in batch:
        backtranslated_text = backtranslate_single(text, src_lang, target_lang)
        if backtranslated_text is None:  # 실패한 경우 기본값 추가
            backtranslated_text = "[Translation Failed]"
        translated_batch.append(backtranslated_text)
    return translated_batch

In [17]:
def split_speaker_and_text(dialogue):
    # Assume dialogue format is like 'Speaker: Text'
    pairs = []
    for line in dialogue.split('\n'):
        if ': ' in line:
            speaker, text = line.split(': ', 1)
            pairs.append((speaker, text))
    return pairs

def join_speaker_and_text(pairs):
    # Join speaker-text pairs into the original format
    return '\n'.join([f"{speaker}: {text}" for speaker, text in pairs])

In [18]:
def backtranslate_batch(df, batch_size=100, num_threads=10, src_lang='KO', target_lang='EN'):
    augmented_rows = []  # List to store the augmented rows
    except_list = []  # Store failed translations

    # Extract dialogues from the dataframe
    dialogues = df['dialogue'].tolist()

    # Create a ThreadPoolExecutor to process dialogues concurrently
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures_dialogue = []

        # Split the dialogues into batches for parallel processing
        for i in range(0, len(dialogues), batch_size):
            dialogue_batch = dialogues[i:i+batch_size]
            futures_dialogue.append(executor.submit(process_batch, dialogue_batch, src_lang, target_lang))

        # Collect results from all batches
        all_results_dialogue = []
        for future in tqdm(futures_dialogue):
            all_results_dialogue.extend(future.result())

    # Add back-translated dialogues to the dataframe
    for i in range(len(df)):
        backtranslated_dialogue = all_results_dialogue[i]

        if backtranslated_dialogue:  # If translation is successful
            # Split the original dialogue into speaker-text pairs
            speaker_text_pairs = split_speaker_and_text(df.loc[i, 'dialogue'])

            # Only back-translate the text portion, leaving the speaker intact
            backtranslated_pairs = []
            for speaker, text in speaker_text_pairs:
                backtranslated_text = backtranslate_single(text, src_lang, target_lang)
                if backtranslated_text:
                    backtranslated_pairs.append((speaker, backtranslated_text))
                else:
                    backtranslated_pairs.append((speaker, "[Translation Failed]"))

            # Join back the translated speaker-text pairs
            backtranslated_dialogue = join_speaker_and_text(backtranslated_pairs)

            # Create a new row with back-translated dialogue
            new_row = df.loc[i].copy()
            new_row['dialogue'] = backtranslated_dialogue
            # Summary remains unchanged
            new_row['summary'] = df.loc[i, 'summary']
            augmented_rows.append(new_row)
        else:
            except_list.append(i)  # Track failures

    # Create a new DataFrame from the augmented rows
    augmented_df = pd.DataFrame(augmented_rows)

    # Show which translations failed
    if except_list:
        print(f'Failed to translate at indices: {except_list}')

    # Return the augmented DataFrame
    return augmented_df

In [19]:
# Read the original CSV file
df = pd.read_csv("/root/NLP/data/final/all_train.csv")

In [21]:
df.shape

(14956, 4)

In [9]:
# Calculate the midpoint
midpoint = len(df) // 2

# Split the data into two halves
first_half = df.iloc[:midpoint]
second_half = df.iloc[midpoint:]

In [10]:
first_half.head()

Unnamed: 0,fname,dialogue,summary,topic
0,train_0,"#Person1#: 안녕하세요, 스미스씨. 저는 호킨스 의사입니다. 오늘 왜 오셨나...","스미스씨가 건강검진을 받고 있고, 호킨스 의사는 매년 건강검진을 받는 것을 권장합니...",건강검진 받기
1,train_1,"#Person1#: 안녕하세요, 파커 부인, 어떻게 지내셨나요?\n#Person2#...",파커 부인이 리키를 데리고 백신 접종을 하러 갔다. 피터스 박사는 기록을 확인한 후...,백신
2,train_2,"#Person1#: 실례합니다, 열쇠 한 묶음 보셨나요?\n#Person2#: 어떤...","#Person1#은 열쇠 한 묶음을 찾고 있고, 그것을 찾기 위해 #Person2#...",열쇠 찾기
3,train_3,#Person1#: 왜 너는 여자친구가 있다는 걸 말해주지 않았어?\n#Person...,#Person1#은 #Person2#가 여자친구가 있고 그녀와 결혼할 것이라는 사실...,여자친구가 있다
4,train_4,"#Person1#: 안녕, 숙녀분들! 오늘 밤 당신들은 정말 멋져 보여. 이 춤을 ...",말릭이 니키에게 춤을 요청한다. 말릭이 발을 밟는 것을 신경 쓰지 않는다면 니키는 ...,댄스


In [11]:
# Process the dataset for backtranslation
backtranslated_df = backtranslate_batch(first_half.head())

  0%|          | 0/1 [00:00<?, ?it/s]

An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'


100%|██████████| 1/1 [00:04<00:00,  4.66s/it]

An error occurred during translation: 'translations'





An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'transla

In [12]:
backtranslated_df

Unnamed: 0,fname,dialogue,summary,topic
0,train_0,#Person1#: [Translation Failed]\n#Person2#: [T...,"스미스씨가 건강검진을 받고 있고, 호킨스 의사는 매년 건강검진을 받는 것을 권장합니...",건강검진 받기
1,train_1,#Person1#: [Translation Failed]\n#Person2#: [T...,파커 부인이 리키를 데리고 백신 접종을 하러 갔다. 피터스 박사는 기록을 확인한 후...,백신
2,train_2,#Person1#: [Translation Failed]\n#Person2#: [T...,"#Person1#은 열쇠 한 묶음을 찾고 있고, 그것을 찾기 위해 #Person2#...",열쇠 찾기
3,train_3,#Person1#: [Translation Failed]\n#Person2#: [T...,#Person1#은 #Person2#가 여자친구가 있고 그녀와 결혼할 것이라는 사실...,여자친구가 있다
4,train_4,#Person1#: [Translation Failed]\n#Person2#: [T...,말릭이 니키에게 춤을 요청한다. 말릭이 발을 밟는 것을 신경 쓰지 않는다면 니키는 ...,댄스


In [None]:
# Process the dataset for backtranslation
backtranslated_df = backtranslate_batch(first_half)

# Save the back-translated dataset to a new CSV file
backtranslated_df.to_csv("/root/NLP/data/backtranslated_train.csv", index=False)

  0%|          | 0/63 [00:00<?, ?it/s]

An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'transla

  0%|          | 0/63 [00:11<?, ?it/s]

An error occurred during translation: 'translations'
An error occurred during translation: 'translations'





An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'translations'
An error occurred during translation: 'transla