In [None]:
# Mount Google Drive inside the Colab VM; the cells below read and write
# under /content/drive/MyDrive/jeju_ppodpo.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import glob
import json
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from pathlib import Path
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import multiprocessing

In [None]:
# Rebuild the two zip archives from their downloaded part files.
# NOTE(review): only `.part0` is concatenated for each archive; if an archive
# was split into several parts, every part must be listed here in order.
!cat /content/drive/MyDrive/jeju_ppodpo/path_to_files/TL01.zip.part0 > /content/drive/MyDrive/jeju_ppodpo/path_to_files/file1.zip
!cat /content/drive/MyDrive/jeju_ppodpo/path_to_files/TL02.zip.part0 > /content/drive/MyDrive/jeju_ppodpo/path_to_files/file2.zip

In [None]:
# Extract each rebuilt archive into its own directory (jeju1 / jeju2); these
# are the directories scanned for JSON files by the loading cell below.
!unzip /content/drive/MyDrive/jeju_ppodpo/path_to_files/file1.zip -d /content/drive/MyDrive/jeju_ppodpo/path_to_extract/jeju1/
!unzip /content/drive/MyDrive/jeju_ppodpo/path_to_files/file2.zip -d /content/drive/MyDrive/jeju_ppodpo/path_to_extract/jeju2/

In [None]:
def process_single_file(json_path):
    """Extract the dialect/standard sentence pairs from one transcription file.

    Args:
        json_path: Path (str or ``Path``) of a UTF-8 encoded JSON file whose
            ``transcription.sentences`` entry is a list of objects carrying
            ``dialect`` and ``standard`` keys.

    Returns:
        A list with one ``{'dialect': ..., 'standard': ...}`` dict per
        sentence, in file order. Any other keys of a sentence are dropped.

    Raises:
        KeyError: If an expected key is missing from the document.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(json_path, encoding='utf-8') as handle:
        payload = json.load(handle)

    return [
        {'dialect': entry['dialect'], 'standard': entry['standard']}
        for entry in payload['transcription']['sentences']
    ]

def load_jeju_dataset_parallel(dir_paths, use_recursive=False,
                               max_workers=None, executor_type='thread'):
    """Load every ``*.json`` transcription file under the given directories.

    Files are parsed concurrently with ``process_single_file`` and the
    resulting sentence pairs are concatenated into a single DataFrame.

    Args:
        dir_paths: Iterable of directory paths to scan.
        use_recursive: If True, descend into sub-directories (``rglob``);
            otherwise only the directory's own files are considered.
        max_workers: Size of the worker pool. When None, the CPU count is
            used for process pools and ``min(32, cpu_count * 5)`` for
            thread pools.
        executor_type: ``'process'`` selects ``ProcessPoolExecutor``; any
            other value selects ``ThreadPoolExecutor``.

    Returns:
        A DataFrame with the columns ``dialect`` and ``standard``. Rows follow
        the order of ``dir_paths`` and, within a directory, the sorted file
        paths, so repeated runs over the same files give the same row order.

    Raises:
        Whatever ``process_single_file`` raises for an unreadable or malformed
        file; the exception propagates and aborts the load.
    """
    use_processes = executor_type == 'process'
    Executor = ProcessPoolExecutor if use_processes else ThreadPoolExecutor

    # The pool size does not depend on the directory, so resolve it once.
    if max_workers is None:
        cpu_cnt = multiprocessing.cpu_count()
        max_workers_effective = cpu_cnt if use_processes else min(32, cpu_cnt * 5)
    else:
        max_workers_effective = max_workers

    all_records = []
    for dir_path in dir_paths:
        p = Path(dir_path)
        matches = p.rglob('*.json') if use_recursive else p.glob('*.json')
        # glob/rglob yield entries in filesystem order, which is not stable
        # across machines or mounts. Sorting makes the row order (and any
        # seeded shuffle/split applied afterwards) reproducible.
        json_files = sorted(matches)
        total = len(json_files)
        if total == 0:
            continue

        with Executor(max_workers=max_workers_effective) as executor:
            # executor.map preserves input order, so records stay aligned
            # with the sorted file list.
            for recs in tqdm(executor.map(process_single_file, json_files),
                             total=total, desc=f"Processing {p.name}", unit="file"):
                all_records.extend(recs)

    return pd.DataFrame(all_records, columns=['dialect', 'standard'])


if __name__ == "__main__":
    # Parse both extracted archives with a 24-thread pool (top-level files
    # only, no recursion) and persist the combined table as one CSV.
    base_dir = '/content/drive/MyDrive/jeju_ppodpo/path_to_extract'
    dirs = [f'{base_dir}/jeju1', f'{base_dir}/jeju2']
    loader_options = {
        'use_recursive': False,
        'max_workers': 24,
        'executor_type': 'thread',
    }
    df_jeju = load_jeju_dataset_parallel(dirs, **loader_options)
    df_jeju.to_csv('/content/drive/MyDrive/jeju_ppodpo/path_to_extract/jeju_all.csv', index=False)

In [None]:
# Reload the combined table from the CSV written by the loading cell above
# (presumably so the JSON parsing step can be skipped on later runs).
df_jeju = pd.read_csv('/content/drive/MyDrive/jeju_ppodpo/path_to_extract/jeju_all.csv')

In [None]:
# Shuffle all rows with a fixed seed, then renumber the index.
df_jeju = df_jeju.sample(frac=1, random_state=42).reset_index(drop=True)

# Cut the shuffled table by position: rows [0, 20000) -> df_sft,
# [20000, 55000) -> df_dpo, and everything from 55000 on -> df_test.
df_sft, df_dpo, df_test = (
    df_jeju.iloc[rows].reset_index(drop=True)
    for rows in (slice(None, 20000), slice(20000, 55000), slice(55000, None))
)

save_dir = Path('/content/drive/MyDrive/jeju_ppodpo/data')
save_dir.mkdir(parents=True, exist_ok=True)

# Write each split to its own CSV without the index column.
for frame, filename in (
    (df_sft, 'df_sft.csv'),
    (df_dpo, 'df_dpo.csv'),
    (df_test, 'df_test.csv'),
):
    frame.to_csv(save_dir / filename, index=False)

In [None]:
# Install/upgrade the LangChain packages imported by the translation cell below.
# NOTE(review): versions are not pinned, so `-U` pulls whatever is latest at
# run time; pin them (and consider `%pip`) if this must be reproducible.
!pip install -U langchain langchain-openai

In [None]:
import pandas as pd

# Reload the DPO split from the CSV saved by the split cell above.
df_dpo = pd.read_csv('/content/drive/MyDrive/jeju_ppodpo/data/df_dpo.csv')

In [None]:
import os
import re
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

from langchain_openai import ChatOpenAI
from langchain.schema import StrOutputParser
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)

# OpenAI API key setup.
# The secret must not live in the notebook source: reuse a key that is already
# exported in the environment, otherwise prompt for it without echoing.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")

# Model and chain configuration.
model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5)

# Prompt: asks for a natural standard-Korean translation of the Jeju-dialect
# input and tells the model to output only the translated sentence (no
# '표준어:' label, quotes or explanation) to avoid unwanted extra output.
human_template = (
    "제주 방언을 자연스러운 표준어로 번역하세요.\n"
    "반드시 번역된 문장만 출력하세요. '표준어:', 따옴표, 설명은 포함하지 마세요.\n"
    "제주 방언: {text}"
)
human_prompt = HumanMessagePromptTemplate.from_template(human_template)
chat_prompt = ChatPromptTemplate.from_messages([human_prompt])
# prompt -> chat model -> plain string output
chain = chat_prompt | model | StrOutputParser()

# Post-processing helper
def clean_output(output: str) -> str:
    """Strip wrapper noise from a model translation.

    Removes surrounding whitespace and straight quotes, and a leading
    '표준어' label (optionally followed by an ASCII or full-width colon).

    Args:
        output: Raw text returned by the model.

    Returns:
        The cleaned translation text.
    """
    strip_chars = ' "\'\n'
    # Quotes are stripped before the label is removed as well as after: the
    # label pattern is anchored at the start of the string, so a reply such as
    # "표준어: ..." wrapped in quotes would otherwise keep its label.
    output = output.strip().strip(strip_chars)
    output = re.sub(r'^표준어\s*[:：]?\s*', '', output)  # drop the '표준어:' label
    output = output.strip(strip_chars)  # drop remaining quotes and whitespace
    return output

# Translation helper (includes post-processing)
def translate_and_clean(text: str) -> str:
    """Translate one dialect sentence through the chain and clean the reply.

    Args:
        text: Sentence passed to the chain as the ``text`` prompt variable.

    Returns:
        The cleaned translation, or the string ``"[ERROR] <exception>"`` when
        invoking the chain or cleaning its output raises.
    """
    try:
        cleaned = clean_output(chain.invoke({"text": text}))
    except Exception as exc:
        # Best-effort: report the failure as text so the batch keeps running.
        return f"[ERROR] {exc}"
    return cleaned

# Run the translations in parallel on a thread pool.
results = [None] * len(df_dpo)
with ThreadPoolExecutor(max_workers=32) as executor:
    # Map every future back to its row position: as_completed yields futures
    # in completion order, not submission order.
    futures = {executor.submit(translate_and_clean, row): i for i, row in enumerate(df_dpo["dialect"])}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Translating"):
        idx = futures[future]
        try:
            results[idx] = future.result()
        except Exception as e:
            results[idx] = f"[ERROR] {e}"

# Store the results
df_dpo["rejected"] = results

# Failed calls are stored as "[ERROR] ..." text in the `rejected` column.
# Report them so they are not silently carried into the saved datasets.
failed_rows = [
    i for i, text in enumerate(results)
    if isinstance(text, str) and text.startswith("[ERROR] ")
]
if failed_rows:
    print(
        f"[WARN] {len(failed_rows)} of {len(results)} translations failed; "
        f"their 'rejected' value starts with '[ERROR] ' "
        f"(first rows: {failed_rows[:10]})"
    )


In [None]:
# `Path` is imported locally: none of the cells from the `df_dpo` reload
# onwards import it, so a session resumed there would otherwise raise NameError.
from pathlib import Path

# Split the translated DPO table by position: the first 20000 rows go to
# df_rew and the remainder to df_rl.
df_rew  = df_dpo.iloc[:20000].reset_index(drop=True)
df_rl  = df_dpo.iloc[20000:].reset_index(drop=True)

save_dir = Path('/content/drive/MyDrive/jeju_ppodpo/data')
save_dir.mkdir(parents=True, exist_ok=True)

# df_dpo.csv is overwritten in place, now including the `rejected` column.
df_dpo.to_csv(save_dir / 'df_dpo.csv',  index=False)
df_rew.to_csv(save_dir / 'df_rew.csv',  index=False)
df_rl.to_csv(save_dir / 'df_rl.csv', index=False)