In [8]:
from konlpy.tag import Okt
import pandas as pd
import random
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm
import os

# Initialize the Okt morphological analyzer
okt = Okt()

In [9]:
# Initialize the Okt morphological analyzer; process_batch and
# augment_korean_text below read this module-level instance.
okt = Okt()

def process_batch(texts, batch_size=32):
    """Run POS tagging over ``texts``, walking the input in slices.

    Args:
        texts: Sliceable sequence of sentences.
        batch_size: Number of sentences taken per slice.

    Returns:
        A list with one entry per sentence, each being the result of
        ``okt.pos(sentence, norm=True, stem=True)``, in input order.
    """
    # Slices are visited left to right, so flattening keeps the input order.
    return [
        okt.pos(sentence, norm=True, stem=True)
        for start in range(0, len(texts), batch_size)
        for sentence in texts[start:start + batch_size]
    ]

def augment_korean_text(text, num_augments=2):
    """Generate up to ``num_augments`` variants of ``text`` by synonym swapping.

    The sentence is tagged with the module-level ``okt`` analyzer (with
    ``norm=True, stem=True``). Every adjective or verb that has a non-empty
    entry in the module-level ``synonyms`` mapping is replaced, with
    probability 0.7, by a randomly chosen synonym. The morphemes are then
    joined with single spaces, so a variant is built from the tagger's
    morphemes rather than from the original surface form.

    Args:
        text: Sentence to augment. Non-string or blank values yield no
            variants instead of being handed to the tagger.
        num_augments: Maximum number of distinct variants to return. At most
            ``num_augments * 2`` attempts are made.

    Returns:
        A list of distinct augmented sentences in the order they were
        generated. It may be empty or shorter than ``num_augments``.
    """
    # Guard against missing values (e.g. NaN from an empty CSV cell) and
    # blank strings before touching the tagger.
    if not isinstance(text, str) or not text.strip():
        return []

    morphs = okt.pos(text, norm=True, stem=True)

    # Resolve the synonym candidates once; they do not change between attempts.
    # Empty candidate lists are treated as "no synonym" because random.choice
    # raises IndexError on an empty sequence.
    replaceable_tags = ('Adjective', 'Verb')
    candidates_per_morph = []
    for word, pos in morphs:
        candidates = None
        if pos in replaceable_tags and word in synonyms:
            found = synonyms[word]
            if len(found) > 0:
                candidates = found
        candidates_per_morph.append(candidates)

    # Without any replaceable morpheme no attempt can produce a variant.
    if all(candidates is None for candidates in candidates_per_morph):
        return []

    # A dict keeps the variants unique while preserving generation order,
    # which makes the output reproducible under a fixed random seed.
    augmented_texts = {}

    for _ in range(num_augments * 2):  # try more often than the requested count
        if len(augmented_texts) >= num_augments:
            break

        new_text = []
        changed = False

        for (word, _pos), candidates in zip(morphs, candidates_per_morph):
            if candidates is not None and random.random() < 0.7:
                new_text.append(random.choice(candidates))
                changed = True
            else:
                new_text.append(word)

        new_sentence = ' '.join(new_text)
        if changed and new_sentence != text:
            augmented_texts[new_sentence] = None

    return list(augmented_texts)

def process_chunk(chunk):
    """Build augmented records for every row of a DataFrame slice.

    Args:
        chunk: DataFrame slice whose rows provide the "입력 문장", "도수",
            "술 종류" and "맛" columns.

    Returns:
        A list of dicts, one per augmented sentence, each carrying the
        augmented sentence together with the source row's other three
        column values. Rows without any augmentation contribute nothing.
    """
    # Rows are visited in order, and each row's variants stay grouped together.
    return [
        {
            "입력 문장": variant,
            "도수": record["도수"],
            "술 종류": record["술 종류"],
            "맛": record["맛"],
        }
        for _, record in chunk.iterrows()
        for variant in augment_korean_text(record["입력 문장"])
    ]

# Main entry point: augment the raw dataset and save it together with the
# original rows.
if __name__ == "__main__":
    # Imported locally because this block is the only user of these names and
    # they are not among the imports at the top of the file.
    import multiprocessing
    from concurrent.futures import ThreadPoolExecutor

    # Load the raw data.
    df = pd.read_csv("data/raw_data.csv", encoding='utf-8')

    # Split into chunks based on the CPU core count, leaving one core free.
    # Clamp to at least one worker: on a single-core machine cpu_count() - 1
    # is 0, which would divide by zero below and is not a valid max_workers.
    num_cores = max(1, multiprocessing.cpu_count() - 1)
    chunk_size = max(1, len(df) // num_cores)
    chunks = [df[i:i + chunk_size] for i in range(0, len(df), chunk_size)]

    augmented_data = []

    # Process the chunks concurrently.
    with ThreadPoolExecutor(max_workers=num_cores) as executor:
        futures = [executor.submit(process_chunk, chunk) for chunk in chunks]

        # Show progress with tqdm; results are collected in submission order.
        for future in tqdm(futures, total=len(chunks), desc="Processing chunks"):
            augmented_data.extend(future.result())

    # Include the original rows after the augmented ones.
    original_data = df.to_dict('records')
    augmented_data.extend(original_data)

    # Convert the combined records to a DataFrame and save it.
    final_df = pd.DataFrame(augmented_data)
    final_df.to_csv("data/augmented_data.csv", index=False, encoding='utf-8')

Processing chunks:   0%|          | 0/13 [00:00<?, ?it/s]

Processing chunks: 100%|██████████| 13/13 [00:00<00:00, 161.06it/s]
