# Data reformatting using GPT



In [None]:
import json
import os
import time
import openai
from openai.error import RateLimitError

# Read the OpenAI API key from the OPENAI_API_KEY environment variable instead
# of hard-coding it in the file; os.getenv yields None when the variable is unset.
openai.api_key = os.getenv('OPENAI_API_KEY')  # Recommended method

def translate_text(text, max_retries=10, retry_delay=10):
    """Translate Korean text to English via the OpenAI chat completion API.

    Args:
        text: Korean source text. ``None`` and whitespace-only strings are
            treated as an empty note.
        max_retries: How many times to retry after a rate-limit error before
            giving up.
        retry_delay: Seconds to sleep before each rate-limit retry.

    Returns:
        The stripped English translation; ``""`` for an empty note; or the
        original ``text`` unchanged when the request fails or the rate limit
        is still exceeded after ``max_retries`` retries.
    """
    # A JSON ``null`` note arrives here as None; treat it like an empty note
    # instead of failing on ``None.strip()``.
    if text is None or not text.strip():
        return ""

    prompt = f"Translate the following Korean text to English:\n\n{text}"

    # One initial attempt plus up to ``max_retries`` retries. The loop is
    # bounded so a persistent rate-limit error cannot stall the run forever.
    for attempt in range(max_retries + 1):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4o-mini",  # Use 'gpt-4' if you have access
                messages=[
                    {"role": "user", "content": prompt}
                ],
                max_tokens=1024,
                n=1,
                temperature=0.3,
            )
            return response['choices'][0]['message']['content'].strip()
        except RateLimitError:
            if attempt == max_retries:
                break
            print(f"Rate limit exceeded. Waiting for {retry_delay} seconds before retrying...")
            time.sleep(retry_delay)
        except Exception as e:
            # Best-effort: keep the untranslated note rather than abort the batch.
            print(f"Error translating text: {e}")
            return text  # Return original text if translation fails

    print(f"Rate limit still exceeded after {max_retries} retries. Keeping original text.")
    return text

def translate_notes_in_json(input_file, output_file, max_entries=None):
    """Translate the ``note`` field of each audio feature and save the result.

    Reads ``input_file`` as JSON, takes the list under its ``"audio_features"``
    key, replaces each entry's ``"note"`` with the result of
    ``translate_text``, and writes ``{"audio_features": [...]}`` containing
    only the processed entries to ``output_file``.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the JSON file to write.
        max_entries: Maximum number of entries to process. ``None`` processes
            every entry; ``0`` or a negative value processes none.
    """
    with open(input_file, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    # ``or []`` also covers an explicit JSON null under "audio_features".
    audio_features = data.get("audio_features") or []
    if max_entries is not None:
        # Test against None explicitly so max_entries=0 means "process
        # nothing" rather than "no limit"; clamp so a negative limit does not
        # turn into a from-the-end slice.
        audio_features = audio_features[:max(max_entries, 0)]

    translated_data = []
    for feature in audio_features:
        # Update the note in place with the translated text.
        feature['note'] = translate_text(feature.get('note', ''))
        translated_data.append(feature)
        print(f"Translated note for file {feature.get('file_path', '')}")

    # Save the translated entries into output_file
    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump({"audio_features": translated_data}, outfile, ensure_ascii=False, indent=4)

    print(f"Translated data has been saved to {output_file}")

if __name__ == "__main__":
    # Source JSON holding the Korean notes, and the destination file for the
    # copy with translated notes.
    input_json, output_json = (
        "audio_features_note.json",
        "audio_features_note_augmentationed.json",
    )
    # Pass an integer as max_entries to limit how many entries are processed
    # during testing.
    translate_notes_in_json(
        input_file=input_json,
        output_file=output_json,
        max_entries=None,
    )


### GPT-4o Mini 토큰 비용 계산

#### **주어진 조건**
- **입력:** 200자 미만의 한국어 → 약 `100~120 토큰`
- **출력:** 400자 정도 되는 영어 → 약 `200~250 토큰`
- **데이터 개수:** 400개
- **GPT-4o Mini 토큰 비용:**
  - **입력 토큰:** 1백만 개당 **$0.15**
  - **출력 토큰:** 1백만 개당 **$0.60**

---

### **계산 과정**

1. **평균 입력/출력 토큰 계산**
   - **평균 입력 토큰:** \( \frac{100 + 120}{2} = 110 \)  
   - **평균 출력 토큰:** \( \frac{200 + 250}{2} = 225 \)  
   - **평균 총 토큰 (입력 + 출력):** \( 110 + 225 = 335 \)

2. **총 토큰 수**
   - \( 335 \, \text{토큰/데이터} \times 400 \, \text{데이터} = 134,000 \, \text{토큰} \)

3. **비용 계산**
   - **입력 토큰 비용:**  
     데이터 1개당 평균 입력 토큰이 \( 110 \, \text{토큰} \)이므로,  
     \( 110 \times 400 = 44,000 \, \text{입력 토큰} \)  
     - 1백만 개당 $0.15:
     \[
     \frac{44,000}{1,000,000} \times 0.15 = 0.0066 \, \text{USD}
     \]
   
   - **출력 토큰 비용:**  
     데이터 1개당 평균 출력 토큰이 \( 225 \, \text{토큰} \)이므로,  
     \( 225 \times 400 = 90,000 \, \text{출력 토큰} \)  
     - 1백만 개당 $0.60:
     \[
     \frac{90,000}{1,000,000} \times 0.60 = 0.054 \, \text{USD}
     \]

4. **총 비용**
   - **입력 비용 + 출력 비용**:
     \[
     0.0066 + 0.054 = 0.0606 \, \text{USD}
     \]

---

### **결과**
- **총 토큰 수:** 134,000 토큰
- **예상 비용:** 약 **$0.0606** (모델 요금이 $0.15/1M 입력 토큰, $0.60/1M 출력 토큰일 경우)

---

**참고**: 실제 비용은 사용량, 실제 토큰 수, 요금 정책에 따라 달라질 수 있습니다. 정확한 비용은 OpenAI의 [최신 요금 정책](https://openai.com/api/pricing/)에서 확인하세요.
