In [None]:
# Install required packages if needed (%pip installs into this kernel's environment)
# %pip install openai pandas python-dotenv

In [None]:
import openai
import pandas as pd
import os
from time import sleep
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

## 1. Set Up the OpenAI API

Make sure your OpenAI API key is set in the `OPENAI_API_KEY` environment variable or in a `.env` file.

In [None]:
# Configure the OpenAI API key
# Preferred: read it from the OPENAI_API_KEY environment variable
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Alternative: assign it directly (not recommended for production)
# openai.api_key = "your-api-key-here"

# Report whether a usable (non-empty) key was found
if openai.api_key:
    print("OpenAI API key loaded successfully!")
else:
    print("WARNING: OpenAI API key not found!")
    print("Please set OPENAI_API_KEY environment variable or assign it directly.")

## 2. Define Translation Function

In [None]:
def translate_with_openai(texts, model="gpt-3.5-turbo", temperature=0.0,
                          max_tokens=100, delay=0.5):
    """
    Translate Hinglish texts to English using the OpenAI chat completions API.

    Works with both the current SDK (``openai.chat.completions``, openai>=1.0)
    and the legacy SDK (``openai.ChatCompletion``, openai<1.0).

    Args:
        texts: Iterable of input texts to translate. A single string is
            treated as one text instead of being iterated character by character.
        model: OpenAI model to use (gpt-3.5-turbo, gpt-4, etc.)
        temperature: Sampling temperature (0 = deterministic)
        max_tokens: Upper bound on the tokens generated per translation.
        delay: Seconds to pause after each successful request (simple rate
            limiting). Use 0 to disable the pause.

    Returns:
        List with one entry per input text, in input order. When a request
        fails, the entry is the marker string "[ERROR: <message>]" and the
        error is also printed.
    """
    # A bare string is iterable; without this guard each character would be
    # sent as its own translation request.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return []

    # openai>=1.0 removed openai.ChatCompletion; the module-level client there
    # exposes openai.chat.completions instead. Fall back to the legacy call
    # when running against an older SDK.
    if hasattr(openai, "chat"):
        create_completion = openai.chat.completions.create
    else:
        create_completion = openai.ChatCompletion.create

    system_prompt = (
        "You are a professional translator specializing in Hinglish to English translation. "
        "Translate the given Hinglish text to natural, fluent English. "
        "Preserve the meaning and intent of the original text. "
        "Only provide the translation, no additional commentary."
    )

    outputs = []
    for text in texts:
        try:
            response = create_completion(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"Translate to English: {text}"}
                ],
                temperature=temperature,
                max_tokens=max_tokens
            )

            translation = response.choices[0].message.content.strip()
            outputs.append(translation)

            # Rate limiting - adjust via the `delay` argument
            if delay > 0:
                sleep(delay)

        except Exception as e:
            # Best effort: keep going so one failure does not lose the batch,
            # but leave a visible marker in the output.
            print(f"Error translating '{text}': {e}")
            outputs.append(f"[ERROR: {str(e)}]")

    return outputs

## 3. Define Sentence Triplets

In [None]:
# Example triplets: a base sentence plus two reordered variants of it
sentence_triplets = [
    dict(
        triplet_id=1,
        base="train kaha tak jati hai",
        variant_topic_fronting="kaha tak jati hai train",
        variant_emphasis_shift="jati hai kaha tak train",
    ),
    # Add more triplets here as needed
]

triplet_count = len(sentence_triplets)
print(f"Loaded {triplet_count} sentence triplets")

## 4. Translate All Variants

**Note**: Running the next cell makes one API call per variant (three per triplet) and may incur costs. Change the `MODEL` constant in that cell to use a different model.

In [None]:
# Model used for every request in this run
# Options: "gpt-3.5-turbo", "gpt-4", "gpt-4-turbo-preview"
MODEL = "gpt-3.5-turbo"


def _translate_one(text):
    """Translate a single sentence with MODEL and return the translated string."""
    return translate_with_openai([text], model=MODEL)[0]


# One record per (triplet, variant), collected in processing order
results = []

for triplet in sentence_triplets:
    triplet_id = triplet["triplet_id"]
    print(f"\nProcessing triplet {triplet_id}...")

    # Translate and report the variants one at a time so progress is visible
    base_trans = _translate_one(triplet["base"])
    print(f"  Base: {base_trans}")

    topic_trans = _translate_one(triplet["variant_topic_fronting"])
    print(f"  Topic fronting: {topic_trans}")

    emphasis_trans = _translate_one(triplet["variant_emphasis_shift"])
    print(f"  Emphasis shift: {emphasis_trans}")

    # (variant label, key of the source sentence in the triplet, translation)
    translated_variants = (
        ("base", "base", base_trans),
        ("topic_fronting", "variant_topic_fronting", topic_trans),
        ("emphasis_shift", "variant_emphasis_shift", emphasis_trans),
    )
    results.extend(
        {
            "triplet_id": triplet_id,
            "variant_type": variant_type,
            "input_hinglish": triplet[source_key],
            "llm_translation": translation,
            "model": MODEL,
        }
        for variant_type, source_key, translation in translated_variants
    )

print(f"\nTranslation complete! Generated {len(results)} translations.")

## 5. Display Results

In [None]:
# Tabulate the collected translation records for easier inspection
df_results = pd.DataFrame(data=results)
df_results

In [None]:
# Print every triplet under its own banner, in order of first appearance
banner = "=" * 80

for triplet_id in df_results['triplet_id'].unique():
    print(f"\n{banner}")
    print(f"TRIPLET {triplet_id}")
    print(f"{banner}")
    triplet_df = df_results.loc[df_results['triplet_id'] == triplet_id]

    # Walk the three columns in parallel instead of materialising each row
    variant_rows = zip(
        triplet_df['variant_type'],
        triplet_df['input_hinglish'],
        triplet_df['llm_translation'],
    )
    for variant_type, hinglish, english in variant_rows:
        print(f"\n{variant_type.upper()}:")
        print(f"  Input:  {hinglish}")
        print(f"  Output: {english}")

## 6. Save Results

In [None]:
# Write the results to a CSV named after the model (dashes become underscores)
model_slug = MODEL.replace('-', '_')
output_file = "llm_translations_" + model_slug + ".csv"
df_results.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")

## 7. Experiment with Different Prompts (Optional)

In [None]:
def translate_with_custom_prompt(text, custom_system_prompt, model="gpt-3.5-turbo",
                                 temperature=0.0, max_tokens=100):
    """
    Translate one text using a caller-supplied system prompt.

    Intended for comparing how different system prompts affect the output.
    Works with both the current SDK (``openai.chat.completions``, openai>=1.0)
    and the legacy SDK (``openai.ChatCompletion``, openai<1.0).

    Args:
        text: The text to translate; sent unchanged as the user message.
        custom_system_prompt: System prompt to send with the request.
        model: OpenAI model to use.
        temperature: Sampling temperature (0 = deterministic).
        max_tokens: Upper bound on the tokens generated for the reply.

    Returns:
        The stripped reply text, or the marker string "[ERROR: <message>]"
        if the request (or reading its response) raised an exception.
    """
    try:
        # openai>=1.0 removed openai.ChatCompletion; the module-level client
        # there exposes openai.chat.completions instead. Resolved inside the
        # try block so SDK problems are reported like any other failure.
        if hasattr(openai, "chat"):
            create_completion = openai.chat.completions.create
        else:
            create_completion = openai.ChatCompletion.create

        response = create_completion(
            model=model,
            messages=[
                {"role": "system", "content": custom_system_prompt},
                {"role": "user", "content": text}
            ],
            temperature=temperature,
            max_tokens=max_tokens
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        return f"[ERROR: {str(e)}]"

# Candidate system prompts to compare against each other
prompts_to_test = [
    "Translate the following Hinglish sentence to English:",
    "Convert this Hindi-English mixed text to pure English:",
    "You are an expert in Hinglish. Translate to English naturally:",
]

# Sample sentence to run through each prompt
test_sentence = "train kaha tak jati hai"

# for prompt in prompts_to_test:
#     result = translate_with_custom_prompt(test_sentence, prompt)
#     print(f"Prompt: {prompt}")
#     print(f"Result: {result}")
#     print()
#     sleep(1)

## 8. Load and Translate from Dataset (Optional)

In [None]:
import json

# Load from dataset file
# with open('dataset/db.json', 'r', encoding='utf-8') as f:
#     dataset = json.load(f)

# Process dataset...
# Add your data loading logic here

## 9. Cost Estimation

Estimate the approximate API cost from the number of translations and an assumed average token count per sentence.

In [None]:
# Approximate prices in USD per 1K tokens (as of 2024)
# Update these based on current OpenAI pricing

PRICING = {
    "gpt-3.5-turbo": {
        "input": 0.0005,
        "output": 0.0015,
    },
    "gpt-4": {
        "input": 0.03,
        "output": 0.06,
    },
    "gpt-4-turbo-preview": {
        "input": 0.01,
        "output": 0.03,
    },
}

def estimate_cost(num_sentences, model="gpt-3.5-turbo", avg_tokens_per_sentence=50):
    """
    Roughly estimate what translating ``num_sentences`` sentences costs.

    Args:
        num_sentences: Number of sentences to be translated.
        model: Key into PRICING selecting the per-1K-token rates.
        avg_tokens_per_sentence: Assumed input tokens per sentence.

    Returns:
        A dict with the model, sentence count, estimated input and output
        token counts, and the estimated cost in USD rounded to 4 decimals;
        or the string "Unknown model" when ``model`` has no PRICING entry.
    """
    rates = PRICING.get(model)
    if rates is None:
        return "Unknown model"

    tokens_in = num_sentences * avg_tokens_per_sentence
    # Output is assumed to be slightly shorter than the input (80%)
    tokens_out = num_sentences * avg_tokens_per_sentence * 0.8

    # Rates are quoted per 1K tokens
    cost_in = (tokens_in / 1000) * rates["input"]
    cost_out = (tokens_out / 1000) * rates["output"]

    return {
        "model": model,
        "sentences": num_sentences,
        "estimated_input_tokens": tokens_in,
        "estimated_output_tokens": tokens_out,
        "estimated_cost_usd": round(cost_in + cost_out, 4),
    }

# Estimate cost for the translations produced above
num_translations = len(results)
cost_estimate = estimate_cost(num_translations, MODEL)
print("Cost Estimate:")
if isinstance(cost_estimate, dict):
    for key, value in cost_estimate.items():
        print(f"  {key}: {value}")
else:
    # estimate_cost returns a plain message (not a dict) for models that have
    # no PRICING entry; calling .items() on it would raise AttributeError.
    print(f"  {cost_estimate} ({MODEL}): add it to PRICING to get an estimate")