In [10]:
import ast
import json

from transformers import pipeline

def load_data(dataset_name, split, task, sentiment_elements=None):
    """Load annotated examples from ``data/<task>/<dataset_name>/<split>.txt``.

    Every non-blank line must look like ``<text>####<annotation literal>``,
    where the annotation is a Python literal holding a list of aspect tuples.

    Args:
        dataset_name: Name of the dataset directory (e.g. ``"rest16"``).
        split: File stem of the split to read (e.g. ``"train"``).
        task: Task name; ``"tasd"``, ``"acd"`` and ``"e2e"`` all read from the
            ``tasd`` directory, any other value is used as the directory name.
        sentiment_elements: List of element names deciding how each aspect
            tuple is turned into a dict. When ``None`` (the default) the
            module-level ``considered_sentiment_elements`` is used, which is
            the original behaviour. Supported values are the three lists in
            ``supported`` below; for any other value the parsed annotation is
            returned unchanged.

    Returns:
        A list of ``{"text": ..., "label": ...}`` dicts, one per non-blank line.

    Raises:
        ValueError: If a line has no ``####`` separator or its annotation is
            not a plain Python literal.
        SyntaxError: If the annotation is not valid Python syntax.
    """
    if sentiment_elements is None:
        # Backward-compatible default: fall back to the module-level setting.
        sentiment_elements = considered_sentiment_elements

    # Position of each sentiment element inside an annotation tuple.
    field_index = {
        "aspect_term": 0,
        "aspect_category": 1,
        "sentiment_polarity": 2,
        "opinion_term": 3,
    }
    # Only these exact configurations are converted to dicts.
    supported = (
        ["aspect_term", "aspect_category", "sentiment_polarity", "opinion_term"],
        ["aspect_category"],
        ["aspect_term", "aspect_category", "sentiment_polarity"],
    )
    # Both checks are loop-invariant, so evaluate them once.
    convert = sentiment_elements in supported
    deduplicate = sentiment_elements == ["aspect_category"]

    examples = []
    task_str = "tasd" if task in ['tasd', 'acd', 'e2e'] else task
    with open(f"data/{task_str}/{dataset_name}/{split}.txt", "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            # Split on the LAST separator so "####" inside the text is kept.
            text, aspect_str = line.rsplit("####", 1)
            # literal_eval only accepts literals, so file content is never executed.
            aspect_list = ast.literal_eval(aspect_str)

            if convert:
                aspect_list = [
                    {key: aspect[field_index[key]] for key in sentiment_elements}
                    for aspect in aspect_list
                ]
                if deduplicate:
                    # dict.fromkeys drops duplicates while keeping first-seen
                    # order, so the result is the same on every run.
                    unique_items = dict.fromkeys(
                        tuple(d.items()) for d in aspect_list
                    )
                    aspect_list = [dict(items) for items in unique_items]

            examples.append({
                "text": text,
                "label": aspect_list
            })
    return examples

# Build the English -> German translation pipeline from a Hugging Face model.
# NOTE(review): the recorded output of this cell warns that a hardware
# accelerator was available but no `device` argument was passed, so the model
# ran on CPU. Consider passing `device=` to `pipeline` if speed matters.
_TRANSLATION_TASK = "translation_en_to_de"
_TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-en-de"

print("Loading translation model...")
translator = pipeline(_TRANSLATION_TASK, model=_TRANSLATION_MODEL)
print("Translation model loaded successfully!")

def translate_to_german(text):
    """Return the German translation of ``text``.

    The text is passed to the module-level ``translator`` pipeline with
    ``max_length=512`` and the ``translation_text`` of the first result is
    returned. Translation is best-effort: if anything raises, the error is
    printed and the original ``text`` is returned unchanged.
    """
    try:
        outputs = translator(text, max_length=512)
        german = outputs[0]['translation_text']
    except Exception as err:
        print(f"Translation error for '{text}': {err}")
        # Fall back to the untranslated input so callers always get a string.
        return text
    return german

# Sentiment elements that load_data extracts from each annotation tuple.
considered_sentiment_elements = ["aspect_term", "aspect_category", "sentiment_polarity", "opinion_term"]
train_data = load_data("rest16", "train", "asqp")

# Two disjoint pools taken from the first 200 training examples.
pool_a = train_data[:100]
pool_b = train_data[100:200]
# Print the sizes explicitly: a bare expression in the middle of a cell shows nothing.
print(f"Pool sizes: pool_a={len(pool_a)}, pool_b={len(pool_b)}")


def _translate_pool(pool, pool_name):
    """Translate every example in ``pool`` and return the records to save.

    Each record keeps the source ``text`` and adds its ``translation``; the
    ``label`` key is dropped. Progress is printed every 10 items and once more
    at the end if the pool size is not a multiple of 10.
    """
    total = len(pool)  # real pool size; slices may hold fewer than 100 items
    processed = []
    for count, item in enumerate(pool, start=1):
        processed.append({
            "text": item["text"],
            "translation": translate_to_german(item["text"])
        })
        if count % 10 == 0 or count == total:
            print(f"Translated {count}/{total} texts for {pool_name}")
    return processed


def _save_json(records, path):
    """Write ``records`` to ``path`` as indented UTF-8 JSON, keeping non-ASCII characters."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)


# Prepare data for saving (remove 'label' key and add 'translation' key)
print("Translating texts to German...")
pool_a_processed = _translate_pool(pool_a, "pool_a")
pool_b_processed = _translate_pool(pool_b, "pool_b")

# Save pool_a.json and pool_b.json
_save_json(pool_a_processed, "pool_a.json")
_save_json(pool_b_processed, "pool_b.json")

print("Successfully saved pool_a.json and pool_b.json with German translations")

  from .autonotebook import tqdm as notebook_tqdm


Loading translation model...


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Translation model loaded successfully!
Translating texts to German...
Translated 10/100 texts for pool_a
Translated 10/100 texts for pool_a
Translated 20/100 texts for pool_a
Translated 20/100 texts for pool_a
Translated 30/100 texts for pool_a
Translated 30/100 texts for pool_a
Translated 40/100 texts for pool_a
Translated 40/100 texts for pool_a
Translated 50/100 texts for pool_a
Translated 50/100 texts for pool_a
Translated 60/100 texts for pool_a
Translated 60/100 texts for pool_a
Translated 70/100 texts for pool_a
Translated 70/100 texts for pool_a
Translated 80/100 texts for pool_a
Translated 80/100 texts for pool_a
Translated 90/100 texts for pool_a
Translated 90/100 texts for pool_a
Translated 100/100 texts for pool_a
Translated 100/100 texts for pool_a
Translated 10/100 texts for pool_b
Translated 10/100 texts for pool_b
Translated 20/100 texts for pool_b
Translated 20/100 texts for pool_b
Translated 30/100 texts for pool_b
Translated 30/100 texts for pool_b
Translated 40/100 