Version 1.0, 1-10-2025

# **4. Skript: Übersetzung der Stellungnahmen**

**Annahmen und Voraussetzungen**

- Übersetzt wird der Text in der Spalte "Bemerkung".
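- Falls bei der Aggregierung durchgestrichener Text als Unicode-Text codiert wurde (kombinierendes Durchstreichungszeichen U+0336), werden Sätze, die solchen durchgestrichenen Text enthalten, NICHT übersetzt, sondern unverändert übernommen.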

**Skript läuft in Azure Machine Learning Studio. Empfohlene Compute-Umgebung: 16 Kerne, 64 GB RAM, 400 GB Festplatte (CPU). Kernel: Python 3.10 SDK v2.**

**1. Zelle: Installationen von zusätzlichen Libraries**

In [None]:
%pip install langdetect transformers nltk sentencepiece ipywidgets torch openpyxl
%pip install pandas==2.1.1 numpy==1.26.2

**2. Zelle: Imports, Parameter definieren, fest codierte Angaben setzen**

In [None]:
# 28.3.2025: version with folder handling and strikethrough handling (works).
# Strikethrough handling during translation: sentences that contain strikethrough text are NOT translated.
# Choose the source and target languages at the beginning (SRC_LANG / TGT_LANG below).

import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
from langdetect import detect
import time
import os
import nltk
from nltk.tokenize import sent_tokenize, LineTokenizer
nltk.download('punkt_tab')
print ("nltk ok")
import math
import torch
import re

# Choose the input file: the Excel workbook whose sheets are translated.
INPUTFILE = "/YOUR/INPUT/FOLDER/PATH/HERE/FILE.XLSX"

# Temporary directory that receives one translated workbook per sheet.
# The makedirs call below is commented out, so this cell does not create the directory.
output_dir = "/YOUR/TEMPORARY/DIRECTORY/PATH/HERE/"
#os.makedirs(output_dir, exist_ok=True)

# Choose source and target language below (src/tgt_lang="fr/de/en/it").
# Both codes are also used to build the name of the final file.
SRC_LANG = "fr"
TGT_LANG = "de"

# Path and filename of the final, combined workbook.
COMBINED_FILE = f'/YOUR/OUTPUT/PATH/HERE_{SRC_LANG}_to_{TGT_LANG}.xlsx'


**3. Zelle: Hauptskript**

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
dev = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
print(device)

# Reference timestamp (seconds since the epoch) for measuring the run time.
start_time = time.time()
print(start_time)

# Detect the language of a text and report whether it is the configured source language.
def is_language(text, source_language=SRC_LANG):
    """Return True when *text* is detected as *source_language*.

    Args:
        text: Cell content to classify. Anything that is not a non-blank
            string (for example NaN from an empty spreadsheet cell) yields
            False without calling the detector.
        source_language: Language code compared with the result of
            ``detect``; defaults to ``SRC_LANG``.

    Returns:
        bool: True if detection succeeds and equals *source_language*,
        otherwise False.
    """
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == source_language
    except Exception:
        # Best effort: text the detector cannot classify counts as "not the
        # source language". Unlike a bare ``except``, this does not swallow
        # KeyboardInterrupt/SystemExit, so a long run can still be aborted.
        return False

# Language markers that are stripped from translated text:
#   - "<xx/NN/" prefixes (two lowercase letters, two digits),
#   - the de/fr/it/en marker variants, with or without "<", ">" or "→",
#   - any other "<xx...>" tag that starts with two lowercase letters.
_LANGUAGE_TAG_RE = re.compile(r'(?:<[a-z]{2}/\d{2}/|<de>|<de→|de→|<de >|de>|<fr>|<fr→|fr→|<fr >|fr>|<it>|<it→|it→|<it >|it>|<en>|<en→|en→|<en >|en>|<[a-z]{2}[^>]*>)')


def remove_language_tags(text):
    """Delete every language marker from *text* and trim surrounding whitespace."""
    without_tags = _LANGUAGE_TAG_RE.sub('', text)
    return without_tags.strip()

# Translates a text sentence by sentence; sentences that contain strikethrough text are kept untranslated.
def translate_text(input_text, model, tokenizer, src_lang=SRC_LANG, tgt_lang=TGT_LANG):
    """Translate *input_text* sentence by sentence, keeping its line structure.

    The text is split into lines with ``LineTokenizer`` and every line into
    sentences with ``sent_tokenize``. A sentence containing the combining
    strikethrough character U+0336 is copied unchanged; every other sentence
    is translated on its own. The sentences of a line are joined with a
    single space, the lines with a newline.

    Args:
        input_text: Text to translate.
        model: Translation model offering ``generate()`` and a ``device``
            attribute (``main`` passes a ``MarianMTModel``).
        tokenizer: Tokenizer matching *model*.
        src_lang: Language code written as ``<xx>`` marker before each
            sentence. It does not select the model.
        tgt_lang: Language code written as ``<xx>`` marker after each
            sentence. It does not select the model.

    Returns:
        str: The translated text.
    """
    strikethrough_mark = '\u0336'  # COMBINING LONG STROKE OVERLAY
    max_input_tokens = 500
    # The output budget must not be smaller than the input budget, otherwise
    # the translation of a long sentence is cut off in the middle.
    # NOTE(review): 512 assumes the usual opus-mt position limit - confirm
    # against the config of the model in use.
    max_output_tokens = 512

    translated_paragraphs = []
    for paragraph in LineTokenizer().tokenize(input_text):
        translated = []
        for sent in sent_tokenize(paragraph):
            if strikethrough_mark in sent:
                # Keep the original sentence if it contains strikethrough text.
                translated.append(sent)
                continue

            # NOTE(review): the markers are kept to preserve the existing
            # behaviour; whether a single-pair opus-mt model benefits from
            # them should be confirmed. They are stripped again below.
            processed_sent = f"<{src_lang}> {sent} <{tgt_lang}>"
            model_inputs = tokenizer(
                [processed_sent],
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_input_tokens,
            ).to(model.device)  # tensors must live on the model's device

            with torch.no_grad():
                generated = model.generate(
                    **model_inputs,
                    num_beams=5,
                    no_repeat_ngram_size=3,
                    max_length=max_output_tokens,
                )
            decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
            translated.append(remove_language_tags(decoded))

        translated_paragraphs.append(" ".join(translated))

    return "\n".join(translated_paragraphs)


def translate_and_add_column(df, model, tokenizer, src_col='Bemerkung', tgt_col=f'Nach_{TGT_LANG}_übersetzte Bemerkung'):
    """Add a translation column to *df* and fill it for source-language rows.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame holding one sheet of the workbook.
        model: Translation model handed on to ``translate_text``.
        tokenizer: Tokenizer handed on to ``translate_text``.
        src_col: Name of the column whose text is translated.
        tgt_col: Name of the column that receives the translation. If it is
            missing it is inserted directly to the right of *src_col*,
            initialised with empty strings.

    Returns:
        The same DataFrame. Rows whose text is not detected as the source
        language keep their current *tgt_col* value. A frame without
        *src_col* is returned unchanged.
    """
    if src_col not in df.columns:
        # A sheet without the source column must not abort the whole run.
        print(f"Column '{src_col}' not found - sheet left untranslated")
        return df

    # Add the target column right next to the source column if not present.
    if tgt_col not in df.columns:
        df.insert(df.columns.get_loc(src_col) + 1, tgt_col, '')

    # Translate only the rows whose text is detected as the source language.
    for index, text in df[src_col].items():
        if is_language(text):
            df.at[index, tgt_col] = translate_text(text, model, tokenizer)

    return df

def main():
    """Translate every sheet of INPUTFILE and write them to COMBINED_FILE.

    Steps:
        1. Load the Helsinki-NLP opus-mt model and tokenizer for
           SRC_LANG -> TGT_LANG.
        2. Translate each sheet and save it as ``<sheet name>.xlsx`` in
           ``output_dir``.
        3. Collect the per-sheet files into one workbook (COMBINED_FILE),
           one sheet per input sheet.
        4. Delete the per-sheet files and print the elapsed time, measured
           from the module-level ``start_time``.
    """
    # Create the target folders up front so that a missing directory fails
    # here and not after all sheets have been translated.
    os.makedirs(output_dir, exist_ok=True)
    combined_dir = os.path.dirname(COMBINED_FILE)
    if combined_dir:
        os.makedirs(combined_dir, exist_ok=True)

    print("choosing model")
    model_name = f'Helsinki-NLP/opus-mt-{SRC_LANG}-{TGT_LANG}'
    model = MarianMTModel.from_pretrained(model_name).to(device)
    tokenizer = MarianTokenizer.from_pretrained(model_name)

    # The context manager closes the workbook handle; the opened workbook is
    # reused for every sheet instead of re-opening the file each time.
    with pd.ExcelFile(INPUTFILE) as input_xlsx:
        print("xls read")
        sheet_names = list(input_xlsx.sheet_names)

        # Process each sheet
        for sheet_name in sheet_names:
            print(sheet_name)
            df = pd.read_excel(input_xlsx, sheet_name=sheet_name)
            translated_df = translate_and_add_column(df, model, tokenizer)

            # Save to temporary directory
            output_file = os.path.join(output_dir, f'{sheet_name}.xlsx')
            translated_df.to_excel(output_file, index=False)

    # Combine files
    combined_file = COMBINED_FILE

    print("combining")
    with pd.ExcelWriter(combined_file, engine='openpyxl') as writer:
        for sheet_name in sheet_names:
            input_file_path = os.path.join(output_dir, f'{sheet_name}.xlsx')
            df = pd.read_excel(input_file_path)
            df.to_excel(writer, sheet_name=sheet_name, index=False)
    print(f"Translated data saved to {combined_file}")

    # Cleanup - remove temporary files
    print("removing tmp files")
    for sheet_name in sheet_names:
        os.remove(os.path.join(output_dir, f'{sheet_name}.xlsx'))

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Elapsed Time: {elapsed_time/60:.2f} minutes")

# Start the translation run only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()