# This script translates transcripts of video simulation role-playing exercises from Dutch to English

In [None]:
# Install any libraries that are not yet available (adjust as needed) via the Anaconda prompt before running the cells below

In [None]:
# Import necessary libraries
import openai          # OpenAI API for AI model access
from openai import OpenAI  # The main client class for interacting with OpenAI's API
import requests        # HTTP library for making web requests (API calls, downloading data, etc.)
import re              # Regular expressions
import os              # Operating system interface
import pandas as pd    # DataFrames for tabular data manipulation and analysis
import numpy as np     # Numerical computing
import openpyxl        # Import excel files
import time            # Time-related functions
import random          # Random number generation
import getpass         # pop-up window for API completion
from tqdm import tqdm  # for showing progress bars

# set seed for reproducibility
# NOTE(review): this seeds only Python's built-in `random` module. It does not
# seed NumPy, and the API request made later in this file passes no seed, so the
# translations themselves are not governed by this value.
random.seed(1337)  # Sets seed for Python's random module

In [None]:
# Load data
# Read the pilot workbook; the file name and sheet name must match the local copy exactly.
test_text = pd.read_excel("anonimized_data_10_partipants.xlsx", sheet_name="data_short")
# Keep only the first few rows for pilot data analysis purposes. The explicit
# .copy() turns the slice into an independent DataFrame, so adding the
# translation columns later does not raise pandas' SettingWithCopyWarning.
test_text = test_text.iloc[:4].copy()
test_text.head()

# # Text data
# text_data = pd.read_csv("data_gitp_text.csv")

Unnamed: 0,DeelnemerNummer,VideoTitel,Beoordeling.607,Beoordeling.608,Beoordeling.609,Beoordeling.610,Beoordeling.all
0,1604031,GITP Videosimulatie Leidinggeven aan veranderi...,[naam] dank je wel dat je even tijd hebt genom...,"Oké, zoals je zelf aangegeven zijn met wat ver...",Dank je wel voor je eerlijkheid. Je landen adv...,"Even kijken. Nou allereerst nogmaals dank, dan...",[naam] dank je wel dat je even tijd hebt genom...
1,1609194,GITP Videosimulatie Leidinggeven aan veranderi...,"Nou, [naam] ik wil toch eventjes met je praten...","Ja, ik begrijp je heel goed. Alleen deze dinge...","Ja, ik begrijp je heel goed, [naam] alleen. Ja...","Wat we op het moment hebben besproken, is dat ...","Nou, [naam] ik wil toch eventjes met je praten..."
2,1604331,GITP Videosimulatie Leidinggeven aan teams 2,Hé dan goed je te zien. Ik heb je vanochtend w...,"Goh ja, ik schrik ervan dat jij zegt dat je he...","Gedaan wat naar om te horen? Ja, ik kan me voo...","Nou ja, dan ik, ik vroeg net hoe ik jou zou ku...",Hé dan goed je te zien. Ik heb je vanochtend w...
3,1618101,GITP Videosimulatie Leidinggeven aan veranderi...,"Oh, ja, jolanda ja, goed dat je even wilde kom...","Oh, ja, nu doe ik het. Ja, [naam] ik begrijp d...","Ja, [naam] ik begrijp dat het lastig is. Het n...","Ja, landen nou, ik verwacht dat je even gaat k...","Oh, ja, jolanda ja, goed dat je even wilde kom..."


# Prompt

In [None]:
# Prompt for Dutch-to-English translation
# The template contains a single `{text}` placeholder that get_translation()
# fills in with str.format(). The instructions ask the model to keep the
# disfluencies and broken phrasing of the transcript instead of tidying them up.
translation_prompt = """
Translate the following text from Dutch to English.

Context:
- The text comes from a spoken role-playing exercise.
- It was automatically transcribed, so it may contain filler words, hesitation markers, partial sentences, and transcription glitches.

Translation Requirements:
- Convey the intended meaning in fluent English (do NOT translate word-by-word).
- Preserve the *structure* of the utterances: sentence breaks, pacing, repetitions, interruptions, and unfinished thoughts.
- Keep all disfluencies (e.g., "uh", "hm", false starts, repeated words).
- If the transcript contains unclear or broken phrases, translate them as equally unclear or broken in English, without fixing them.
- Do NOT correct grammar, do NOT rewrite into cleaner English, do NOT summarize, and do NOT interpret beyond what is explicitly said.

In other words: translate meaningfully, but keep the *messiness* of spoken language intact.

Text to translate:
```{text}```
"""


# Open AI

In [None]:
# Import OpenAI API key
# The key is requested interactively with getpass instead of being hard-coded,
# so it is never written into this file.
gpt_api_key = getpass.getpass("Enter API key for OpenAI: ") # type or paste the API key at the prompt
# Initialize the OpenAI client with your API key; this `client` object is what
# get_translation() receives as its first argument.
client = OpenAI(api_key=gpt_api_key)

## Function to translate text from Dutch to English

In [43]:
# Set default parameters
model = "gpt-4.1-mini" # manually change the model
temperature = 0      # manually change the temperature


def get_translation(client, text, prompt_template, model=model,
                    temperature=temperature, max_completion_tokens=10000):
    """Translate one piece of text with a single chat-completion request.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (the OpenAI
            client created earlier in this file).
        text: The text to translate. Surrounding whitespace is removed before
            it is inserted into the prompt.
        prompt_template: Template string containing a ``{text}`` placeholder.
        model: Model name sent with the request. Defaults to the module-level
            ``model`` as it was when this function was defined.
        temperature: Sampling temperature sent with the request. Defaults to
            the module-level ``temperature`` as it was when this function was
            defined (low values give more deterministic output).
        max_completion_tokens: Upper bound on the length of the reply.

    Returns:
        The reply text with surrounding whitespace removed, or ``None`` when
        there is nothing to translate, the request fails, or the reply carries
        no text content.
    """
    # Nothing to translate (missing value, non-string cell, blank string):
    # skip the request entirely instead of failing inside the try block.
    if not isinstance(text, str) or not text.strip():
        return None

    try:
        full_prompt = prompt_template.format(text=text.strip())

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a translation engine."},
                {"role": "user", "content": full_prompt},
            ],
            temperature=temperature,
            max_completion_tokens=max_completion_tokens,
        )
        content = response.choices[0].message.content

    except Exception as e:
        # Deliberate best-effort boundary: one failed request must not abort a
        # long batch, so the error is reported and the caller receives None.
        print(f"Translation error: {e}")
        return None

    # A reply can come back without text content; report it explicitly rather
    # than letting `.strip()` fail on None.
    if content is None:
        print("Translation error: the model returned no text content")
        return None

    return content.strip()


# Text translation

In [None]:
# Set parameters
columns_to_translate = ["Beoordeling.607", "Beoordeling.608", "Beoordeling.609", "Beoordeling.610"]
# manually CHANGE the dataframe to be analyzed.
# NOTE: this is an alias, not a copy, so the new columns are written into the
# original DataFrame object as well.
df = test_text

# Add empty placeholder columns for the translations. They start out as None
# (not as a copy of the Dutch text), so a cell that was skipped, failed, or was
# never reached cannot be mistaken for a finished translation.
for col in columns_to_translate:
    df[f"en_{col}"] = None

# Translation loop
total_rows = len(df)
for row_number, (idx, row) in enumerate(df.iterrows(), start=1):
    for col in columns_to_translate:
        text = row[col]
        # Only non-blank strings are sent to the API; missing values and
        # non-text cells are left as None.
        if isinstance(text, str) and text.strip():
            translated = get_translation(client, text, translation_prompt)
            df.loc[idx, f"en_{col}"] = translated

    # Progress is counted by row position, so it stays correct even when the
    # index labels are not 0..n-1.
    if row_number % 5 == 0:
        print(f"Translated {row_number}/{total_rows} rows")
    time.sleep(0.1)  # short pause between rows to space out the API requests



In [45]:
# Concatenate translated columns into en_Beoordeling.all
translated_columns = [f"en_{col}" for col in columns_to_translate]

# Work on `df`, the frame selected in the translation cell, so this step follows
# along when a different DataFrame is chosen there.
# Missing translations become empty strings and are then left out of the join,
# so a blank cell in the middle does not produce a double space.
translated_parts = df[translated_columns].fillna("").astype(str)
df["en_Beoordeling.all"] = [
    " ".join(part.strip() for part in parts if part.strip())
    for parts in translated_parts.itertuples(index=False, name=None)
]

In [None]:
# Optional: Save results
# Write the working DataFrame `df` (the one selected in the translation cell),
# once as CSV and once as an Excel workbook, without the index column.
df.to_csv('test_text_nl_en.csv', index=False)
df.to_excel('test_text_nl_en.xlsx', index=False)