In [None]:
#transcription to file

In [7]:
from docx import Document
import pandas as pd
import os
import re
from glob import glob

# === CONFIG ===
folder_path = "/Users/olga/Olga's workspace/Politeness Study/Bot Transcripts"
metadata_path = "/Users/olga/Olga's workspace/Politeness Study/Participants_list.csv"  
output_path = "/Users/olga/Olga's workspace/Politeness Study/merged_transcripts.csv"
output_path2 = "/Users/olga/Olga's workspace/Politeness Study/merged_translated_transcripts.csv"

In [2]:
metadata_df = pd.read_csv(metadata_path)
metadata_df["Coding"] = metadata_df["Coding"].astype(str)

# Drop rows where "Initials" is NaN
metadata_df = metadata_df.dropna(subset=["Initials"])

# Drop unwanted column
if "First Name" in metadata_df.columns:
    metadata_df = metadata_df.drop(columns=["First Name"])

# Keep only columns up to (and including) "Order of tasks"
if "Order of tasks" in metadata_df.columns:
    order_idx = metadata_df.columns.get_loc("Order of tasks")
    metadata_df = metadata_df.iloc[:, :order_idx + 1]
metadata_df

Unnamed: 0,Initials,Coding,Birth month,Birth Year,Mother tongue,Sex,Date of experiment,Experimenter,Place experiment,Age at experiment,Condition,Language of experiment,Order of tasks
0,CB,PAIK_CB_bot,3.0,2013.0,french,m,4/19/2025,Olga,Touquet,12.14,T,French,LC
1,EL,PAIK_EL_bot,5.0,2014.0,french / english,m,5/7/2025,Paco,Neuchâtel,11.04,T,English,LC
2,MG,PAIK_MG_bot,5.0,2014.0,french,m,4/20/2025,Olga,Versailles,11.04,C,French,LC
3,CA,PAIK_CA_bot,4.0,2015.0,english / french,f,4/20/2025,Olga,Versailles,10.12,T,English,LC
4,PA,PAIK_PA_bot,11.0,2011.0,english / french,m,4/20/2025,Olga,Versailles,13.53,C,English,LC
5,AG,PAIK_AG_bot,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC
6,ER,PAIK_ER_bot,8.0,2011.0,french,f,4/30/2025,Olga,Neuchâtel,13.79,C,French,LC
8,VB,PAIK_VB_bot,5.0,2015.0,french,m,4/19/2025,Olga,Touquet,9.98,T,French,LC
12,MV,PAIK_MV_bot,5.0,2011.0,french,f,4/19/2025,Olga,Serrieres,13.98,T,French,CL
13,VV,PAIK_VV_bot,3.0,2013.0,french,f,4/19/2025,Olga,Serrieres,12.14,C,French,CL


In [3]:
# === INITIALIZE ===
all_rows = []
task_type = None

# === LOOP OVER DOCX FILES ===
docx_files = glob(os.path.join(folder_path, "*.docx"))
speaker_pattern = re.compile(r'^(You said|ChatGPT said):$')
task_pattern = re.compile(r'^#\s*(Information_task|Learning_task|Creative_task)$')

for file_path in docx_files:
    filename = os.path.basename(file_path)
    doc = Document(file_path)

    paragraphs = doc.paragraphs
    if not paragraphs:
        continue
    
    # Skip the first paragraph (the filename line)
    paragraphs = paragraphs[1:]
    
    interaction_num = 0
    current_speaker = None
    sentence_num = 0
    ignore = False
    task_type = None

    for para in paragraphs:
        text = para.text.strip()

        if text == "#ignore until ##":
            ignore = True
            continue
        elif text == "##":
            ignore = False
            continue
        if ignore or not text:
            continue

        # Detect task marker
        task_match = task_pattern.match(text)
        if task_match:
            task_type = task_match.group(1).replace("_task", "")  # Optional: clean up label
            continue

        match = speaker_pattern.match(text)
        if match:
            current_speaker = match.group(1).replace(" said", "")
            if current_speaker == "You":
                interaction_num += 1
            sentence_num = 0
        else:
            sentences = re.split(r'(?<=[.!?])\s+', text)
            for sentence in sentences:
                sentence = sentence.strip()
                if sentence:
                    sentence_num += 1
                    all_rows.append([
                        filename,
                        filename.replace(".docx", ""),
                        interaction_num,
                        current_speaker,
                        sentence_num,
                        sentence,
                        task_type
                    ])

# === CREATE TRANSCRIPT DATAFRAME ===
df = pd.DataFrame(all_rows, columns=[
    "filename", "filename_base", "interaction_number", "speaker", "sentence_number", "text", "task_type"
])

# === LOAD AND MERGE METADATA ===
metadata_df["Coding"] = metadata_df["Coding"].astype(str)

# Merge on filename base
merged_df = df.merge(metadata_df, left_on="filename_base", right_on="Coding", how="left")
merged_df.drop(columns=["filename_base", "Coding"], inplace=True)

# === SAVE OUTPUT ===
merged_df.to_csv(output_path, index=False)
print(f"Merged transcript saved to:\n{output_path}")
merged_df

Merged transcript saved to:
/Users/olga/Olga's workspace/Politeness Study/merged_transcripts.csv


Unnamed: 0,filename,interaction_number,speaker,sentence_number,text,task_type,Initials,Birth month,Birth Year,Mother tongue,Sex,Date of experiment,Experimenter,Place experiment,Age at experiment,Condition,Language of experiment,Order of tasks
0,PAIK_PA_bot.docx,1,You,1,How do the Reasoning Smogs work?,Information,PA,11.0,2011.0,english / french,m,4/20/2025,Olga,Versailles,13.53,C,English,LC
1,PAIK_PA_bot.docx,1,You,2,I'm done.,Information,PA,11.0,2011.0,english / french,m,4/20/2025,Olga,Versailles,13.53,C,English,LC
2,PAIK_PA_bot.docx,1,ChatGPT,1,The Reasoning Smogs work by using tricky state...,Information,PA,11.0,2011.0,english / french,m,4/20/2025,Olga,Versailles,13.53,C,English,LC
3,PAIK_PA_bot.docx,1,ChatGPT,2,They’re meant to test if someone can figure ou...,Information,PA,11.0,2011.0,english / french,m,4/20/2025,Olga,Versailles,13.53,C,English,LC
4,PAIK_PA_bot.docx,1,ChatGPT,3,Each Smog gives a reason that sounds okay at f...,Information,PA,11.0,2011.0,english / french,m,4/20/2025,Olga,Versailles,13.53,C,English,LC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1628,PAIK_AG_bot.docx,30,ChatGPT,6,Est-ce que ça t’intéresserait ?,Learning,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC
1629,PAIK_AG_bot.docx,31,You,1,"Oops, Sidney, where's your ChatGPT?",Learning,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC
1630,PAIK_AG_bot.docx,31,ChatGPT,1,"Ah, I see we switched to English now!",Learning,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC
1631,PAIK_AG_bot.docx,31,ChatGPT,2,"When you say “Sidney, where’s your ChatGPT?”—c...",Learning,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC


In [21]:
# === INITIALIZE ===
all_rows = []
task_type = None

# === LOOP OVER DOCX FILES ===
docx_files = glob(os.path.join(folder_path, "*.docx"))
speaker_pattern = re.compile(r'^(You said|ChatGPT said):$')
task_pattern = re.compile(r'^#\s*(Information_task|Learning_task|Creative_task)$')

for file_path in docx_files:
    filename = os.path.basename(file_path)
    doc = Document(file_path)

    paragraphs = doc.paragraphs
    if not paragraphs:
        continue
    
    # Skip the first paragraph (the filename line)
    paragraphs = paragraphs[1:]
    
    interaction_num = 0
    current_speaker = None
    sentence_num = 0
    ignore = False
    task_type = None

    for para in paragraphs:
        text = para.text.strip()

        if text == "#ignore until ##":
            ignore = True
            continue
        elif text == "##":
            ignore = False
            continue
        if ignore or not text:
            continue

        # Detect task marker
        task_match = task_pattern.match(text)
        if task_match:
            task_type = task_match.group(1).replace("_task", "")  # Optional: clean up label
            continue

        match = speaker_pattern.match(text)
        if match:
            current_speaker = match.group(1).replace(" said", "")
            if current_speaker == "You":
                interaction_num += 1
            sentence_num = 0
        else:
            sentences = re.split(r'(?<=[.!?])\s+', text)
            for sentence in sentences:
                sentence = sentence.strip()
                if sentence:
                    sentence_num += 1
                    all_rows.append([
                        filename,
                        filename.replace(".docx", ""),
                        interaction_num,
                        current_speaker,
                        sentence_num,
                        sentence,
                        task_type
                    ])

# === CREATE TRANSCRIPT DATAFRAME ===
df = pd.DataFrame(all_rows, columns=[
    "filename", "filename_base", "interaction_number", "speaker", "sentence_number", "text", "task_type"
])

# === LOAD AND MERGE METADATA ===
metadata_df["Coding"] = metadata_df["Coding"].astype(str)

# Merge on filename base
merged_df = df.merge(metadata_df, left_on="filename_base", right_on="Coding", how="left")
merged_df.drop(columns=["filename_base", "Coding"], inplace=True)

# === SAVE OUTPUT ===
merged_df.to_csv(output_path, index=False)
print(f"Merged transcript saved to:\n{output_path}")
merged_df

Merged transcript saved to:
/Users/olga/Olga's workspace/Politeness Study/merged_transcripts.csv


Unnamed: 0,filename,interaction_number,speaker,sentence_number,text,task_type,Initials,Birth month,Birth Year,Mother tongue,Sex,Date of experiment,Experimenter,Place experiment,Age at experiment,Condition,Language of experiment,Order of tasks
0,PAIK_PA_bot.docx,1,You,1,How do the Reasoning Smogs work?,Information,PA,11.0,2011.0,english / french,m,4/20/2025,Olga,Versailles,13.53,C,English,LC
1,PAIK_PA_bot.docx,1,You,2,I'm done.,Information,PA,11.0,2011.0,english / french,m,4/20/2025,Olga,Versailles,13.53,C,English,LC
2,PAIK_PA_bot.docx,1,ChatGPT,1,The Reasoning Smogs work by using tricky state...,Information,PA,11.0,2011.0,english / french,m,4/20/2025,Olga,Versailles,13.53,C,English,LC
3,PAIK_PA_bot.docx,1,ChatGPT,2,They’re meant to test if someone can figure ou...,Information,PA,11.0,2011.0,english / french,m,4/20/2025,Olga,Versailles,13.53,C,English,LC
4,PAIK_PA_bot.docx,1,ChatGPT,3,Each Smog gives a reason that sounds okay at f...,Information,PA,11.0,2011.0,english / french,m,4/20/2025,Olga,Versailles,13.53,C,English,LC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1628,PAIK_AG_bot.docx,30,ChatGPT,6,Est-ce que ça t’intéresserait ?,Learning,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC
1629,PAIK_AG_bot.docx,31,You,1,"Oops, Sidney, where's your ChatGPT?",Learning,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC
1630,PAIK_AG_bot.docx,31,ChatGPT,1,"Ah, I see we switched to English now!",Learning,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC
1631,PAIK_AG_bot.docx,31,ChatGPT,2,"When you say “Sidney, where’s your ChatGPT?”—c...",Learning,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC


In [24]:
!pip install deep-translator

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
Installing collected packages: deep-translator
Successfully installed deep-translator-1.11.4


In [4]:
from deep_translator import GoogleTranslator

def translate_if_needed(text, lang):
    if lang.lower().startswith("english"):
        return text
    try:
        return GoogleTranslator(source='auto', target='en').translate(text)
    except Exception:
        return text

In [5]:
df=merged_df
df["text_en"] = df.apply(lambda row: translate_if_needed(row["text"], row["Language of experiment"]), axis=1)
df

Unnamed: 0,filename,interaction_number,speaker,sentence_number,text,task_type,Initials,Birth month,Birth Year,Mother tongue,Sex,Date of experiment,Experimenter,Place experiment,Age at experiment,Condition,Language of experiment,Order of tasks,text_en
0,PAIK_PA_bot.docx,1,You,1,How do the Reasoning Smogs work?,Information,PA,11.0,2011.0,english / french,m,4/20/2025,Olga,Versailles,13.53,C,English,LC,How do the Reasoning Smogs work?
1,PAIK_PA_bot.docx,1,You,2,I'm done.,Information,PA,11.0,2011.0,english / french,m,4/20/2025,Olga,Versailles,13.53,C,English,LC,I'm done.
2,PAIK_PA_bot.docx,1,ChatGPT,1,The Reasoning Smogs work by using tricky state...,Information,PA,11.0,2011.0,english / french,m,4/20/2025,Olga,Versailles,13.53,C,English,LC,The Reasoning Smogs work by using tricky state...
3,PAIK_PA_bot.docx,1,ChatGPT,2,They’re meant to test if someone can figure ou...,Information,PA,11.0,2011.0,english / french,m,4/20/2025,Olga,Versailles,13.53,C,English,LC,They’re meant to test if someone can figure ou...
4,PAIK_PA_bot.docx,1,ChatGPT,3,Each Smog gives a reason that sounds okay at f...,Information,PA,11.0,2011.0,english / french,m,4/20/2025,Olga,Versailles,13.53,C,English,LC,Each Smog gives a reason that sounds okay at f...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1628,PAIK_AG_bot.docx,30,ChatGPT,6,Est-ce que ça t’intéresserait ?,Learning,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC,Would that interest you?
1629,PAIK_AG_bot.docx,31,You,1,"Oops, Sidney, where's your ChatGPT?",Learning,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC,"Oops, Sidney, where's your ChatGPT?"
1630,PAIK_AG_bot.docx,31,ChatGPT,1,"Ah, I see we switched to English now!",Learning,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC,"Ah, I see we switched to English now!"
1631,PAIK_AG_bot.docx,31,ChatGPT,2,"When you say “Sidney, where’s your ChatGPT?”—c...",Learning,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC,"When you say “Sidney, where’s your ChatGPT?”—c..."


In [29]:
df[df["filename"] == "PAIK_AG_bot.docx"]

Unnamed: 0,filename,interaction_number,speaker,sentence_number,text,task_type,Initials,Birth month,Birth Year,Mother tongue,Sex,Date of experiment,Experimenter,Place experiment,Age at experiment,Condition,Language of experiment,Order of tasks,text_en
1356,PAIK_AG_bot.docx,1,You,1,"D'abord, j'ai une question.",Information,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC,"First, I have a question."
1357,PAIK_AG_bot.docx,1,You,2,Qu'est-ce qu'il y a ?,Information,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC,What's it?
1358,PAIK_AG_bot.docx,1,You,3,J'ai fini.,Information,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC,I finished.
1359,PAIK_AG_bot.docx,1,ChatGPT,1,"D'accord, merci de me le dire si gentiment.",Information,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC,"Okay, thank you for telling me so kindly."
1360,PAIK_AG_bot.docx,1,ChatGPT,2,"Pour répondre à ta question, est-ce que tu pou...",Information,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC,"To answer your question, could you tell me wha..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1628,PAIK_AG_bot.docx,30,ChatGPT,6,Est-ce que ça t’intéresserait ?,Learning,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC,Would that interest you?
1629,PAIK_AG_bot.docx,31,You,1,"Oops, Sidney, where's your ChatGPT?",Learning,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC,"Oops, Sidney, where's your ChatGPT?"
1630,PAIK_AG_bot.docx,31,ChatGPT,1,"Ah, I see we switched to English now!",Learning,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC,"Ah, I see we switched to English now!"
1631,PAIK_AG_bot.docx,31,ChatGPT,2,"When you say “Sidney, where’s your ChatGPT?”—c...",Learning,AG,6.0,2011.0,french,m,4/20/2025,Olga,Versailles,13.95,T,French,LC,"When you say “Sidney, where’s your ChatGPT?”—c..."


In [9]:
df.to_csv(output_path2, index=False)
print(f"Data saved to:\n{output_path2}")

Data saved to:
/Users/olga/Olga's workspace/Politeness Study/merged_translated_transcripts.csv
