In [18]:
import os
import pandas as pd
import re
# Load every CHAT transcript (.cha) in the folder and keep only the main-tier
# utterances of the interviewer (*INV:) and the participant (*PAR:).
# Result: todas_interviews[filename] -> list of "INV: ..." / "PAR: ..." strings,
# in file order. Files that end up with no utterances are left out.
todas_interviews = {}
folder_path = './transcripts/Discourse-UWO/Baseline'
for filename in os.listdir(folder_path):
    # 007.cha is explicitly excluded (the reason is not recorded in this cell).
    if not filename.endswith('.cha') or filename in ["007.cha"]:
        continue

    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
        # The first 10 lines are always skipped.
        content = file.readlines()[10:]

    # Drop everything that precedes the first remaining '@' line.
    first_header = next(
        (i for i, line in enumerate(content) if line.startswith('@')),
        len(content),
    )
    content = content[first_header:]

    # Remove blank lines and '@' lines, trimming surrounding whitespace.
    content = [line.strip() for line in content if line.strip() and not line.startswith('@')]
    # Replace tabs with single spaces.
    content = [line.replace('\t', ' ') for line in content]
    # Remove everything from the first \x15 control character to the end of the
    # line. The character is written as an explicit escape: a pattern of just
    # '.*$' would blank every line.
    # NOTE(review): \x15 presumably delimits the trailing time-alignment marker
    # (e.g. 1234_5678) in these files — confirm against the raw transcripts.
    content = [re.sub(r'\x15.*$', '', line) for line in content]
    # Keep only interviewer/participant lines and drop the leading '*'.
    # NOTE(review): any line that does not itself start with '*INV:' or '*PAR:'
    # is discarded, which would include continuation lines of a wrapped
    # utterance if the transcripts contain any — confirm.
    content = [line[1:].strip() for line in content if line.startswith(('*INV:', '*PAR:'))]
    if not content:
        continue

    todas_interviews[filename] = content

In [19]:
# Parse every "SPK: utterance" string into a (speaker, utterance) pair:
# the first three characters are the speaker code, the text starts after the colon.
todas_processed = {
    nombre: [(linea[:3], linea[4:].strip()) for linea in todas_interviews[nombre]]
    for nombre in todas_interviews
}
# Show the first five pairs of the first transcript as a sanity check.
primer_archivo = list(todas_processed.keys())[0]
print(f"Ejemplo: {todas_processed[primer_archivo][:5]}")

Ejemplo: [('INV', 'so to begin I would like you to talk about a few things related to your daily life .'), ('INV', "&-um and you don't have to use anyone's name ."), ('INV', 'so describing anything going forward .'), ('INV', "so the first thing I'd like you to talk about &-um is can you tell me a bit about yourself ."), ('PAR', "&-uh try to be outgoing &-uh knowledgeable about what's going on in the world .")]


In [20]:
# Flatten the per-file (speaker, utterance) pairs into one row per utterance.
prep_df = [
    (file, hablante, enunciado)
    for file, intervenciones in todas_processed.items()
    for hablante, enunciado in intervenciones
]
df = pd.DataFrame(prep_df, columns=['filename', 'role', 'text'])
df.head(5)

Unnamed: 0,filename,role,text
0,102.cha,INV,so to begin I would like you to talk about a f...
1,102.cha,INV,&-um and you don't have to use anyone's name .
2,102.cha,INV,so describing anything going forward .
3,102.cha,INV,so the first thing I'd like you to talk about ...
4,102.cha,PAR,&-uh try to be outgoing &-uh knowledgeable abo...


----
### Agrupando las intervenciones consecutivas del mismo hablante

In [21]:
# Merge each run of consecutive utterances by the same speaker into one turn.
# Utterances inside a turn are joined with "\n ".
todas_agrupadas = {}
for file, intervenciones in todas_processed.items():
    if not intervenciones:
        # Nothing to merge: report the file and store an empty list of turns.
        print(file)
        todas_agrupadas[file] = []
        continue

    # Each entry is (speaker, [utterances of the current run]).
    turnos = []
    for speaker, text in intervenciones:
        if turnos and turnos[-1][0] == speaker:
            turnos[-1][1].append(text)
        else:
            turnos.append((speaker, [text]))

    todas_agrupadas[file] = [(hablante, "\n ".join(partes)) for hablante, partes in turnos]

# todas_agrupadas

In [22]:
# Flatten the merged turns into (filename, role, text) rows, one row per turn.
prep_df = [
    (file, hablante, turno)
    for file, intervenciones in todas_agrupadas.items()
    for hablante, turno in intervenciones
]


In [23]:
# One row per speaker turn, built from the rows prepared in the previous cell.
columnas = ['filename', 'role', 'text']
df = pd.DataFrame(data=prep_df, columns=columnas)
df

Unnamed: 0,filename,role,text
0,102.cha,INV,so to begin I would like you to talk about a f...
1,102.cha,PAR,&-uh try to be outgoing &-uh knowledgeable abo...
2,102.cha,INV,nice .\n &-um do you wanna tell me about like ...
3,102.cha,PAR,&-uh so automotive technology to get my three ...
4,102.cha,INV,nice .\n that's awesome .\n how what do you wa...
...,...,...,...
17783,028.cha,INV,and then when you're ready to tell me the stor...
17784,028.cha,PAR,okay .
17785,028.cha,INV,perfect go ahead.
17786,028.cha,PAR,&-um so there was a thirsty bird .\n and she s...


In [24]:
# Pair every participant (PAR) turn with the most recent interviewer (INV) turn
# that precedes it in the same transcript.
#
# Instead of re-filtering the whole frame for every PAR row, keep only the INV
# texts, forward-fill them within each file, and read the filled value at the
# PAR rows. A PAR row holds no INV text itself, so the filled value there is
# the last INV turn strictly before it.
texto_inv = df['text'].where(df['role'] == 'INV')
ultima_pregunta = texto_inv.groupby(df['filename'], sort=False).ffill()

es_par = df['role'] == 'PAR'
# Build a fresh frame with a 0..n-1 index and the final column names.
# PAR turns with no earlier INV turn in their file get question = None.
df_par = pd.DataFrame({
    'filename': df.loc[es_par, 'filename'].tolist(),
    'question': [q if isinstance(q, str) else None for q in ultima_pregunta[es_par]],
    'answer': df.loc[es_par, 'text'].tolist(),
})

In [25]:
# Template text per path, so the prompt file is read once rather than once per
# row. Re-running this cell resets the cache and picks up edits to the file.
_PROMPT_TEMPLATES = {}


def add_prompt(sample, prompt_template=None, template_path='./data/test.prompt'):
    """Render the training prompt for one question/answer record.

    Args:
        sample: Mapping (dict or DataFrame row) with "question" and "answer".
            A "question" of None is rendered as an empty string instead of the
            literal text "None".
        prompt_template: Optional template string with ``{input}`` and
            ``{output}`` placeholders. When omitted, the template is read from
            ``template_path``.
        template_path: Path of the template file used when ``prompt_template``
            is not given.

    Returns:
        The template with ``{input}`` replaced by the question and ``{output}``
        replaced by the answer wrapped as ``<start> answer <end>``.
    """
    if prompt_template is None:
        if template_path not in _PROMPT_TEMPLATES:
            # Explicit encoding, matching the other file reads/writes in this notebook.
            with open(template_path, 'r', encoding='utf-8') as f:
                _PROMPT_TEMPLATES[template_path] = f.read()
        prompt_template = _PROMPT_TEMPLATES[template_path]

    question = sample["question"]
    if question is None:
        question = ""

    return prompt_template.format(input=question, output='<start> ' + sample["answer"] + ' <end>')
# Render the prompt for every question/answer row and store it in 'text',
# then display five random rows as a spot check.
prompts = df_par.apply(add_prompt, axis=1)
df_par['text'] = prompts
df_par.sample(5)

Unnamed: 0,filename,question,answer,text
4540,092.cha,yeah .,so that animals can be well fought well thinki...,You are a patient that has gone to do an inter...
8452,108.cha,and has what suggestions were made or what wor...,&-um I think they put me on the treatment for ...,You are a patient that has gone to do an inter...
5213,026.cha,and your definition of important can be whatev...,mhm .,You are a patient that has gone to do an inter...
6903,089.cha,&-um so again you don't have to use anyone's n...,well I'm an introvert .\n I like to keep to my...,You are a patient that has gone to do an inter...
1135,014.cha,mhm .,and then they kind of calm down whether that's...,You are a patient that has gone to do an inter...


In [27]:
import json
# Export the question / answer / prompt columns as a pretty-printed UTF-8 JSON
# array with one object per row.
ruta_salida = './data/discourse_qa.json'
data = df_par[['question', 'answer', 'text']].to_dict(orient='records')
with open(ruta_salida, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)
print(f"Wrote {ruta_salida} with {len(data)} records")

Wrote ./data/discourse_qa.json with 8834 records


In [10]:
# Word counts (whitespace-separated tokens) for the 'text' and 'question' columns.
# A missing or empty question counts as 0 words.
# NOTE(review): this cell's execution count (10) is lower than the cells above,
# so it was last run out of order. On a top-to-bottom run 'text' holds the full
# rendered prompt, not just the answer — confirm that is what lenP should measure.
df_par['lenP'] = df_par['text'].map(lambda texto: len(texto.split()))
df_par['lenI'] = df_par['question'].map(lambda pregunta: 0 if not pregunta else len(pregunta.split()))