# In [None]:
import json
import spacy
import csv

# Load the spaCy "en_core_web_sm" pipeline once at module level;
# split_into_sentences() reuses this object for every text it segments.
nlp = spacy.load("en_core_web_sm")

def load_dataset(file_path):
    """Read the JSON file at ``file_path`` and return its parsed content.

    Args:
        file_path: Path of the JSON file; it is opened as UTF-8 text.

    Returns:
        The Python object produced by ``json.load`` for that file.
    """
    with open(file_path, encoding='utf-8') as handle:
        return json.load(handle)

def split_into_sentences(text):
    """Segment ``text`` into sentences with the module-level spaCy pipeline.

    Args:
        text: The string to segment.

    Returns:
        A list holding the whitespace-stripped text of every sentence that
        ``nlp`` detects, in document order.
    """
    stripped_sentences = []
    for span in nlp(text).sents:
        stripped_sentences.append(span.text.strip())
    return stripped_sentences

def process_reccon_data(data):
    """Flatten conversations into one record per sentence of each utterance.

    Args:
        data: Mapping of conversation id to a list of utterance lists. Each
            utterance is read with ``.get`` for the keys "utterance",
            "emotion", "expanded emotion cause evidence",
            "expanded emotion cause span", "speaker", "turn" and "type".

    Returns:
        A list of dicts, one per sentence. Each carries the utterance-level
        fields, the 1-based ``clause_number`` and the sentence text as
        ``clause``. "expanded emotion cause span" is ``[sentence]`` when at
        least one of the utterance's cause spans is a substring of that
        sentence, and ``[]`` otherwise.
    """
    rows = []

    for conv_id, conversation in data.items():
        for group in conversation:
            for utt in group:
                # Utterance-level fields, with the same fallbacks for
                # missing keys on every record.
                text = utt.get("utterance", "")
                emotion = utt.get("emotion", None)
                evidence = utt.get("expanded emotion cause evidence", [])
                spans = utt.get("expanded emotion cause span", [])
                speaker = utt.get("speaker", "")
                turn = utt.get("turn", 0)
                utt_type = utt.get("type", [])

                clauses = split_into_sentences(text)

                # Sentences that contain at least one annotated cause span.
                causal_clauses = {
                    clause
                    for span in spans
                    for clause in clauses
                    if span in clause
                }

                rows.extend(
                    {
                        "conv_id": conv_id,
                        "turn": turn,
                        "speaker": speaker,
                        "clause_number": number,
                        "clause": clause,
                        "emotion": emotion,
                        "expanded emotion cause evidence": evidence,
                        "expanded emotion cause span": (
                            [clause] if clause in causal_clauses else []
                        ),
                        "type": utt_type,
                    }
                    for number, clause in enumerate(clauses, start=1)
                )

    return rows

# Input JSON file and the CSV file the per-sentence rows are written to.
dataset_path = "dailydialog_train.json"
csv_output_path = "processed_train.csv"

# Load the conversations and flatten them into one row per sentence.
dataset = load_dataset(dataset_path)
processed_data = process_reccon_data(dataset)

# CSV column order; these are the keys of every row dict built above.
csv_columns = [
    "conv_id",
    "turn",
    "speaker",
    "clause_number",
    "clause",
    "emotion",
    "expanded emotion cause evidence",
    "expanded emotion cause span",
    "type",
]

# newline='' lets the csv module control line endings itself.
with open(csv_output_path, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=csv_columns)
    writer.writeheader()
    for row in processed_data:
        writer.writerow(row)

print(f"Processed dataset saved to {csv_output_path}")

