In [16]:
import json
import pandas as pd

In [20]:
def create_dialog_dataset(raw_data_path):
    """Turn an annotated dialogue JSON file into extractive-QA style examples.

    The file is expected to map each dialogue id to a list of conversations,
    where a conversation is a list of utterance dicts carrying at least the
    keys ``"utterance"`` and ``"emotion"``, and optionally
    ``"expanded emotion cause span"`` and ``"expanded emotion cause evidence"``.

    One example is produced for every utterance whose emotion is not
    ``"neutral"``. The context of an example is the conversation so far
    (including the target utterance), joined with single spaces.

    Args:
        raw_data_path: Path of the JSON annotation file.

    Returns:
        A list of dicts with the keys ``dialogue_id``, ``context``,
        ``question``, ``answer``, ``answer_start``, ``evidence_turn`` and
        ``evidence_utterance``. ``answer`` is ``""`` when no causal span is
        annotated. ``answer_start`` is the character offset of the first
        occurrence of the answer in the context, or ``0`` when the answer is
        empty or cannot be found. ``evidence_turn`` / ``evidence_utterance``
        are ``None`` when no usable evidence turn is annotated.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(raw_data_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    qa_examples = []

    for dialogue_id, dialogues in data.items():
        for conversation in dialogues:
            conversation_history = []
            for utt in conversation:
                conversation_history.append(utt["utterance"])
                if utt["emotion"] == "neutral":
                    continue

                context = " ".join(conversation_history)
                target = utt["utterance"]
                emotion = utt["emotion"]
                question = f"The target utterance is '{target}'. What is the causal span that triggers the emotion {emotion}?"

                # Only the first annotated span is used as the answer.
                causal_spans = utt.get("expanded emotion cause span", [])
                answer_text = causal_spans[0] if causal_spans else ""

                # str.find returns -1 for "not found"; fall back to offset 0.
                answer_start = context.find(answer_text)
                if answer_start == -1:
                    answer_start = 0

                # Reset for every utterance: otherwise an utterance without
                # evidence would either hit an unbound name or silently
                # inherit the evidence turn of the previous utterance.
                evidence_turn = None
                evidence_turns = utt.get("expanded emotion cause evidence", [])
                if evidence_turns:
                    try:
                        evidence_turn = int(evidence_turns[0])
                    except (TypeError, ValueError):
                        # Non-numeric evidence marker: treat as "no turn".
                        evidence_turn = None

                # Evidence turns are 1-based; the explicit lower bound keeps
                # zero/negative values from wrapping around via negative
                # list indexing.
                evidence_utterance = None
                if evidence_turn is not None and 1 <= evidence_turn <= len(conversation_history):
                    evidence_utterance = conversation_history[evidence_turn - 1]

                qa_examples.append({
                    "dialogue_id": dialogue_id,
                    "context": context,
                    "question": question,
                    "answer": answer_text,
                    "answer_start": answer_start,
                    "evidence_turn": evidence_turn,
                    "evidence_utterance": evidence_utterance,
                })

    return qa_examples


# Build the QA examples from the training annotations and preview a few.
raw_data_path = "data/original_annotation/dailydialog_train.json"
qa_examples = create_dialog_dataset(raw_data_path)

# (label printed, key looked up) pairs, in display order.
preview_fields = (
    ("Dialogue ID:", "dialogue_id"),
    ("Context:", "context"),
    ("Question:", "question"),
    ("Answer:", "answer"),
    ("Answer Start:", "answer_start"),
    ("Evidence Turn:", "evidence_turn"),
    ("Evidence Utterance:", "evidence_utterance"),
)

for ex in qa_examples[:3]:
    for label, key in preview_fields:
        print(label, ex[key])
    print("------")
    



Dialogue ID: tr_4466
Context: Hey , you wanna see a movie tomorrow ?
Question: The target utterance is 'Hey , you wanna see a movie tomorrow ?'. What is the causal span that triggers the emotion happiness?
Answer: see a movie tomorrow ?
Answer Start: 16
Evidence Turn: 1
Evidence Utterance: Hey , you wanna see a movie tomorrow ?
------
Dialogue ID: tr_4466
Context: Hey , you wanna see a movie tomorrow ? Sounds like a good plan . What do you want to see ?
Question: The target utterance is 'Sounds like a good plan . What do you want to see ?'. What is the causal span that triggers the emotion happiness?
Answer: see a movie tomorrow ?
Answer Start: 16
Evidence Turn: 1
Evidence Utterance: Hey , you wanna see a movie tomorrow ?
------
Dialogue ID: tr_4466
Context: Hey , you wanna see a movie tomorrow ? Sounds like a good plan . What do you want to see ? How about Legally Blonde . Ah , my girlfriend wanted to see that movie . I have to take her later so I don't want to watch it ahead of time 

In [21]:
# Tabulate the QA examples: one row per example, one column per dict key.
# `df` is reused by a later cell to build the Hugging Face dataset.
df = pd.DataFrame(qa_examples)

# Save the DataFrame to a CSV file in the current working directory;
# index=False leaves the row index out of the file.
df.to_csv("train_dataset.csv", index=False)

print("Saved the training dataset to 'train_dataset.csv'.")

Saved the training dataset to 'train_dataset.csv'.


In [26]:
!pip install datasets

Collecting datasets
  Using cached datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-19.0.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Using cached aiohttp-3.11.14-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Using cached aiohappyeyeballs-2.6.1-py3-none-any.whl.metada

In [34]:
from transformers import AutoTokenizer
from datasets import Dataset

def preprocess_examples(example, tokenizer, max_length=512):
    """Tokenize one QA example and attach token-level answer span labels.

    The context and question are encoded as a sequence pair (context first),
    truncated/padded to ``max_length``. The character span
    ``[answer_start, answer_start + len(answer))`` is then mapped onto token
    indices using the tokenizer's offset mapping, looking at context tokens
    only.

    Args:
        example: Mapping with the keys ``"context"``, ``"question"``,
            ``"answer"`` and ``"answer_start"``.
        tokenizer: Callable tokenizer that supports
            ``return_offsets_mapping=True`` and whose result exposes
            ``sequence_ids()`` (Hugging Face "fast" tokenizers do).
        max_length: Length every encoded pair is truncated/padded to.

    Returns:
        The tokenizer output (without ``offset_mapping``) extended with
        ``start_positions`` and ``end_positions``. Both are ``0`` when the
        example has no answer or when the answer span is not fully present in
        the (possibly truncated) context tokens. If anything raises, the
        problem is printed and an empty dict is returned.
    """
    try:
        # Single tokenizer pass; offsets are needed to map characters to tokens.
        inputs = tokenizer(
            example["context"], example["question"],
            truncation=True, max_length=max_length,
            padding="max_length",
            return_offsets_mapping=True
        )

        offsets = inputs.pop("offset_mapping")
        # None for special/padding tokens, 0 for the context, 1 for the question.
        sequence_ids = inputs.sequence_ids()

        answer_text = example["answer"] or ""
        answer_start = example["answer_start"]
        answer_end = answer_start + len(answer_text)

        start_position, end_position = None, None
        if answer_text:
            for idx, (start, end) in enumerate(offsets):
                # Question offsets are relative to the question string, so
                # comparing them with a context character index would yield
                # bogus matches; only context tokens may carry the answer.
                if sequence_ids[idx] != 0:
                    continue
                if start_position is None and start <= answer_start < end:
                    start_position = idx
                if start < answer_end <= end:
                    end_position = idx
                    break

        # Keep the labels consistent: unless both ends were located, point
        # both at position 0 (the leading special token for BERT/RoBERTa-style
        # tokenizers) instead of emitting a half-found span.
        if start_position is None or end_position is None:
            start_position = 0
            end_position = 0

        inputs["start_positions"] = start_position
        inputs["end_positions"] = end_position
        return inputs

    except Exception as e:
        # Deliberate best-effort: report the offending example and carry on.
        print(f"[ERROR] Problem with example:\n{example}")
        print(f"Error: {e}")
        return {}

# Wrap the QA examples in a Hugging Face Dataset.
# NOTE(review): `df` is a pandas DataFrame, not a plain dict; `Dataset.from_pandas`
# looks like the intended constructor — confirm both produce the same columns.
dataset = Dataset.from_dict(df)

# NOTE(review): the "spanbert" tokenizer is loaded from the "bert-base-cased"
# checkpoint — presumably because the SpanBERT model shares that vocabulary; confirm.
tokenizer_spanbert = AutoTokenizer.from_pretrained("bert-base-cased")
tokenizer_roberta = AutoTokenizer.from_pretrained("roberta-base")

# Run preprocess_examples over the dataset one example at a time (batched=False),
# once per tokenizer, giving one tokenized dataset per tokenizer.
tokenized_dataset_spanbert = dataset.map(lambda x: preprocess_examples(x, tokenizer_spanbert), batched=False, desc="Preprocessing")
tokenized_dataset_roberta = dataset.map(lambda x: preprocess_examples(x, tokenizer_roberta), batched=False, desc="Preprocessing")

Preprocessing:   0%|          | 0/4562 [00:00<?, ? examples/s]

Preprocessing:   0%|          | 0/4562 [00:00<?, ? examples/s]

In [36]:
print("Next cell is executing, previous preprocessing finished.")

# Show the first two tokenized examples of each dataset under its own heading.
for heading, tokenized in (
    (" SpanBERT Tokenized Examples:", tokenized_dataset_spanbert),
    ("\n RoBERTa Tokenized Examples:", tokenized_dataset_roberta),
):
    print(heading)
    for i in range(2):
        print(tokenized[i])
        print("----")

Next cell is executing, previous preprocessing finished.
 SpanBERT Tokenized Examples:
{'dialogue_id': 'tr_4466', 'context': 'Hey , you wanna see a movie tomorrow ?', 'question': "The target utterance is 'Hey , you wanna see a movie tomorrow ?'. What is the causal span that triggers the emotion happiness?", 'answer': 'see a movie tomorrow ?', 'answer_start': 16, 'evidence_turn': 1.0, 'evidence_utterance': 'Hey , you wanna see a movie tomorrow ?', 'input_ids': [101, 4403, 117, 1128, 16445, 1267, 170, 2523, 4911, 136, 102, 1109, 4010, 15462, 3923, 1110, 112, 4403, 117, 1128, 16445, 1267, 170, 2523, 4911, 136, 112, 119, 1327, 1110, 1103, 11019, 25034, 8492, 1115, 9887, 1116, 1103, 7471, 9266, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [37]:
# --- Save the Datasets ---
# Each tokenized dataset is written to its own directory with Hugging Face's
# save_to_disk; the summary is printed only after both writes have finished.
for tokenized, out_dir in (
    (tokenized_dataset_spanbert, "spanbert_preprocessed_dataset"),
    (tokenized_dataset_roberta, "roberta_preprocessed_dataset"),
):
    tokenized.save_to_disk(out_dir)

print(
    "Saved preprocessed datasets:",
    " - SpanBERT preprocessed dataset saved as 'spanbert_preprocessed_dataset'",
    " - RoBERTa preprocessed dataset saved as 'roberta_preprocessed_dataset'",
    sep="\n",
)

Saving the dataset (0/1 shards):   0%|          | 0/4562 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4562 [00:00<?, ? examples/s]

Saved preprocessed datasets:
 - SpanBERT preprocessed dataset saved as 'spanbert_preprocessed_dataset'
 - RoBERTa preprocessed dataset saved as 'roberta_preprocessed_dataset'
