In [1]:
import json
import pandas as pd

In [20]:
def create_dialog_dataset(raw_data_path):
    """Turn an annotated dialogue JSON file into a list of QA-style examples.

    The file must contain a JSON object mapping a dialogue id to a list of
    conversations; each conversation is a list of utterance dicts that carry
    at least the keys ``"utterance"`` and ``"emotion"``.  One example is
    produced for every utterance whose emotion is not ``"neutral"``.

    Args:
        raw_data_path: Path of the JSON file to read.

    Returns:
        A list of dicts with the keys ``dialogue_id``, ``context`` (all
        utterances up to and including the target, joined by single spaces),
        ``question``, ``answer`` (first entry of "expanded emotion cause span",
        or ``""`` when absent), ``answer_start`` (character index of the first
        occurrence of the answer in the context, 0 when not found),
        ``evidence_turn`` (first entry of "expanded emotion cause evidence"
        as an int, or ``None`` when absent / not numeric) and
        ``evidence_utterance`` (the utterance at that 1-based turn, or ``None``
        when the turn is missing or outside the history seen so far).
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(raw_data_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    qa_examples = []

    for dialogue_id, dialogues in data.items():
        for conversation in dialogues:
            conversation_history = []
            for utt in conversation:
                conversation_history.append(utt["utterance"])
                if utt["emotion"] == "neutral":
                    continue

                context = " ".join(conversation_history)
                target = utt["utterance"]
                emotion = utt["emotion"]
                question = f"The target utterance is '{target}'. What is the causal span that triggers the emotion {emotion}?"

                causal_spans = utt.get("expanded emotion cause span", [])
                answer_text = causal_spans[0] if causal_spans else ""

                # str.find returns -1 when the span is absent; fall back to 0.
                answer_start = context.find(answer_text)
                if answer_start == -1:
                    answer_start = 0

                # Reset for every example: otherwise an utterance without
                # evidence either raises UnboundLocalError (first example) or
                # silently inherits the previous example's evidence turn.
                evidence_turn = None
                evidence_turns = utt.get("expanded emotion cause evidence", [])
                if evidence_turns:
                    try:
                        evidence_turn = int(evidence_turns[0])
                    except (ValueError, TypeError):
                        # Non-numeric evidence markers carry no turn index.
                        evidence_turn = None

                # Turns are 1-based; only accept indices inside the history so
                # that zero/negative values cannot wrap around via negative
                # list indexing.
                evidence_utterance = None
                if evidence_turn is not None and 1 <= evidence_turn <= len(conversation_history):
                    evidence_utterance = conversation_history[evidence_turn - 1]

                qa_examples.append({
                    "dialogue_id": dialogue_id,
                    "context": context,
                    "question": question,
                    "answer": answer_text,
                    "answer_start": answer_start,
                    "evidence_turn": evidence_turn,
                    "evidence_utterance": evidence_utterance
                })

    return qa_examples


In [21]:
from transformers import AutoTokenizer
from datasets import Dataset

def preprocess_examples(example, tokenizer, max_length=512):
    """Tokenize one QA example and attach token-level answer span labels.

    The context and the question are encoded as a pair, truncated/padded to
    ``max_length``.  The character span
    ``[answer_start, answer_start + len(answer))`` is mapped to token indices
    through the tokenizer's offset mapping.

    Only tokens that belong to the context (sequence id 0) are considered:
    question offsets restart at 0, so scanning them could match the answer's
    character range by coincidence.  When the answer is empty, or its start or
    end cannot be located (e.g. it was truncated away), both labels are set to
    0 so the stored span is never inverted or half-defined.

    Args:
        example: Mapping with the keys "context", "question", "answer" and
            "answer_start".
        tokenizer: Callable accepting ``(context, question, **kwargs)`` and
            returning a mapping that contains "offset_mapping".  If the result
            offers ``sequence_ids()`` it is used to isolate context tokens.
        max_length: Maximum encoded length passed to the tokenizer.

    Returns:
        The tokenizer output without "offset_mapping" and with
        "start_positions" / "end_positions" added, or ``{}`` if any exception
        occurred while processing the example (the error is printed).
    """
    try:
        # Tokenize context and question with offset mapping.
        inputs = tokenizer(
            example["context"],
            example["question"],
            truncation=True,
            max_length=max_length,
            padding="max_length",
            return_offsets_mapping=True
        )
        # Retrieve and remove the offset mapping.
        offsets = inputs.pop("offset_mapping")

        # Sequence ids tell context tokens (0) from question tokens (1) and
        # special/padding tokens (None).  Fall back to scanning every token
        # when the tokenizer output does not provide them.
        sequence_ids_fn = getattr(inputs, "sequence_ids", None)
        sequence_ids = sequence_ids_fn() if callable(sequence_ids_fn) else None

        answer_text = example["answer"]
        answer_start = example["answer_start"]
        answer_end = answer_start + len(answer_text)

        start_position, end_position = None, None
        if answer_text:
            # Find token indices corresponding to the answer span.
            for idx, (start, end) in enumerate(offsets):
                if sequence_ids is not None and sequence_ids[idx] != 0:
                    continue
                if start_position is None and start <= answer_start < end:
                    start_position = idx
                if start < answer_end <= end:
                    end_position = idx
                    break

            if start_position is None:
                print(f"[WARNING] Start position not found for answer: '{answer_text}' in context.")
            if end_position is None:
                print(f"[WARNING] End position not found for answer: '{answer_text}' in context.")

        # An incomplete span cannot be used as a label; point both ends at
        # index 0 instead of leaving start > end.
        if start_position is None or end_position is None:
            start_position, end_position = 0, 0

        inputs["start_positions"] = start_position
        inputs["end_positions"] = end_position
        return inputs

    except Exception as e:
        # Best-effort: report the offending example and keep the run going.
        print(f"[ERROR] Problem with example:\n{example}\nError: {e}")
        return {}



### Loading Data

In [22]:
def load_and_create_dataset(raw_data_path):
    """Read the annotated dialogues at ``raw_data_path`` and return them as a ``Dataset``.

    The QA examples produced by ``create_dialog_dataset`` are first collected
    in a pandas DataFrame (one row per example) and then handed to
    ``Dataset.from_dict``.
    """
    # NOTE(review): from_dict receives a DataFrame rather than a plain dict;
    # Dataset.from_pandas may be the intended constructor — confirm.
    examples_frame = pd.DataFrame(create_dialog_dataset(raw_data_path))
    return Dataset.from_dict(examples_frame)

# Locations of the annotated splits (adjust according to your folder structure).
train_data_path = "data/original_annotation/dailydialog_train.json"
val_data_path = "data/original_annotation/dailydialog_valid.json"
test_data_path = "data/original_annotation/dailydialog_test.json"

# Build the raw QA datasets, one per split, in train / val / test order.
train_dataset, val_dataset, test_dataset = [
    load_and_create_dataset(split_path)
    for split_path in (train_data_path, val_data_path, test_data_path)
]


## Preprocess and Save Data

In [24]:
def _tokenize_split(dataset, tokenizer, split_name, split_tag, model_name):
    """Announce and run ``preprocess_examples`` over one split with one tokenizer.

    Args:
        dataset: Dataset for the split; its ``map`` method is called once.
        tokenizer: Tokenizer forwarded to ``preprocess_examples``.
        split_name: Split wording used in the printed status line
            (e.g. "training").
        split_tag: Short split label used in the progress-bar description
            (e.g. "Train").
        model_name: Model label used in both messages (e.g. "SpanBERT").

    Returns:
        Whatever ``dataset.map`` returns for the per-example preprocessing.
    """
    print(f"Preprocessing {split_name} set with {model_name} tokenizer...")
    return dataset.map(
        lambda x: preprocess_examples(x, tokenizer),
        batched=False,
        desc=f"Preprocessing {split_tag} @ {model_name}",
    )


# Initialize tokenizers for SpanBERT and RoBERTa.
tokenizer_spanbert = AutoTokenizer.from_pretrained("SpanBert/spanbert-base-cased")
tokenizer_roberta  = AutoTokenizer.from_pretrained("roberta-base")

# Preprocess (tokenize) each dataset for each model.
tokenized_train_spanbert = _tokenize_split(train_dataset, tokenizer_spanbert, "training", "Train", "SpanBERT")
tokenized_val_spanbert = _tokenize_split(val_dataset, tokenizer_spanbert, "validation", "Val", "SpanBERT")
tokenized_test_spanbert = _tokenize_split(test_dataset, tokenizer_spanbert, "test", "Test", "SpanBERT")

tokenized_train_roberta = _tokenize_split(train_dataset, tokenizer_roberta, "training", "Train", "RoBERTa")
tokenized_val_roberta = _tokenize_split(val_dataset, tokenizer_roberta, "validation", "Val", "RoBERTa")
tokenized_test_roberta = _tokenize_split(test_dataset, tokenizer_roberta, "test", "Test", "RoBERTa")

# -------------------------
# Save Preprocessed Datasets to Disk
# -------------------------
# Saving happens only after every split has been tokenized, in the order
# SpanBERT train/val/test followed by RoBERTa train/val/test.
for _tokenized_dataset, _output_dir in (
    (tokenized_train_spanbert, "SpanBert/spanbert_preprocessed_train_dataset"),
    (tokenized_val_spanbert, "SpanBert/spanbert_preprocessed_val_dataset"),
    (tokenized_test_spanbert, "SpanBert/spanbert_preprocessed_test_dataset"),
    (tokenized_train_roberta, "Roberta/roberta_preprocessed_train_dataset"),
    (tokenized_val_roberta, "Roberta/roberta_preprocessed_val_dataset"),
    (tokenized_test_roberta, "Roberta/roberta_preprocessed_test_dataset"),
):
    _tokenized_dataset.save_to_disk(_output_dir)

print("Saved preprocessed datasets:")
print(" - SpanBERT: train, val, test saved as 'spanbert_preprocessed_*_dataset'")
print(" - RoBERTa: train, val, test saved as 'roberta_preprocessed_*_dataset'")

Preprocessing training set with SpanBERT tokenizer...


Preprocessing Train @ SpanBERT:  55%|█████████████████████                 | 2522/4562 [00:01<00:01, 1367.47 examples/s]



Preprocessing Train @ SpanBERT:  71%|███████████████████████████           | 3245/4562 [00:02<00:01, 1219.14 examples/s]



Preprocessing Train @ SpanBERT:  99%|█████████████████████████████████████▋| 4529/4562 [00:03<00:00, 1367.55 examples/s]



Preprocessing Train @ SpanBERT: 100%|██████████████████████████████████████| 4562/4562 [00:03<00:00, 1340.79 examples/s]


Preprocessing validation set with SpanBERT tokenizer...


Preprocessing Val @ SpanBERT: 100%|██████████████████████████████████████████| 200/200 [00:00<00:00, 1275.47 examples/s]


Preprocessing test set with SpanBERT tokenizer...


Preprocessing Test @ SpanBERT:   0%|                                                    | 0/1099 [00:00<?, ? examples/s]



Preprocessing Test @ SpanBERT: 100%|███████████████████████████████████████| 1099/1099 [00:00<00:00, 1300.91 examples/s]


Preprocessing training set with RoBERTa tokenizer...


Preprocessing Train @ RoBERTa:  78%|██████████████████████████████▌        | 3581/4562 [00:02<00:00, 1611.58 examples/s]



Preprocessing Train @ RoBERTa:  96%|█████████████████████████████████████▍ | 4381/4562 [00:02<00:00, 1482.34 examples/s]



Preprocessing Train @ RoBERTa: 100%|███████████████████████████████████████| 4562/4562 [00:02<00:00, 1577.44 examples/s]


Preprocessing validation set with RoBERTa tokenizer...


Preprocessing Val @ RoBERTa: 100%|███████████████████████████████████████████| 200/200 [00:00<00:00, 1499.39 examples/s]


Preprocessing test set with RoBERTa tokenizer...


Preprocessing Test @ RoBERTa:   0%|                                                     | 0/1099 [00:00<?, ? examples/s]



Preprocessing Test @ RoBERTa: 100%|████████████████████████████████████████| 1099/1099 [00:00<00:00, 1524.03 examples/s]
Saving the dataset (1/1 shards): 100%|████████████████████████████████████| 4562/4562 [00:00<00:00, 17653.24 examples/s]
Saving the dataset (1/1 shards): 100%|███████████████████████████████████████| 200/200 [00:00<00:00, 5042.78 examples/s]
Saving the dataset (1/1 shards): 100%|████████████████████████████████████| 1099/1099 [00:00<00:00, 13757.56 examples/s]
Saving the dataset (1/1 shards): 100%|████████████████████████████████████| 4562/4562 [00:00<00:00, 20597.12 examples/s]
Saving the dataset (1/1 shards): 100%|███████████████████████████████████████| 200/200 [00:00<00:00, 7739.57 examples/s]
Saving the dataset (1/1 shards): 100%|████████████████████████████████████| 1099/1099 [00:00<00:00, 11472.91 examples/s]

Saved preprocessed datasets:
 - SpanBERT: train, val, test saved as 'spanbert_preprocessed_*_dataset'
 - RoBERTa: train, val, test saved as 'roberta_preprocessed_*_dataset'



