In [1]:

import os
import json
from datasets import Dataset, load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def save_dataset_with_metadata(dataset: Dataset, path: str, emotion_labels: list, label_type: str = "multi-label"):
    """Save ``dataset`` to ``path`` and add label metadata to its ``dataset_info.json``.

    After ``dataset.save_to_disk(path)`` the ``dataset_info.json`` found in
    ``path`` is re-read, extended with three keys and written back:

    * ``label_type``     - the ``label_type`` argument, stored as given
    * ``emotion_labels`` - the label names, in the order supplied
    * ``num_labels``     - the number of label names

    Args:
        dataset: Object exposing ``save_to_disk(path)``; that call is expected
            to leave a ``dataset_info.json`` file inside ``path``.
        path: Target directory of the saved dataset.
        emotion_labels: Label names. Any iterable is accepted; it is stored as
            a JSON list.
        label_type: Free-form task tag stored in the metadata.

    Raises:
        FileNotFoundError: If ``dataset_info.json`` is missing after saving.
        TypeError: If the metadata cannot be serialised to JSON. The existing
            ``dataset_info.json`` is left untouched in that case.
    """
    # Materialise once so generators / dict views work and len() is defined.
    label_names = list(emotion_labels)

    dataset.save_to_disk(path)
    info_path = os.path.join(path, "dataset_info.json")

    with open(info_path, "r", encoding="utf-8") as f:
        dataset_info = json.load(f)

    dataset_info["label_type"] = label_type
    dataset_info["emotion_labels"] = label_names
    dataset_info["num_labels"] = len(label_names)

    # Serialise *before* reopening the file in "w" mode: opening truncates it,
    # so a failing json.dump would otherwise leave a corrupt info file behind.
    serialized = json.dumps(dataset_info, indent=2)
    with open(info_path, "w", encoding="utf-8") as f:
        f.write(serialized)

    print(f"dataset saved to {path} with metadata added.")


# CancerEMO
### Multi-label classification, where each class is predicted separately from the others (multiple classes can be true)

In [3]:
# Load the raw CancerEmo data and show its schema plus the first row.
dataset = load_from_disk("emotion_datasets/src/data/CancerEmo")
print(dataset)
print(dataset[0])

# Every column except the sentence itself is treated as an emotion flag.
emotion_labels = [column for column in dataset.features if column != "text"]


def convert_labels(example):
    """Attach a multi-hot ``labels`` vector ordered like ``emotion_labels``."""
    # NOTE(review): the sample rows printed by this cell hold None for most
    # emotions; None is falsy, so it is encoded as 0.0 exactly like False.
    # Confirm that missing annotations should count as negatives.
    multi_hot = []
    for emotion in emotion_labels:
        multi_hot.append(1.0 if example[emotion] else 0.0)
    example["labels"] = multi_hot
    return example


dataset = dataset.map(convert_labels)
print(dataset[5])

Dataset({
    features: ['text', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust'],
    num_rows: 11642
})
{'text': 'And it will no doubt make me happy in the morning as well!', 'anger': False, 'anticipation': None, 'disgust': None, 'fear': None, 'joy': None, 'sadness': None, 'surprise': None, 'trust': None}
{'text': 'doctors were surprised and pleased with my range of motion at my first post-surgical follow-up.', 'anger': False, 'anticipation': None, 'disgust': None, 'fear': None, 'joy': True, 'sadness': None, 'surprise': None, 'trust': None, 'labels': [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]}


In [4]:
# Persist the processed CancerEmo data together with its label metadata.
save_dataset_with_metadata(
    dataset=dataset,
    path="emotion_datasets/src/Preprocessed/CancerEmo",
    emotion_labels=emotion_labels,
    label_type="multi-label",
)

Saving the dataset (1/1 shards): 100%|██████████| 11642/11642 [00:00<00:00, 413180.52 examples/s]

dataset saved to emotion_datasets/src/Preprocessed/CancerEmo with metadata added.





# EmoBank
### Heads up: this is (multivariate) regression, so you can't use BERT's classification head

In [5]:
# Load the raw EmoBank data and show its schema plus the first row.
dataset = load_from_disk("emotion_datasets/src/data/EmoBank")
print(dataset)
print(dataset[0])

# Drop the "split" column before building the targets.
dataset = dataset.remove_columns("split")

# The regression target is the [V, A, D] triple, in that order.
dataset = dataset.map(
    lambda row: {"labels": [row[axis] for axis in ("V", "A", "D")]}
)
print(dataset[5])

Dataset({
    features: ['id', 'document', 'category', 'subcategory', 'split', 'V', 'A', 'D', 'text'],
    num_rows: 10062
})
{'id': '110CYL068_1079_1110', 'document': '110CYL068', 'category': 'letters', 'subcategory': 'philanthropic-fundraising', 'split': 'test', 'V': 2.799999952316284, 'A': 3.0999999046325684, 'D': 2.799999952316284, 'text': "If I wasn't working here."}
{'id': '110CYL068_163_275', 'document': '110CYL068', 'category': 'letters', 'subcategory': 'philanthropic-fundraising', 'V': 3.25, 'A': 2.880000114440918, 'D': 3.0, 'text': "Here's another story of success from what might seem like an unlikely source: Goodwill's controller, Juli.", 'labels': [3.25, 2.880000114440918, 3.0]}


In [6]:
# Persist the processed EmoBank data; label names follow the V/A/D target order.
save_dataset_with_metadata(
    dataset=dataset,
    path="emotion_datasets/src/Preprocessed/EmoBank",
    emotion_labels=["V", "A", "D"],
    label_type="multi-variate",
)

Saving the dataset (1/1 shards): 100%|██████████| 10062/10062 [00:00<00:00, 503292.47 examples/s]

dataset saved to emotion_datasets/src/Preprocessed/EmoBank with metadata added.





# EmotionStimulus
### Multi-class classification

In [7]:
# Load the raw EmotionStimulus data and show its schema plus the first row.
dataset = load_from_disk("emotion_datasets/src/data/EmotionStimulus")
print(dataset)
print(dataset[0])

# Every column except the sentence itself is treated as an emotion flag.
emotion_labels = [label for label in dataset.features.keys() if label != "text"]

# Class index used for rows in which no emotion column is set.
NO_EMOTION_INDEX = len(emotion_labels)


def convert_labels(example):
    """Return the class index of the first emotion that is set for ``example``.

    Rows without any emotion get ``NO_EMOTION_INDEX``. If several emotions are
    set, only the first one (in ``emotion_labels`` order) is kept.
    """
    for class_index, label in enumerate(emotion_labels):
        if example[label]:
            return {"labels": class_index}
    return {"labels": NO_EMOTION_INDEX}


dataset = dataset.map(convert_labels)

# The fallback index is one past the named labels. If it actually occurs, give
# it a name so the label list (and the num_labels derived from it when saving)
# covers every class index present in the data.
if NO_EMOTION_INDEX in dataset.unique("labels"):
    emotion_labels = emotion_labels + ["no_emotion"]

print(dataset[2300])

Dataset({
    features: ['text', 'anger', 'disgust', 'fear', 'happy', 'sad', 'shame', 'surprise'],
    num_rows: 2407
})
{'text': 'Lennox has always truly wanted to fight for the world title and was happy taking the tough route.', 'anger': False, 'disgust': False, 'fear': False, 'happy': True, 'sad': False, 'shame': False, 'surprise': False}
{'text': 'The bewilderment of the child Nizan can not be overstated.', 'anger': False, 'disgust': False, 'fear': False, 'happy': False, 'sad': False, 'shame': False, 'surprise': True, 'labels': 6}


In [8]:
# Persist the processed EmotionStimulus data together with its label metadata.
save_dataset_with_metadata(
    dataset=dataset,
    path="emotion_datasets/src/Preprocessed/EmotionStimulus",
    emotion_labels=emotion_labels,
    label_type="multi-class",
)

Saving the dataset (1/1 shards): 100%|██████████| 2407/2407 [00:00<00:00, 187589.46 examples/s]

dataset saved to emotion_datasets/src/Preprocessed/EmotionStimulus with metadata added.





# GoodNewsEveryone
### Multi-class classification

In [9]:
# Load the raw GoodNewsEveryone data and show its schema plus the first row.
dataset = load_from_disk("emotion_datasets/src/data/GoodNewsEveryone")
print(dataset)
print(dataset[0])

# Inspect the "intensity" column before deciding what to do with it.
print(dataset.unique("intensity"))
filtered_dataset = dataset.filter(lambda example: example["intensity"] == 'medium')
# Author's finding from this check: only 10 rows have an intensity other than
# 'medium', so the column carries little information and is dropped.
print(filtered_dataset)
dataset = dataset.remove_columns("intensity")

# Emotion columns are exactly the boolean-typed features.
emotion_labels = [emotion for emotion, feature in dataset.features.items() if str(feature.dtype) == "bool"]
print(emotion_labels)

# Class index used for rows in which no emotion column is set.
NO_EMOTION_INDEX = len(emotion_labels)


def convert_labels(example):
    """Return the class index of the first emotion that is set for ``example``.

    Rows without any emotion get ``NO_EMOTION_INDEX``. If several emotions are
    set, only the first one (in ``emotion_labels`` order) is kept.
    """
    for class_index, label in enumerate(emotion_labels):
        if example[label]:
            return {"labels": class_index}
    return {"labels": NO_EMOTION_INDEX}


dataset = dataset.map(convert_labels)

# The fallback index is one past the named labels. If it actually occurs, give
# it a name so the label list (and the num_labels derived from it when saving)
# covers every class index present in the data.
if NO_EMOTION_INDEX in dataset.unique("labels"):
    emotion_labels = emotion_labels + ["no_emotion"]

print(dataset[10])


Dataset({
    features: ['id', 'source', 'text', 'intensity', 'anger', 'annoyance', 'disgust', 'fear', 'guilt', 'joy', 'love_including_like', 'negative_anticipation_including_pessimism', 'negative_surprise', 'positive_anticipation_including_optimism', 'positive_surprise', 'pride', 'sadness', 'shame', 'trust'],
    num_rows: 5000
})
{'id': '86693d59', 'source': 'dailymail', 'text': "DIY penis enlargements are a 'nationwide problem' in Papua New Guinea", 'intensity': 'medium', 'anger': False, 'annoyance': False, 'disgust': False, 'fear': False, 'guilt': False, 'joy': False, 'love_including_like': False, 'negative_anticipation_including_pessimism': False, 'negative_surprise': True, 'positive_anticipation_including_optimism': False, 'positive_surprise': False, 'pride': False, 'sadness': False, 'shame': False, 'trust': False}
['medium', 'high', None, 'weak', 'low']
Dataset({
    features: ['id', 'source', 'text', 'intensity', 'anger', 'annoyance', 'disgust', 'fear', 'guilt', 'joy', 'love_in

In [10]:
# Persist the processed GoodNewsEveryone data together with its label metadata.
save_dataset_with_metadata(
    dataset=dataset,
    path="emotion_datasets/src/Preprocessed/GoodNewsEveryone",
    emotion_labels=emotion_labels,
    label_type="multi-class",
)

Saving the dataset (1/1 shards): 100%|██████████| 5000/5000 [00:00<00:00, 183947.79 examples/s]

dataset saved to emotion_datasets/src/Preprocessed/GoodNewsEveryone with metadata added.





# Semeval2018Intensity

### Multivariate regression

In [11]:
# Load the raw Semeval2018Intensity data and show its schema plus the first row.
dataset = load_from_disk("emotion_datasets/src/data/Semeval2018Intensity")
print(dataset)
print(dataset[0])

# Keep only rows that have both an id and a text.
dataset = dataset.filter(
    lambda row: not (row["id"] is None or row["text"] is None)
)
print(dataset)


def replace_none(example):
    """Return a copy of ``example`` in which every None value is replaced by 0.0."""
    # NOTE(review): this also turns missing scores into 0.0 (see 'joy' in the
    # printed sample row) - confirm a missing score should mean zero intensity.
    filled = {}
    for column, value in example.items():
        filled[column] = value if value is not None else 0.0
    return filled


dataset = dataset.map(replace_none)
print(dataset[0])

# Regression targets are the float-typed columns, in feature order.
emotion_labels = [
    name
    for name, spec in dataset.features.items()
    if "float" in str(getattr(spec, "dtype", ""))
]
print(emotion_labels)


def make_labels(example):
    """Collect the target columns into one ``labels`` list."""
    targets = []
    for emotion in emotion_labels:
        targets.append(example[emotion])
    return {"labels": targets}


dataset = dataset.map(make_labels)
print(dataset)
print(dataset[0])

Dataset({
    features: ['id', 'text', 'valence', 'anger', 'fear', 'joy', 'sadness'],
    num_rows: 11288
})
{'id': '2018-En-02381', 'text': "Did men call themselves shy and mean it? So I reassure him that I'm just making sure he's a good investment and alla that 🙄", 'valence': 0.593, 'anger': 0.379, 'fear': 0.365, 'joy': None, 'sadness': 0.35}
Dataset({
    features: ['id', 'text', 'valence', 'anger', 'fear', 'joy', 'sadness'],
    num_rows: 11090
})
{'id': '2018-En-02381', 'text': "Did men call themselves shy and mean it? So I reassure him that I'm just making sure he's a good investment and alla that 🙄", 'valence': 0.593, 'anger': 0.379, 'fear': 0.365, 'joy': 0.0, 'sadness': 0.35}
['valence', 'anger', 'fear', 'joy', 'sadness']
Dataset({
    features: ['id', 'text', 'valence', 'anger', 'fear', 'joy', 'sadness', 'labels'],
    num_rows: 11090
})
{'id': '2018-En-02381', 'text': "Did men call themselves shy and mean it? So I reassure him that I'm just making sure he's a good investment 

In [12]:
# Persist the processed Semeval2018Intensity data together with its label metadata.
save_dataset_with_metadata(
    dataset=dataset,
    path="emotion_datasets/src/Preprocessed/Semeval2018Intensity",
    emotion_labels=emotion_labels,
    label_type="multi-variate",
)

Saving the dataset (1/1 shards): 100%|██████████| 11090/11090 [00:00<00:00, 553168.48 examples/s]

dataset saved to emotion_datasets/src/Preprocessed/Semeval2018Intensity with metadata added.





# SentimentalLIAR

### Multivariate regression

### This dataset also has a sentiment score, sentiment, speaker and context (a bit iffy), but I chose to ignore those: since we want to generalise here, training on all of those features would just be a pain. So we only use `statement`, which I renamed to `text`.

In [13]:
# Load the raw SentimentalLIAR data and show its schema plus the first row.
dataset = load_from_disk("emotion_datasets/src/data/SentimentalLIAR")
print(dataset)
print(dataset[0])

# Drop the metadata / sentiment columns and expose the statement as "text".
dataset = dataset.remove_columns(
    ["subject", "speaker", "context", "sentiment", "sentiment_score"]
)
dataset = dataset.rename_column("statement", "text")
print(dataset)

# Regression targets are the float-typed columns, in feature order.
emotion_labels = [
    name
    for name, spec in dataset.features.items()
    if "float" in str(getattr(spec, "dtype", ""))
]
print(emotion_labels)


def make_labels(example):
    """Collect the target columns into one ``labels`` list."""
    targets = []
    for emotion in emotion_labels:
        targets.append(example[emotion])
    return {"labels": targets}


dataset = dataset.map(make_labels)
print(dataset)
print(dataset[0])

Dataset({
    features: ['ID', 'statement', 'subject', 'speaker', 'context', 'sentiment', 'sentiment_score', 'anger', 'fear', 'joy', 'disgust', 'sad'],
    num_rows: 12786
})
{'ID': '11972.json', 'statement': 'Building a wall on the U.S.-Mexico border will take literally years.', 'subject': 'immigration', 'speaker': 'rick-perry', 'context': 'Radio interview', 'sentiment': 'NEGATIVE', 'sentiment_score': -0.20000000298023224, 'anger': 0.067151, 'fear': 0.155968, 'joy': 0.368879, 'disgust': 0.198711, 'sad': 0.311238}
Dataset({
    features: ['ID', 'text', 'anger', 'fear', 'joy', 'disgust', 'sad'],
    num_rows: 12786
})
['anger', 'fear', 'joy', 'disgust', 'sad']
Dataset({
    features: ['ID', 'text', 'anger', 'fear', 'joy', 'disgust', 'sad', 'labels'],
    num_rows: 12786
})
{'ID': '11972.json', 'text': 'Building a wall on the U.S.-Mexico border will take literally years.', 'anger': 0.067151, 'fear': 0.155968, 'joy': 0.368879, 'disgust': 0.198711, 'sad': 0.311238, 'labels': [0.067151, 0.1

In [14]:
# Persist the processed SentimentalLIAR data together with its label metadata.
save_dataset_with_metadata(
    dataset=dataset,
    path="emotion_datasets/src/Preprocessed/SentimentalLIAR",
    emotion_labels=emotion_labels,
    label_type="multi-variate",
)

Saving the dataset (1/1 shards): 100%|██████████| 12786/12786 [00:00<00:00, 632395.12 examples/s]

dataset saved to emotion_datasets/src/Preprocessed/SentimentalLIAR with metadata added.





# TalesEmotions

### Multi-class classification

In [15]:
# Load the raw TalesEmotions data and show its schema plus the first row.
dataset = load_from_disk("emotion_datasets/src/data/TalesEmotions")
print(dataset)
print(dataset[0])


def replace_twos(example):
    """Return a copy of ``example`` in which every value equal to 2 becomes 1."""
    # Applied to every column; the printed sample row shows emotion counts of 2.
    return {k: (1 if v == 2 else v) for k, v in example.items()}


dataset = dataset.map(replace_twos)
print(dataset[0])

# Emotion columns are the integer-typed features, in feature order.
emotion_labels = [
    col for col, feature in dataset.features.items()
    if hasattr(feature, "dtype") and "int" in str(feature.dtype)
]
print(emotion_labels)

# Class index used for rows in which no emotion column is set.
NO_EMOTION_INDEX = len(emotion_labels)


def convert_labels(example):
    """Return the class index of the first emotion that is set for ``example``.

    Rows without any emotion get ``NO_EMOTION_INDEX``. If several emotions are
    set, only the first one (in ``emotion_labels`` order) is kept.
    """
    for class_index, label in enumerate(emotion_labels):
        if example[label]:
            return {"labels": class_index}
    return {"labels": NO_EMOTION_INDEX}


dataset = dataset.map(convert_labels)

# The fallback index is one past the named labels. If it actually occurs, give
# it a name so the label list (and the num_labels derived from it when saving)
# covers every class index present in the data.
if NO_EMOTION_INDEX in dataset.unique("labels"):
    emotion_labels = emotion_labels + ["no_emotion"]

print(dataset)
print(dataset[0])

Dataset({
    features: ['author', 'story', 'sent_id', 'text', 'angry', 'disgusted', 'fearful', 'happy', 'neutral', 'positively surprised', 'negatively surprised'],
    num_rows: 15302
})
{'author': 'Potter', 'story': 'the_tale_of_mr_jeremy_fisher', 'sent_id': '0', 'text': 'Once upon a time there was a frog called Mr. Jeremy Fisher; he lived in a little damp house amongst the buttercups at the edge of a pond.', 'angry': 0, 'disgusted': 0, 'fearful': 0, 'happy': 0, 'neutral': 2, 'positively surprised': 0, 'negatively surprised': 0}
{'author': 'Potter', 'story': 'the_tale_of_mr_jeremy_fisher', 'sent_id': '0', 'text': 'Once upon a time there was a frog called Mr. Jeremy Fisher; he lived in a little damp house amongst the buttercups at the edge of a pond.', 'angry': 0, 'disgusted': 0, 'fearful': 0, 'happy': 0, 'neutral': 1, 'positively surprised': 0, 'negatively surprised': 0}
['angry', 'disgusted', 'fearful', 'happy', 'neutral', 'positively surprised', 'negatively surprised']
Dataset({
  

In [16]:
# Persist the processed TalesEmotions data together with its label metadata.
save_dataset_with_metadata(
    dataset=dataset,
    path="emotion_datasets/src/Preprocessed/TalesEmotions",
    emotion_labels=emotion_labels,
    label_type="multi-class",
)

Saving the dataset (1/1 shards): 100%|██████████| 15302/15302 [00:00<00:00, 446304.33 examples/s]

dataset saved to emotion_datasets/src/Preprocessed/TalesEmotions with metadata added.





# UsVsThem

### Multi-label classification (classes need to be predicted as 0-1 independently)

In [17]:
# Load the raw UsVsThem data and show its schema plus the first row.
dataset = load_from_disk("emotion_datasets/src/data/UsVsThem")
print(dataset)
print(dataset[0])

# Emotion columns are exactly the boolean-typed features.
emotion_labels = [
    name
    for name, spec in dataset.features.items()
    if str(spec.dtype) == "bool"
]
print(emotion_labels)


def convert_labels(example):
    """Attach a multi-hot ``labels`` vector ordered like ``emotion_labels``."""
    multi_hot = []
    for emotion in emotion_labels:
        multi_hot.append(1.0 if example[emotion] else 0.0)
    example["labels"] = multi_hot
    return example


dataset = dataset.map(convert_labels)
print(dataset[0])

Dataset({
    features: ['bias', 'group', 'usVSthem_scale', 'text', 'anger', 'contempt', 'disgust', 'fear', 'gratitude', 'guilt', 'happiness', 'hope', 'pride', 'relief', 'sadness', 'sympathy', 'neutral'],
    num_rows: 6864
})
{'bias': 'left', 'group': 'Conservatives', 'usVSthem_scale': 0.8030909712825848, 'text': "Well yeah but then so does the Republican congress' complete acquiescence and obsequious covering for Trump.  We shouldn't be surprised. Mainstream GOP members in the US are entirely willing to align themselves with outright fascists if they think it will help them politically.", 'anger': False, 'contempt': True, 'disgust': True, 'fear': False, 'gratitude': False, 'guilt': False, 'happiness': False, 'hope': False, 'pride': False, 'relief': False, 'sadness': False, 'sympathy': False, 'neutral': False}
['anger', 'contempt', 'disgust', 'fear', 'gratitude', 'guilt', 'happiness', 'hope', 'pride', 'relief', 'sadness', 'sympathy', 'neutral']
{'bias': 'left', 'group': 'Conservatives

In [18]:
# Persist the processed UsVsThem data together with its label metadata.
save_dataset_with_metadata(
    dataset=dataset,
    path="emotion_datasets/src/Preprocessed/UsVsThem",
    emotion_labels=emotion_labels,
    label_type="multi-label",
)

Saving the dataset (1/1 shards): 100%|██████████| 6864/6864 [00:00<00:00, 182660.71 examples/s]

dataset saved to emotion_datasets/src/Preprocessed/UsVsThem with metadata added.





# WASSA22
### Multi-class classification (only 1 emotion true)
### I dropped empathy and distress since they are float values (they could be used for a larger task, e.g. classification combined with regression, but we are not doing that)

In [19]:
# Load the raw WASSA22 data and show its schema plus the first row.
dataset = load_from_disk("emotion_datasets/src/data/WASSA22")
print(dataset)
print(dataset[0])

# Empathy and distress are float scores, not emotion classes; drop them.
dataset = dataset.remove_columns(["empathy", "distress"])
print(dataset)

# Emotion columns are exactly the boolean-typed features.
emotion_labels = [emotion for emotion, feature in dataset.features.items() if str(feature.dtype) == "bool"]
print(emotion_labels)

# Class index used for rows in which no emotion column is set.
NO_EMOTION_INDEX = len(emotion_labels)


def convert_labels(example):
    """Return the class index of the first emotion that is set for ``example``.

    Rows without any emotion get ``NO_EMOTION_INDEX``. If several emotions are
    set, only the first one (in ``emotion_labels`` order) is kept.
    """
    for class_index, label in enumerate(emotion_labels):
        if example[label]:
            return {"labels": class_index}
    return {"labels": NO_EMOTION_INDEX}


dataset = dataset.map(convert_labels)

# The fallback index is one past the named labels. If it actually occurs, give
# it a name so the label list (and the num_labels derived from it when saving)
# covers every class index present in the data.
if NO_EMOTION_INDEX in dataset.unique("labels"):
    emotion_labels = emotion_labels + ["no_emotion"]

print(dataset)
print(dataset[0])



Dataset({
    features: ['id', 'article_id', 'text', 'empathy', 'distress', 'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'],
    num_rows: 2130
})
{'id': 'R_1hGrPtWM4SumG0U_3', 'article_id': '206', 'text': "no matter what your heritage, you should be able to serve your country.  her thai heritage shouldn't preclude her and shouldn't have been an issue in this debate.  tammy duckworth and her family should be congratulated on the services they have provided to this country.  any type of racism should not be allowed in a debate", 'empathy': 5.333000183105469, 'distress': 3.5, 'anger': False, 'disgust': False, 'fear': False, 'joy': False, 'neutral': True, 'sadness': False, 'surprise': False}
Dataset({
    features: ['id', 'article_id', 'text', 'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'],
    num_rows: 2130
})
['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
Dataset({
    features: ['id', 'article_id', 'text', 'anger', 'dis

In [20]:
# Persist the processed WASSA22 data together with its label metadata.
save_dataset_with_metadata(
    dataset=dataset,
    path="emotion_datasets/src/Preprocessed/WASSA22",
    emotion_labels=emotion_labels,
    label_type="multi-class",
)

Saving the dataset (1/1 shards): 100%|██████████| 2130/2130 [00:00<00:00, 107924.32 examples/s]

dataset saved to emotion_datasets/src/Preprocessed/WASSA22 with metadata added.





# XED
### Multi-label classification (classes need to be predicted as 0-1 independently)

In [21]:
# Load the raw XED data and show its schema plus one sample row.
dataset = load_from_disk("emotion_datasets/src/data/XED")
print(dataset)
print(dataset[10])

# Emotion columns are exactly the boolean-typed features.
emotion_labels = [
    name
    for name, spec in dataset.features.items()
    if str(spec.dtype) == "bool"
]
print(emotion_labels)


def convert_labels(example):
    """Attach a multi-hot ``labels`` vector ordered like ``emotion_labels``."""
    multi_hot = []
    for emotion in emotion_labels:
        multi_hot.append(1.0 if example[emotion] else 0.0)
    example["labels"] = multi_hot
    return example


dataset = dataset.map(convert_labels)
print(dataset[0])

Dataset({
    features: ['text', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust', 'neutral'],
    num_rows: 27195
})
{'text': 'A little dicey doing a background on an FBI agent .', 'anger': True, 'anticipation': False, 'disgust': False, 'fear': False, 'joy': False, 'sadness': False, 'surprise': False, 'trust': False, 'neutral': False}
['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust', 'neutral']
{'text': ', ...', 'anger': True, 'anticipation': False, 'disgust': False, 'fear': False, 'joy': False, 'sadness': False, 'surprise': False, 'trust': False, 'neutral': False, 'labels': [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}


In [22]:
# Persist the processed XED data together with its label metadata.
save_dataset_with_metadata(
    dataset=dataset,
    path="emotion_datasets/src/Preprocessed/XED",
    emotion_labels=emotion_labels,
    label_type="multi-label",
)

Saving the dataset (1/1 shards): 100%|██████████| 27195/27195 [00:00<00:00, 526840.35 examples/s]

dataset saved to emotion_datasets/src/Preprocessed/XED with metadata added.



