# Hugging Face Datasets

In [None]:
from datasets import load_dataset
import matplotlib.pyplot as plt

## Inspect available datasets

In [None]:
# Load the "emotion" dataset by name; the bare expression on the last line
# displays the returned object so its contents can be inspected.
emotions = load_dataset("emotion")
emotions

In [None]:
# Select the training split of the loaded dataset by its name
emotions["train"]

In [None]:
# Show the features (per-column description) of the training split
emotions["train"].features

In [None]:
# Get the first 10 texts of the training split: select the "text" column,
# then slice it
emotions["train"]["text"][:10]

In [None]:
# Get the first 10 labels of the training split: select the "label" column,
# then slice it
emotions["train"]["label"][:10]

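Datasets are based on the Apache Arrow format, which is a column-oriented storage format. This is why a whole column such as `text` or `label` can be selected by name, as in the cells above.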

In [None]:
# Get the first item of the training dataset (an integer index selects one row)
emotions["train"][0]

In [None]:
# The training split is iterable and yields one record at a time.
# Only the first record is printed; the loop is left immediately afterwards.
for example in emotions["train"]:
    text, label = example["text"], example["label"]
    print("Text", text, "Label", label)
    break

## Transforming label indices into strings and vice versa

In [None]:
# The "label" feature converts between integer class ids and class names.
# Look it up once and reuse it for all three demonstrations.
label_feature = emotions["train"].features["label"]
print("LABELS", label_feature)
# Integer ids -> class names
print("int2str", label_feature.int2str([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5]))
# Class names -> integer ids (tagged "str2int" to match the call being shown)
print("str2int", label_feature.str2int(["surprise", "sadness", "fear", "joy", "anger", "love"]))

## Transforming dataset into Pandas dataframe

In [None]:
# Switch the output format of the dataset to pandas, so that indexing a split
# returns pandas objects.
emotions.set_format(type="pandas")
# Slice the complete training split to obtain it as a single DataFrame
df_train = emotions["train"][:]

## Inspecting class frequency

In [None]:
# Add a readable class-name column next to the integer label column.
# The label feature is looked up once and its int2str is applied per row.
label_feature = emotions["train"].features["label"]
df_train["label_name"] = df_train["label"].apply(label_feature.int2str)
# Number of examples per class, rarest class first
df_train["label_name"].value_counts(ascending=True)

In [None]:
# Horizontal bar chart of the number of examples per class.
# The counts are taken from the "label_name" column rather than the integer
# "label" column, so the bars are tagged with class names instead of ids and
# match the grouping used by the word-count boxplot.
class_counts = df_train["label_name"].value_counts(ascending=True)
class_counts.plot.barh()
plt.title("Frequency of classes")

In [None]:
# Count whitespace-separated words in every text and store the result as a
# new "word_count" column.
words_per_text = df_train["text"].str.split()
df_train["word_count"] = words_per_text.apply(len)

# One box per class showing the distribution of word counts
df_train.boxplot(column="word_count", by="label_name", grid=False)
plt.title("Word count per tweet")
# Blank out the figure-level title so only the title set above is shown
plt.suptitle("")
plt.xlabel("class")
plt.ylabel("word count")

## Undo transformation into Pandas dataframe

In [None]:
# Undo the earlier set_format(type="pandas") call so the dataset returns its
# default output format again
emotions.reset_format()