# SSEC trained vs human annotation

We train a feature-based ML model on the SSEC corpus, which is annotated with Plutchik's eight emotions (multi-label).
The trained model will then be evaluated on a corpus with our own annotation to compare the human annotations with learned
labels from the model.

In [210]:
import pandas
import nltk
from sklearn.metrics import multilabel_confusion_matrix as mcm

## Corpus data

In [211]:
# File with all corpora
with open('unified-dataset.json', encoding='utf-8') as f:
    data_source = pandas.read_json(f, lines=True)

# Select only SSEC corpus
data_source = data_source.loc[(data_source['source'] == "ssec")]
# Convert emotions dict to columns
data_source = data_source.join(data_source['emotions'].apply(pandas.Series))
# Remove all columns except for the text and emotion labels
data_source = data_source.drop(columns=['source', 'VAD', 'split', 'domain', 'labeled', 'optional', 'annotation_procedure',
                      'emotion_model', 'emotions', 'love', 'noemo', 'confusion', 'shame', 'guilt'])


## Preprocessing
The texts are tokenized, shortened to their lemma and stopwords are removed. We do not remove 'not', as it could invert an emotion if standing before an emotion indicator).

In [212]:
def preprocessing(df: pandas.DataFrame):
    toks = list()

    for index, row in df.iterrows():
        # Preparation of stopwords
        stops = set(nltk.corpus.stopwords.words("english"))
        stops.remove("not")
        for char in {"#", "@", "semst", "&", "!", "§", "$", "%", "/", "=", "?", ".", ",", ";", ":", "-", "<", ">", "+", "~", "'", "''", '"', "(",")"}:
            stops.add(char)
        # Tokenization and stemming with NLTK
        stemmer = nltk.stem.LancasterStemmer()
        tokens = [stemmer.stem(word).lower() for word in nltk.tokenize.word_tokenize(df["text"][index])]
        tokens = [word for word in tokens if word not in stops]
        toks.append(tokens)
    # Replacing the raw text with the processed version
    toks = pandas.DataFrame({"text": toks}, index=df["id"])
    return df.assign(text = toks["text"])

data_source = preprocessing(data_source)

### Feature extraction
To complete the preprocessing, the now tokenized texts must be converted into a feature set for each multi-label annotation. Each feature set is comprised of the individual tokens as bigrams, with a one padding dummy on each end.

In [222]:
# Function to set the truth values of occuring words to True
def features(corpus):
    # Output: dictionary of all feature set lists for each emotion
    # {emotion: [(fs1, label), (fs2,label),...],...} with (fs, l)= ({word:True/False, word...}, 1/0)
    out_features = {emotion : list() for emotion in ["joy", "sadness", "trust", "disgust", "anger", "fear", "surprise", "anticipation"]}
    for index in range(1,100):
        # Populating the feature sets as dictionaries with truth values for all words in the training data
        feature_dicts = {word: False for text in data_source["text"] for word in text}
        for word in corpus["text"][index]:
            feature_dicts[word] = True
        for emotion in out_features:
            if corpus[emotion][index] == 1:
                    out_features[emotion].append((feature_dicts, 1))
            else:
                out_features[emotion].append((feature_dicts, 0))
    return out_features


## Training
To train the model, Naive Bayes is used, which is normally used for single label learning.
Therefore, the model is adjusted to learn and compute the probabilities for each emotion separately.
That way, to annotate a text, the classifier for each emotion makes a pass over the text (8 passes).

In [214]:
class MultiLabel:

    def train(self, train_data):
        self.train_set = train_data

        self.joy = nltk.classify.NaiveBayesClassifier.train(train_data["joy"])
        self.sadness = nltk.classify.NaiveBayesClassifier.train(train_data["sadness"])
        self.trust = nltk.classify.NaiveBayesClassifier.train(train_data["trust"])
        self.disgust = nltk.classify.NaiveBayesClassifier.train(train_data["disgust"])
        self.anger = nltk.classify.NaiveBayesClassifier.train(train_data["anger"])
        self.fear = nltk.classify.NaiveBayesClassifier.train(train_data["fear"])
        self.surprise = nltk.classify.NaiveBayesClassifier.train(train_data["surprise"])
        self.anticipation = nltk.classify.NaiveBayesClassifier.train(train_data["anticipation"])

        self.emos = {"joy": self.joy,
                "sadness": self.sadness,
                "trust": self.trust,
                "disgust": self.disgust,
                "anger": self.anger,
                "fear": self.fear,
                "surprise": self.surprise,
                "anticipation": self.anticipation
                }
    def accuracy(self, test_set):
        acc = {
        "joy" : nltk.classify.accuracy(self.joy, test_set["joy"]),
        "sadness" : nltk.classify.accuracy(self.sadness, test_set["sadness"]),
        "trust" : nltk.classify.accuracy(self.trust, test_set["trust"]),
        "disgust" : nltk.classify.accuracy(self.disgust, test_set["disgust"]),
        "anger" : nltk.classify.accuracy(self.anger, test_set["anger"]),
        "fear" : nltk.classify.accuracy(self.fear, test_set["fear"]),
        "surprise" : nltk.classify.accuracy(self.surprise, test_set["surprise"]),
        "anticipation" : nltk.classify.accuracy(self.anticipation, test_set["anticipation"])
            }
        acc["avg"] = (acc["joy"] + acc["sadness"] + acc["trust"] + acc["disgust"] + acc["anger"] + acc["fear"] + acc["surprise"] + acc["anticipation"])/8

        return acc

    def most_informative_feats(self, n=10):
        return {
            "joy" : self.joy.most_informative_features(n),
            "sadness" : self.sadness.most_informative_features(n),
            "trust" : self.trust.most_informative_features(n),
            "disgust" : self.disgust.most_informative_features(n),
            "anger" : self.anger.most_informative_features(n),
            "fear" : self.fear.most_informative_features(n),
            "surprise" : self.surprise.most_informative_features(n),
            "anticipation" : self.anticipation.most_informative_features(n),
        }

    def eval(self, test_set, most_informative_features=True):
        acc = self.accuracy(test_set)

        for emotion in self.emos:
            print(emotion.title() + f":\nAccuracy: {round(acc[emotion], 3)}")
            if most_informative_features:
                self.emos[emotion].show_most_informative_features(n=10)
            print(f"{'_':_<20}")

        def extract_labels(corpus):
            out_lab = list()
            for emotion in corpus:
                single_label = list()
                for i in range(len(corpus[emotion])):
                    if corpus[emotion][i] == 1:
                        single_label.append([1])
                    else: single_label.append([0])
                if not len(out_lab):
                    out_lab.append(single_label)
                else:
                    for i in range(len(out_lab)):
                        out_lab[i].append(single_label[i])
            return out_lab
        
        labels_gold = extract_labels(test_set)
        labels_pred = list()
        for (fs, label) in test_set["joy"]:
            label_set = list()
            for emotion in ["joy", "sadness", "trust", "disgust", "anger", "fear", "surprise", "anticipation"]:
                label_set.append(self.emos[emotion].classify(fs))
            labels_pred.append(label_set)
        accum = 0
        for i in range(len(labels_gold)):
            single = 0
            for j in range(8):
                if labels_gold[i][j]==labels_pred[i][j]:
                    single += 1
            accum += single/8
        acc["accum"] = accum/len(labels_gold)
        print("Accumulated accuracy:\t", round(acc["accum"], 3), "\nMulti-label confusion matrix", sep="")
        #print(mcm(labels_gold, labels_pred, labels=["joy", "sadness", "trust", "disgust", "anger", "fear", "surprise", "anticipation"]))

### Splitting the data
To train the model, 80% of the SSEC corpus is used while the remaining 20% are held out for evaluation

In [215]:
# Splitting the corpus into training and testing data in a 8/2 ratio
split = int(data_source.shape[0]*0.8)
corpus_train = data_source.loc[data_source["id"] <= 21051+split]
corpus_test = data_source.loc[data_source["id"] > 21051+split]

# Dictionary for the training feature sets for all emotions:
train_features = features(corpus_train)

ssec_trained = MultiLabel()
ssec_trained.train(train_features)

## Evaluation
The trained tagger is evaluated first on a held back part of the corpus. As part of the evaluation,
the accuracy and the most informative features for each label are calculated.

In [216]:
test_features = features(corpus_test)
ssec_trained.eval(test_features)


Joy
Most Informative Features
                 lovewin = True                1 : 0      =     23.0 : 1.0
                    amaz = True                1 : 0      =     18.7 : 1.0
                   thank = True                1 : 0      =     16.2 : 1.0
   makeamericagreatagain = True                1 : 0      =     12.2 : 1.0
                     lov = True                1 : 0      =     11.6 : 1.0
                  awesom = True                1 : 0      =     10.0 : 1.0
                   dream = True                1 : 0      =     10.0 : 1.0
                   enjoy = True                1 : 0      =     10.0 : 1.0
                    stat = True                0 : 1      =      8.9 : 1.0
                  effect = True                0 : 1      =      8.4 : 1.0
:
Accuracy: 0.728
____________________
Sadness
Most Informative Features
                  victim = True                1 : 0      =      9.0 : 1.0
                    sham = True                1 : 0      =      8.5 : 1

ValueError: Found input variables with inconsistent numbers of samples: [1, 973]

After evaluating the classifier's performance on data similar to the training data, the same classifier is evaluated
on our own annotated corpus. Before evaluation, we can already predict a bad performance for anticipation, as the tag
is not part of the training data and therefore will never be annotated.

In [223]:
with open("on-off_average_annotation.csv", encoding="utf-8") as f:
    reddit_data = pandas.read_csv(f, sep=";", header=0)

reddit_data = preprocessing(reddit_data)
reddit_features = features(reddit_data)
ssec_trained.eval(reddit_features, False)

Joy
:
Accuracy: 0.434
____________________
Sadness
:
Accuracy: 0.293
____________________
Trust
:
Accuracy: 0.374
____________________
Disgust
:
Accuracy: 0.515
____________________
Anger
:
Accuracy: 0.333
____________________
Fear
:
Accuracy: 0.657
____________________
Surprise
:
Accuracy: 0.606
____________________
Anticipation
:
Accuracy: 0.98
____________________
Accumulated accuracy:	0.0
Multi-label confusion matrix


ValueError: Found input variables with inconsistent numbers of samples: [1, 99]