# Overview
 - The GoEmotions dataset provides a large, manually annotated, dataset for fine-grained emotion prediction.
 - The dataset is curated by Google high coverage of the emotions expressed in Reddit comments.
 - The Dataset contains 28 different Emotions

### Find more at this [Blog](https://blog.research.google/2021/10/goemotions-dataset-for-fine-grained.html)

In [None]:
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [None]:
!pip install emoji
!pip install cleantext
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24


### 1. Loading important Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
from bs4 import BeautifulSoup
import os
import nltk
import re
import string
import json
from cleantext import clean
import emoji
import string
import seaborn as sns
from tqdm import tqdm
import contractions

### 2. Loading the Data

- Use the Hugging face `Datasets` library to get it in 3 splits ( `train`, `test`, `validation` )
- Use `kaggle` accounts to get the data from [here](https://www.kaggle.com/datasets/shivamb/go-emotions-google-emotions-dataset)

The data can be found [here](https://huggingface.co/datasets/go_emotions)

In [44]:
train = pd.read_csv("/content/drive/MyDrive/NLP/Datasets/go_train.csv").drop(['id'], axis = 1)
test  = pd.read_csv("/content/drive/MyDrive/NLP/Datasets/go_test.csv").drop(['id'], axis = 1)
val  = pd.read_csv("/content/drive/MyDrive/NLP/Datasets/go_val.csv").drop(['id'], axis = 1)

In [None]:
import operator
def check_coverage(vocab,embeddings_index):
    a = {}
    oov = {}
    k = 0
    i = 0
    for word in tqdm(vocab):
        try:
            a[word] = embeddings_index[word]
            k += vocab[word]
        except:

            oov[word] = vocab[word]
            i += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]
    return sorted_x

def build_vocab(sentences, verbose =  True):
    vocab = {}
    for sentence in tqdm(sentences, disable = (not verbose)):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')


def load_embeddings(path):
    with open(path, encoding="utf-8") as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in f)


def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            pass
    return embedding_matrix

def remove_emojis(text: str) -> str:
    return ''.join(c for c in text if c not in emoji.EMOJI_DATA)

glove_embeddings = load_embeddings(r"/content/drive/MyDrive/NLP/resources/glove.840B.300d.txt")

white_list = string.ascii_letters + string.digits + ' '
white_list += "'-"
glove_chars = ''.join([c for c in tqdm(glove_embeddings) if len(c) == 1])
glove_symbols = ''.join([c for c in glove_chars if not c in white_list])
review_chars = build_vocab(list(train.text))
review_symbols = ''.join([c for c in review_chars if not c in white_list])
symbols_to_delete = ''.join([c for c in review_symbols if not c in glove_symbols])


100%|██████████| 2196007/2196007 [00:00<00:00, 3191111.09it/s]
100%|██████████| 43410/43410 [00:00<00:00, 127902.25it/s]


In [33]:
vocab = build_vocab(list(train.text.apply(lambda x:x.split())))
oov = check_coverage(vocab,glove_embeddings)
oov[:10]

100%|██████████| 43410/43410 [00:00<00:00, 173318.51it/s]
100%|██████████| 56729/56729 [00:00<00:00, 478353.30it/s]


Found embeddings for 52.68% of vocab
Found embeddings for  88.28% of all text


[('[NAME]', 5743),
 ('I’m', 934),
 ('[NAME].', 786),
 ("That's", 587),
 ('don’t', 563),
 ('[NAME],', 510),
 ('it’s', 476),
 ('It’s', 339),
 ('That’s', 335),
 ("isn't", 283)]

In [48]:
punc = '!?"#$%&\'()*+,-./:;<=>@[\\]^_`{|}~'
def preprocessing1(cleantext):
    cleantext = cleantext.lower()
    cleantext = cleantext.replace("[NAME]", "X").replace("[name]", "X").replace(".", "").replace(",", "").replace('"', "").replace("!!", "!").replace("“", "").replace("”", "").replace("remindme", "remind me").replace("altright", "alright")
    cleantext = contractions.fix(cleantext)
    cleantext = "".join([ cleantext[i] for i in range(len(cleantext)) if cleantext[i] not in symbols_to_delete])
    clean_text = remove_emojis(cleantext)
    cleantext = cleantext.translate(str.maketrans("", "", punc))
    return cleantext.split()

def preprocessing2(cleantext):
    cleantext = cleantext.lower()
    cleantext = cleantext.replace("[NAME]", "X").replace("[name]", "X").replace(".", "").replace(",", "").replace('"', "").replace("!!", "!").replace("“", "").replace("”", "").replace("remindme", "remind me").replace("altright", "alright")
    cleantext = contractions.fix(cleantext)
    cleantext = "".join([ cleantext[i] for i in range(len(cleantext)) if cleantext[i] not in symbols_to_delete])
    clean_text = remove_emojis(cleantext)
    cleantext = cleantext.translate(str.maketrans("", "", punc))
    return cleantext

vocab = build_vocab(list(train.text.apply(preprocessing1)))
oov = check_coverage(vocab,glove_embeddings)
oov[:10]

100%|██████████| 43410/43410 [00:00<00:00, 221100.26it/s]
100%|██████████| 28657/28657 [00:00<00:00, 479130.72it/s]


Found embeddings for 85.23% of vocab
Found embeddings for  99.16% of all text


[('incels', 21),
 ('incel', 21),
 ('brexit', 19),
 ('fortnite', 19),
 ('🤔', 19),
 ('people’s', 15),
 ('altright', 14),
 ('shitposting', 11),
 ('😁', 11),
 ('͡°', 10)]

In [49]:
train.text = train.text.apply(preprocessing2)

In [50]:
vocab = build_vocab(list(test.text.apply(lambda x:x.split())))
oov = check_coverage(vocab,glove_embeddings)
oov[:10]

100%|██████████| 5427/5427 [00:00<00:00, 151773.19it/s]
100%|██████████| 14019/14019 [00:00<00:00, 427978.37it/s]

Found embeddings for 67.32% of vocab
Found embeddings for  88.29% of all text





[('[NAME]', 745),
 ('I’m', 122),
 ('[NAME].', 91),
 ('it’s', 64),
 ("That's", 62),
 ('don’t', 62),
 ('[NAME],', 57),
 ('It’s', 55),
 ("he's", 40),
 ('you!', 39)]

In [52]:
white_list = string.ascii_letters + string.digits + ' '
white_list += "'-"
glove_chars = ''.join([c for c in tqdm(glove_embeddings) if len(c) == 1])
glove_symbols = ''.join([c for c in glove_chars if not c in white_list])
review_chars = build_vocab(list(test.text))
review_symbols = ''.join([c for c in review_chars if not c in white_list])
symbols_to_delete = ''.join([c for c in review_symbols if not c in glove_symbols])
def preprocessing1(text):
    cleantext = text.lower()
    x = cleantext.replace("[NAME]", "X").replace("[name]", "X").replace(".", "").replace(",", "").replace('"', "").replace("!!", "!").replace("“", "").replace("”", "").replace("remindme", "remind me").replace("altright", "alright")
    cleantext = "".join([ x[i] for i in range(len(x)) if x[i] not in symbols_to_delete])
    cleantext = contractions.fix(cleantext)
    cleantext = cleantext.translate(str.maketrans("", "", punc))
    return cleantext.split()

def preprocessing2(text):
    cleantext = text.lower()
    x = cleantext.replace("[NAME]", "X").replace("[name]", "X").replace(".", "").replace(",", "").replace('"', "").replace("!!", "!").replace("“", "").replace("”", "").replace("remindme", "remind me").replace("altright", "alright")
    cleantext = "".join([ x[i] for i in range(len(x)) if x[i] not in symbols_to_delete])
    cleantext = contractions.fix(cleantext)
    cleantext = cleantext.translate(str.maketrans("", "", punc))
    return cleantext

vocab = build_vocab(list(test.text.apply(preprocessing1)))
oov = check_coverage(vocab,glove_embeddings)
oov[:10]

100%|██████████| 2196007/2196007 [00:00<00:00, 3367473.75it/s]
100%|██████████| 5427/5427 [00:00<00:00, 124359.36it/s]
100%|██████████| 5427/5427 [00:00<00:00, 171444.30it/s]
100%|██████████| 8706/8706 [00:00<00:00, 320959.92it/s]

Found embeddings for 94.02% of vocab
Found embeddings for  99.23% of all text





[('brexit', 7),
 ('incel', 4),
 ('rfunny', 3),
 ('fortnite', 3),
 ('people’s', 3),
 ('incels', 3),
 ('lightspamming', 2),
 ('hbomb', 2),
 ('pwbpd', 2),
 ('citycounty', 2)]

In [53]:
test.text = test.text.apply(preprocessing2)

In [54]:
vocab = build_vocab(list(val.text.apply(lambda x:x.split())))
oov = check_coverage(vocab,glove_embeddings)
oov[:10]

100%|██████████| 5426/5426 [00:00<00:00, 134971.14it/s]
100%|██████████| 14240/14240 [00:00<00:00, 389039.43it/s]

Found embeddings for 66.91% of vocab
Found embeddings for  88.31% of all text





[('[NAME]', 711),
 ('I’m', 119),
 ('[NAME].', 96),
 ("That's", 76),
 ('it’s', 65),
 ('[NAME],', 65),
 ('It’s', 56),
 ('don’t', 56),
 ('That’s', 44),
 ("You're", 43)]

In [55]:
white_list = string.ascii_letters + string.digits + ' '
white_list += "'-"
glove_chars = ''.join([c for c in tqdm(glove_embeddings) if len(c) == 1])
glove_symbols = ''.join([c for c in glove_chars if not c in white_list])
review_chars = build_vocab(list(val.text))
review_symbols = ''.join([c for c in review_chars if not c in white_list])
symbols_to_delete = ''.join([c for c in review_symbols if not c in glove_symbols])
def preprocessing1(text):
    cleantext = text.lower()
    x = cleantext.replace("[NAME]", "X").replace("[name]", "X").replace(".", "").replace(",", "").replace('"', "").replace("!!", "!").replace("“", "").replace("”", "").replace("remindme", "remind me").replace("altright", "alright")
    cleantext = "".join([ x[i] for i in range(len(x)) if x[i] not in symbols_to_delete])
    cleantext = contractions.fix(cleantext)
    cleantext = cleantext.translate(str.maketrans("", "", punc))
    return cleantext.split()

def preprocessing2(text):
    cleantext = text.lower()
    x = cleantext.replace("[NAME]", "X").replace("[name]", "X").replace(".", "").replace(",", "").replace('"', "").replace("!!", "!").replace("“", "").replace("”", "").replace("remindme", "remind me").replace("altright", "alright")
    cleantext = "".join([ x[i] for i in range(len(x)) if x[i] not in symbols_to_delete])
    cleantext = contractions.fix(cleantext)
    cleantext = cleantext.translate(str.maketrans("", "", punc))
    return cleantext

vocab = build_vocab(list(val.text.apply(preprocessing1)))
oov = check_coverage(vocab,glove_embeddings)
oov[:10]

100%|██████████| 2196007/2196007 [00:01<00:00, 1279031.42it/s]
100%|██████████| 5426/5426 [00:00<00:00, 55300.19it/s]
100%|██████████| 5426/5426 [00:00<00:00, 151405.02it/s]
100%|██████████| 8890/8890 [00:00<00:00, 311829.82it/s]

Found embeddings for 93.98% of vocab
Found embeddings for  99.22% of all text





[('chiiiiika', 4),
 ('90df', 3),
 ('shitpost', 3),
 ('acopia', 2),
 ('laop', 2),
 ('tlj', 2),
 ('one’s', 2),
 ('juul', 2),
 ('incel', 2),
 ('thibs', 2)]

In [56]:
val.text = val.text.apply(preprocessing2)

In [57]:
def labelPreprocess(text):
    text = text.replace("[ ", "").replace("[", "").replace("]", "").split(" ")[0]
    return int(text)

In [58]:
train.labels = train.labels.apply(labelPreprocess)
test.labels = test.labels.apply(labelPreprocess)
val.labels = val.labels.apply(labelPreprocess)

In [59]:
np.savez_compressed("/content/drive/MyDrive/NLP/Datasets/goemotion_test_text_processed", test.text.to_numpy())
np.savez_compressed("/content/drive/MyDrive/NLP/Datasets/goemotion_train_text_processed", train.text.to_numpy())
np.savez_compressed("/content/drive/MyDrive/NLP/Datasets/goemotion_val_text_processed", val.text.to_numpy())
np.savez_compressed("/content/drive/MyDrive/NLP/Datasets/goemotion_test_labels", test.labels.to_numpy())
np.savez_compressed("/content/drive/MyDrive/NLP/Datasets/goemotion_train_labels", train.labels.to_numpy())
np.savez_compressed("/content/drive/MyDrive/NLP/Datasets/goemotion_val_labels", val.labels.to_numpy())