In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lib.dataset_utils import *
from sklearn.preprocessing import MultiLabelBinarizer
import seaborn as sns
import os
from transformers import BertTokenizer

In [2]:
train_df, val_df, test_df = load_dataset(DatasetEnum.GoEmotions, k_hot_encode=True)
label_names = train_df.columns[1:]
label_names

Index(['admiration', 'amusement', 'disapproval', 'disgust', 'embarrassment',
       'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love',
       'nervousness', 'anger', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral', 'annoyance', 'approval',
       'caring', 'confusion', 'curiosity', 'desire', 'disappointment'],
      dtype='object')

In [3]:
# Substitute user tags with [NAME]
train_df['text'] = train_df['text'].str.replace(r"(?: ^|\b)/?u/\w+", '[NAME]', regex=True)
val_df['text'] = val_df['text'].str.replace(r"(?: ^|\b)/?u/\w+", '[NAME]', regex=True)
test_df['text'] = test_df['text'].str.replace(r"(?: ^|\b)/?u/\w+", '[NAME]', regex=True)

In [4]:
# Substitute subreddit tags with [LINK]
train_df['text'] = train_df['text'].str.replace(r"(?: ^|\b)/?r\/\w+", '[LINK]', regex=True)
val_df['text'] = val_df['text'].str.replace(r"(?: ^|\b)/?r\/\w+", '[LINK]', regex=True)
test_df['text'] = test_df['text'].str.replace(r"(?: ^|\b)/?r\/\w+", '[LINK]', regex=True)

In [5]:
train_df.loc[21639]['text']

'For your kindness to mobile users I give a platinum ⠀⠀⠀⠀⠀⣤⣶⣶⡶⠦⠴⠶⠶⠶⠶⡶⠶⠦⠶⠶⠶⠶⠶⠶⠶⣄⠀⠀⠀⠀ ⠀⠀⠀⠀⠀⣿⣀⣀⣀⣀⠀⢀⣤⠄⠀⠀⣶⢤⣄⠀⠀⠀⣤⣤⣄⣿⠀⠀⠀⠀ ⠀⠀⠀⠀⠀⠿⣿⣿⣿⣿⡷⠋⠁⠀⠀⠀⠙⠢⠙⠻⣿⡿⠿⠿⠫⠋⠀⠀⠀⠀ ⠀⠀⠀⠀⠀⠀⢀⣤⠞⠉⠀⠀⠀⠀⣴⣶⣄⠀⠀⠀⢀⣕⠦⣀⠀⠀⠀⠀⠀⠀ ⠀⠀⠀⢀⣤⠾⠋⠁⠀⠀⠀⠀⢀⣼⣿⠟⢿⣆⠀⢠⡟⠉⠉⠊⠳⢤⣀⠀⠀⠀ ⠀⣠⡾⠛⠁⠀⠀⠀⠀⠀⢀⣀⣾⣿⠃⠀⡀⠹⣧⣘⠀⠀⠀⠀⠀⠀⠉⠳⢤⡀ ⠀⣿⡀⠀⠀⢠⣶⣶⣿⣿⣿⣿⡿⠁⠀⣼⠃⠀⢹⣿⣿⣿⣶⣶⣤⠀⠀⠀⢰⣷ ⠀⢿⣇⠀⠀⠈⠻⡟⠛⠋⠉⠉⠀⠀⡼⠃⠀⢠⣿⠋⠉⠉⠛⠛⠋⠀⢀⢀⣿⡏ ⠀⠘⣿⡄⠀⠀⠀⠈⠢⡀⠀⠀⠀⡼⠁⠀⢠⣿⠇⠀⠀⡀⠀⠀⠀⠀⡜⣼⡿⠀ ⠀⠀⢻⣷⠀⠀⠀⠀⠀⢸⡄⠀⢰⠃⠀⠀⣾⡟⠀⠀⠸⡇⠀⠀⠀⢰⢧⣿⠃⠀ ⠀⠀⠘⣿⣇⠀⠀⠀⠀⣿⠇⠀⠇⠀⠀⣼⠟⠀⠀⠀⠀⣇⠀⠀⢀⡟⣾⡟⠀⠀ ⠀⠀⠀⢹⣿⡄⠀⠀⠀⣿⠀⣀⣠⠴⠚⠛⠶⣤⣀⠀⠀⢻⠀⢀⡾⣹⣿⠃⠀⠀ ⠀⠀⠀⠀⢿⣷⠀⠀⠀⠙⠊⠁⠀⢠⡆⠀⠀⠀⠉⠛⠓⠋⠀⠸⢣⣿⠏⠀⠀⠀ ⠀⠀⠀⠀⠘⣿⣷⣦⣤⣤⣄⣀⣀⣿⣤⣤⣤⣤⣤⣄⣀⣀⣀⣀⣾⡟⠀⠀⠀⠀ ⠀⠀⠀⠀⠀⢹⣿⣿⣿⣻⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠁⠀⠀⠀⠀ ⠀⠀⠀⠀⠀⠀⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠛⠃'

In [6]:
# remove one ascii
train_df.loc[21639, 'text'] = train_df.loc[21639, 'text'][:51]

In [7]:
# count token count distribution using bert tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
token_counts_dict = {}
for name, dataset in zip(['train', 'val', 'test'], [train_df, val_df, test_df]):
    token_counts = []
    for text in dataset['text']:
        tokens = tokenizer.tokenize(text)
        token_counts.append(len(tokens))
    token_counts_dict[name] = token_counts



many tokens usually were due to useless repetitions of same character for > 10 times

In [8]:
# cap long sequences of same character (max 5)
train_df['text'] = train_df['text'].str.replace(r'(.)\1{5,}', r'\1\1\1\1\1', regex=True)
val_df['text'] = val_df['text'].str.replace(r'(.)\1{5,}', r'\1\1\1\1\1', regex=True)
test_df['text'] = test_df['text'].str.replace(r'(.)\1{5,}', r'\1\1\1\1\1', regex=True)

In [9]:
#get sentences with high token count
occ = train_df[[el > 55 for el in token_counts_dict['train']]]
occ

Unnamed: 0,text,admiration,amusement,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,...,sadness,surprise,neutral,annoyance,approval,caring,confusion,curiosity,desire,disappointment
13412,"ackchyually, it's *[LINK] ^^^^^I'm ^^^^^a ^^^^...",0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
20785,Oh fuuuuuck RIGHT OFF!!!!!,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26156,">It also means no applause lines. No problem, ...",0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
28694,here you go: |Games|Home|Away|Team|vs W-L 18-1...,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [10]:
train_df.loc[28694, 'text'] = train_df.loc[28694, 'text'][:37]
train_df.loc[13412, 'text'] = train_df.loc[13412, 'text'].replace('^^^^^', '')

## Saving results

In [11]:
OUT_DIR = "./dataset/GoEmotionsCleaned/"

In [12]:
# Save the cleaned dataset
if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)
train_df.to_csv(OUT_DIR + "train.tsv", sep='\t', index=False)
val_df.to_csv(OUT_DIR + "val.tsv", sep='\t', index=False)
test_df.to_csv(OUT_DIR + "test.tsv", sep='\t', index=False)