In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import re

import warnings
# Notebook-wide settings.
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/syntax warnings raised by later cells.
warnings.filterwarnings('ignore')
# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Use matplotlib's built-in 'fivethirtyeight' style sheet for all figures.
plt.style.use('fivethirtyeight')

In [2]:
# Read the three goemotions CSV parts and stack them into a single frame.
shard_paths = (
    '../data/full_dataset/goemotions_1.csv',
    '../data/full_dataset/goemotions_2.csv',
    '../data/full_dataset/goemotions_3.csv',
)
df1, df2, df3 = (pd.read_csv(shard_path) for shard_path in shard_paths)

# ignore_index=True gives the combined frame a fresh 0..n-1 row index.
df = pd.concat([df1, df2, df3], ignore_index=True)

In [3]:
# Columns used for per-id label aggregation: the id key followed by every
# label column.
useful_cols = [
    'id',
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]

In [4]:
# How many rows are flagged as very unclear vs. not.
df['example_very_unclear'].value_counts()

example_very_unclear
False    207814
True       3411
Name: count, dtype: int64

In [5]:
# Headline counts for the combined annotation table.
print('No of tweet ids:', df['id'].nunique())
print('No of unique rater ids:', df['rater_id'].nunique())
print('No of unique tweets:', df['text'].nunique())
unclear_share = np.round((df['example_very_unclear'].sum() * 100 / len(df)), 2)
print('Unclear/ Difficult to label tweets:', unclear_share, '%')

# Sum every label column per id. Assuming the label columns are 0/1
# indicators, each cell is then the number of rows for that id that set the label.
aggregated = df[useful_cols].groupby('id').sum()

# An id counts when at least one label reaches the given per-id total.
reaches_two = aggregated.ge(2).any(axis=1)
reaches_three = aggregated.ge(3).any(axis=1)
raters_2 = reaches_two.sum()
raters_3 = reaches_three.sum()

print("Number of tweets where at least 2+ raters agree upon 1 label:", raters_2)
print("Number of tweets where at least 3+ raters agree upon 1 label:", raters_3)

# Percentage of ids by number of distinct raters.
raters_per_id = df.groupby('id')['rater_id'].nunique()
prop = raters_per_id.value_counts(normalize=True).mul(100)
print("\nNo of raters per tweet (id): \n", prop)

No of tweet ids: 58011
No of unique rater ids: 82
No of unique tweets: 57732
Unclear/ Difficult to label tweets: 1.61 %
Number of tweets where at least 2+ raters agree upon 1 label: 54263
Number of tweets where at least 3+ raters agree upon 1 label: 17763

No of raters per tweet (id): 
 rater_id
3    64.358484
5    30.873455
4     3.626898
2     1.020496
1     0.120667
Name: proportion, dtype: float64


In [6]:
# Label columns, in the order they are kept in the cleaned frame.
emotion_cols = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]

# Final column order: identifying columns first, then every label column.
reset_cols = ['id', 'text', *emotion_cols]

In [7]:
# Punctuation whose immediate repeats are collapsed to one occurrence.
# The backslash is written as an explicit `\\` escape so the literal is valid
# on every Python version (a bare `\,` is an invalid escape sequence).
_COLLAPSE_PUNCT = '!-{}\\,<>./?@#$%^&*_~`|()'

# Less frequent / noisy punctuation that is removed from the text entirely.
_REMOVE_PUNCT = '{}\\<>/@#$%^&*_~`|()'

# One punctuation mark, optional whitespace, then any further copies of that
# same mark. Compiled once instead of on every call.
_REPEATED_PUNCT_RE = re.compile(r'([' + re.escape(_COLLAPSE_PUNCT) + r'])\s*\1*')
_WHITESPACE_RE = re.compile(r'\s+')
_REMOVE_TABLE = str.maketrans('', '', _REMOVE_PUNCT)


def first_preprocessing(text):
    """Normalise punctuation and whitespace in a single text.

    Steps, in order:
      1. Every mark in ``_COLLAPSE_PUNCT`` (together with whitespace and
         repeats of the same mark directly after it) is replaced by that mark
         followed by one space, e.g. ``'wow!!!'`` -> ``'wow! '``.
      2. Every character in ``_REMOVE_PUNCT`` is deleted.
      3. Whitespace runs are squeezed to one space and the ends are stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).

    Raises:
        TypeError: If ``text`` is not a string (raised by ``re``).

    Note:
        Step 1 puts a space after *every* listed mark, so ``'a-b'`` becomes
        ``'a- b'``. Repeats separated by whitespace are only partly collapsed
        (``'! ! !'`` -> ``'! !'``); this existing behaviour is kept as is.
    """
    cleaned_text = _REPEATED_PUNCT_RE.sub(r'\1 ', text)
    cleaned_text = cleaned_text.translate(_REMOVE_TABLE)
    # strip() already removes any trailing space, so no further check is needed.
    return _WHITESPACE_RE.sub(' ', cleaned_text).strip()

In [8]:
# Drop rows flagged as very unclear, then keep one (id, text) row per id.
# Boolean filtering already returns a new frame, so `df` itself is untouched.
new_df = df[df['example_very_unclear'] == False].reset_index(drop=True)
new_df = new_df.loc[~new_df['id'].duplicated(), ['id', 'text']]

# A label is kept for an id when its per-id total over all rows of `df`
# is at least 2.
label_agreement = df[useful_cols].groupby('id').sum() >= 2
new_df = pd.merge(label_agreement, new_df, on='id')

# Discard ids for which no label reached that threshold.
new_df = new_df[new_df.drop(columns=['text', 'id']).sum(axis=1) >= 1]

# Explicit copy so the column assignments below act on an independent frame.
new_df = new_df[reset_cols].copy()
new_df['text'] = new_df['text'].apply(first_preprocessing)
new_df[emotion_cols] = new_df[emotion_cols].astype(int)

# After cleaning, several ids can share the same text; split those off.
is_repeated_text = new_df.duplicated('text', keep=False)
df1 = new_df[~is_repeated_text].reset_index(drop=True).drop(columns=['id'])

# Merge rows sharing a text: a label survives when at least 2 of those rows
# carry it. The comparison yields booleans, so cast back to int; otherwise
# concatenating with the int-valued `df1` gives mixed int/bool label columns
# (and a CSV mixing 0/1 with True/False).
df2 = new_df[is_repeated_text].drop(columns=['id'])
df2 = (df2.groupby('text').sum() >= 2).astype(int).reset_index()

# NOTE(review): a merged text whose rows share no label ends up with all
# labels 0, although single-id rows were required to have at least one label
# above. Confirm whether such rows should be dropped.
final_df = pd.concat([df1, df2], ignore_index=True)
print('No of examples after preprocessing:', len(final_df))

No of examples after preprocessing: 53951


In [9]:
# Number of whitespace-separated words in each cleaned text.
word_counts = [len(text.split()) for text in final_df['text']]

# Row positions of very short (< 3 words) and very long (> 30 words) texts.
l_index = [pos for pos, n_words in enumerate(word_counts) if n_words < 3]
u_index = [pos for pos, n_words in enumerate(word_counts) if n_words > 30]

print('No of texts with less than 3 words:', len(l_index))
print('No of texts with more than 30 words:', len(u_index))

No of texts with less than 3 words: 1117
No of texts with more than 30 words: 9


In [14]:
# Write the cleaned dataset to disk; index=False leaves the row index out.
cleaned_csv_path = '../data/cleaned_data.csv'
final_df.to_csv(cleaned_csv_path, index=False)