In [1]:
import collections
import contractions
import nltk
import random
import re

import numpy as np
import pandas as pd

from IPython.display import display
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from wordcloud import STOPWORDS

rng = np.random.RandomState(seed=42)

In [2]:
data = pd.read_csv("Data/tweet_emotions.csv", sep=";")
data_preprocessed = data.copy()

display(data.head(15))

Unnamed: 0,content,sentiment
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
5,ive been feeling a little burdened lately wasn...,sadness
6,ive been taking or milligrams or times recomme...,surprise
7,i feel as confused about life as a teenager or...,fear
8,i have been with petronas for years i feel tha...,joy
9,i feel romantic too,love


In [3]:
data_preprocessed["content"] = data_preprocessed["content"].apply(lambda x: contractions.fix(x))
data_preprocessed["content"] = data_preprocessed["content"].apply(lambda x: re.sub(r"[^a-z\s]+", "", x))

display(data_preprocessed.head(15))

Unnamed: 0,content,sentiment
0,i did not feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,i am grabbing a minute to post i feel greedy w...,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
5,i have been feeling a little burdened lately w...,sadness
6,i have been taking or milligrams or times reco...,surprise
7,i feel as confused about life as a teenager or...,fear
8,i have been with petronas for years i feel tha...,joy
9,i feel romantic too,love


In [4]:
stop_words = STOPWORDS.union(stopwords.words("english")).union(ENGLISH_STOP_WORDS)

data_preprocessed["content"] = data_preprocessed["content"].apply(lambda x: nltk.word_tokenize(x))
data_preprocessed["content"] = data_preprocessed["content"].apply(lambda x: [token for token in x if token not in stop_words])

display(data_preprocessed.head(15))

Unnamed: 0,content,sentiment
0,"[feel, humiliated]",sadness
1,"[feeling, hopeless, damned, hopeful, cares, aw...",sadness
2,"[grabbing, minute, post, feel, greedy, wrong]",anger
3,"[feeling, nostalgic, fireplace, know, property]",love
4,"[feeling, grouchy]",anger
5,"[feeling, little, burdened, lately, sure]",sadness
6,"[taking, milligrams, times, recommended, falle...",surprise
7,"[feel, confused, life, teenager, jaded, year, ...",fear
8,"[petronas, years, feel, petronas, performed, h...",joy
9,"[feel, romantic]",love


In [5]:
wnl = WordNetLemmatizer()

data_preprocessed["content"] = data_preprocessed["content"].apply(lambda x: [wnl.lemmatize(token, pos="v") for token in x])
data_preprocessed["content"] = data_preprocessed["content"].apply(lambda x: [token for token in x if "feel" not in token])

display(data_preprocessed.head(15))

Unnamed: 0,content,sentiment
0,[humiliate],sadness
1,"[hopeless, damn, hopeful, care, awake]",sadness
2,"[grab, minute, post, greedy, wrong]",anger
3,"[nostalgic, fireplace, know, property]",love
4,[grouchy],anger
5,"[little, burden, lately, sure]",sadness
6,"[take, milligrams, time, recommend, fall, asle...",surprise
7,"[confuse, life, teenager, jade, year, old, man]",fear
8,"[petronas, years, petronas, perform, huge, pro...",joy
9,[romantic],love


In [6]:
data_preprocessed["content"] = data_preprocessed["content"].apply(lambda x: x if len(x) > 1 else np.nan)
data_preprocessed = data_preprocessed.dropna(subset=["content"])

display(data_preprocessed.head(15))

Unnamed: 0,content,sentiment
1,"[hopeless, damn, hopeful, care, awake]",sadness
2,"[grab, minute, post, greedy, wrong]",anger
3,"[nostalgic, fireplace, know, property]",love
5,"[little, burden, lately, sure]",sadness
6,"[take, milligrams, time, recommend, fall, asle...",surprise
7,"[confuse, life, teenager, jade, year, old, man]",fear
8,"[petronas, years, petronas, perform, huge, pro...",joy
10,"[make, suffer, see, mean]",sadness
11,"[run, divine, experience, expect, type, spirit...",joy
12,"[think, easiest, time, year, dissatisfy]",anger


In [7]:
def find_synonym(word):
    synonyms = [lemma.name() for synset in wordnet.synsets(word) for lemma in synset.lemmas()]
    return synonyms

synonym_dict = {word: tuple(set(syn.casefold() for syn in find_synonym(word))) for word in data_preprocessed["content"].sum()}
synonym_dict = {key: value for key, value in synonym_dict.items() if value}
value_occurrences = collections.Counter(synonym_dict.values())
synonym_dict = {key: value for key, value in synonym_dict.items() if value_occurrences[value] == 1}

display(data_preprocessed["sentiment"].value_counts())
display(list(synonym_dict.items())[:5])

joy         6498
sadness     5515
anger       2575
fear        2288
love        1591
surprise     703
Name: sentiment, dtype: int64

[('hopeless', ('hopeless',)),
 ('damn',
  ('hoot',
   'infernal',
   'darn',
   'beshrew',
   "tinker's_damn",
   'anathemise',
   'bloody',
   'darned',
   'goddam',
   'imprecate',
   'damned',
   'goddamn',
   'all-fired',
   'curse',
   'bedamn',
   'shit',
   'blame',
   'anathemize',
   'blasted',
   'maledict',
   'damn',
   "tinker's_dam",
   'blessed',
   'deuced',
   'shucks',
   'goddamned',
   'blamed',
   'red_cent')),
 ('hopeful',
  ('hopeful',
   'bright',
   'promising',
   'wannabe',
   'aspirant',
   'wannabee',
   'aspirer')),
 ('care',
  ('tending',
   'maintenance',
   'deal',
   'precaution',
   'care',
   'upkeep',
   'handle',
   'give_care',
   'tutelage',
   'caution',
   'aid',
   'fear',
   'manage',
   'charge',
   'like',
   'worry',
   'guardianship',
   'forethought',
   'concern',
   'attention',
   'wish')),
 ('awake',
  ('alive',
   'wake',
   'arouse',
   'awaken',
   'wake_up',
   'come_alive',
   'waken',
   'alert',
   'awake'))]

In [8]:
def data_augmentation(tokens, dictionary):
    tokens = [random.choice(dictionary.get(token, [token])) for token in tokens]
    return tokens

label_count = data_preprocessed["sentiment"].value_counts().to_dict()
max_label_count = max(label_count.values())
data_augmented = pd.DataFrame()

for label, count in label_count.items():
    count_diff = max_label_count - count
    diff_mult = int(np.ceil(count_diff / count))
    if diff_mult:
        data_labeled = data_preprocessed[data_preprocessed["sentiment"] == label]
        data_aug = pd.concat([data_labeled] * diff_mult, ignore_index=True)
        random_sequence = rng.permutation(len(data_aug))[:count_diff]
        data_aug = data_aug.take(random_sequence)
        data_aug["content"] = data_aug["content"].apply(lambda x: data_augmentation(x, synonym_dict))
        data_augmented = pd.concat([data_augmented, data_labeled, data_aug], ignore_index=True)
    else:
        data_augmented = pd.concat([data_augmented, data_preprocessed[data_preprocessed["sentiment"] == label]], ignore_index=True)

display(data_augmented["sentiment"].value_counts())
display(data_augmented.head(15))

joy         6498
sadness     6498
anger       6498
fear        6498
love        6498
surprise    6498
Name: sentiment, dtype: int64

Unnamed: 0,content,sentiment
0,"[petronas, years, petronas, perform, huge, pro...",joy
1,"[run, divine, experience, expect, type, spirit...",joy
2,"[immense, sympathy, general, point, possible, ...",joy
3,"[reassure, anxiety]",joy
4,"[amuse, delight]",joy
5,"[able, help, chai, lifeline, support, encourag...",joy
6,"[superior, dead, chicken, grieve, child]",joy
7,"[giddy, elegant, perfectly, fit, pencil, skirt]",joy
8,"[imagine, real, life, scenario, emotionally, c...",joy
9,"[sure, make, content]",joy


In [9]:
data_augmented["content"] = data_augmented["content"].apply(lambda x: " ".join(x))
data_augmented.to_csv("Data/tweet_emotions_preprocessed.csv")