In [None]:
!pip install emoji

In [1]:
import re

import emoji
import nltk
import pandas as pd
import requests
from transformers import AutoModelForSeq2SeqLM, AutoModelWithLMHead, AutoTokenizer

# Data Uploading

In [22]:
# Remote CSV (GitHub raw link) holding the dataset to label.
url = "https://raw.githubusercontent.com/SaraHoxha/emotion-detection-txa/refs/heads/main/Data%20Preprocessing/merged_df_flitered_version.csv"
# Read the file straight from the URL into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
df['filtered_text'] = (
    df['filtered_text']
    .fillna('')    # replace nulls with blanks
    .astype(str)   # then cast every entry to str
)

# Data Cleaning

This step is specific to this model: texts longer than the maximum character limit allowed by T5, which is 510, will be removed.

In [None]:
def count_characters(text):
    """Return the length of ``text`` as reported by ``len``."""
    n_characters = len(text)
    return n_characters

# Store the length of every filtered text in its own column.
df['character_count'] = df['filtered_text'].map(count_characters)

In [5]:
# Upper bound (in characters) on the texts kept for the model.
MAX_CHARACTERS = 510

# .copy() detaches the filtered rows from the original frame, so adding columns
# to `df` afterwards does not trigger pandas' SettingWithCopyWarning.
df = df[df['character_count'] <= MAX_CHARACTERS].copy() # Drop bigger rows

df

Unnamed: 0,id,text,created_utc,likes,source,filtered_text,tokens,character_count
0,1,easy ways to stop video conference screen free...,2024-10-14 14:59:40,0,Twitter,easy way stop video conference screen freeze o...,"['easy', 'way', 'stop', 'video', 'conference',...",235
1,2,in the age of remoteworking you can work from ...,2024-10-14 14:05:20,0,Twitter,age remoteworke work need start,"['age', 'remoteworke', 'work', 'need', 'start']",92
2,3,after the pandemic of covid remote working is ...,2024-10-14 12:30:23,0,Twitter,pandemic covid remote working increase globe b...,"['pandemic', 'covid', 'remote', 'working', 'in...",258
3,4,do employee monitoring tools have any place in...,2024-10-14 12:22:40,0,Twitter,employee monitoring tool have place remote wor...,"['employee', 'monitoring', 'tool', 'have', 'pl...",209
4,5,a little paint lots of productivity and a whol...,2024-10-14 12:22:03,0,Twitter,little paint lot productivity whole new vibe c...,"['little', 'paint', 'lot', 'productivity', 'wh...",229
...,...,...,...,...,...,...,...,...
177810,177811,call center jobs are mostly remote these days,2022-06-12 04:02:05,3,Reddit,call center job remote day,"['call', 'center', 'job', 'remote', 'day']",45
177811,177812,i think data entry jobs also fit this criteria,2022-06-12 18:01:16,1,Reddit,think data entry job fit criterion,"['think', 'data', 'entry', 'job', 'fit', 'crit...",46
177812,177813,it work not all of them require a degree i nev...,2022-06-22 18:22:45,1,Reddit,work require degree get have 20 year experience,"['work', 'require', 'degree', 'get', 'have', '...",88
177813,177814,never done data entry what all does it involve?,2022-06-14 15:24:38,2,Reddit,do data entry involve question,"['do', 'data', 'entry', 'involve', 'question']",47


# Model Implementation

For this step, Google's T5 Base model fine-tuned for emotion recognition, obtained from HuggingFace (https://huggingface.co/mrm8488/t5-base-finetuned-emotion), is used to create the ground-truth emotion labels. In this case, the emotions the model can output match Parrott's emotions.

In [None]:
# Hugging Face checkpoint used for both the tokenizer and the model.
MODEL_NAME = "mrm8488/t5-base-finetuned-emotion"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# AutoModelForSeq2SeqLM is the loader for encoder-decoder models such as T5;
# it replaces the deprecated AutoModelWithLMHead.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

def get_emotion(text): # Emotions predictor
    """Predict the emotion label for a single text.

    The text gets the '</s>' marker appended, is encoded with the module-level
    ``tokenizer`` and passed to ``model.generate`` with ``max_length=2``.
    The first generated sequence is decoded and returned unchanged, so the
    label still carries the leading '<pad>' token.
    """
    encoded = tokenizer.encode(text + '</s>', return_tensors='pt')
    generated = model.generate(input_ids=encoded, max_length=2)
    # Only the first generated sequence is used as the label.
    return tokenizer.decode(generated[0])

# Label every text with the model's prediction (one get_emotion call per row).
df['predicted_label'] = df['filtered_text'].map(get_emotion)



In [25]:
# Preview the first five rows, now including the predicted_label column.
df.head(n=5)

Unnamed: 0,id,text,created_utc,likes,source,filtered_text,tokens,character_count,predicted_label
0,1,easy ways to stop video conference screen free...,2024-10-14 14:59:40,0,Twitter,easy way stop video conference screen freeze o...,"['easy', 'way', 'stop', 'video', 'conference',...",205,<pad> joy
1,2,in the age of remoteworking you can work from ...,2024-10-14 14:05:20,0,Twitter,age remoteworke work need start,"['age', 'remoteworke', 'work', 'need', 'start']",31,<pad> fear
2,3,after the pandemic of covid remote working is ...,2024-10-14 12:30:23,0,Twitter,pandemic covid remote working increase globe b...,"['pandemic', 'covid', 'remote', 'working', 'in...",182,<pad> fear
3,4,do employee monitoring tools have any place in...,2024-10-14 12:22:40,0,Twitter,employee monitoring tool have place remote wor...,"['employee', 'monitoring', 'tool', 'have', 'pl...",149,<pad> anger
4,5,a little paint lots of productivity and a whol...,2024-10-14 12:22:03,0,Twitter,little paint lot productivity whole new vibe c...,"['little', 'paint', 'lot', 'productivity', 'wh...",170,<pad> joy


In [26]:
# Frequency of each predicted label, most common first.
df.value_counts(subset='predicted_label')

predicted_label
<pad> joy            75261
<pad> anger          53628
<pad> fear           28331
<pad> sadness        13133
<pad> surprise        1826
<pad> love             810
<pad> interest         203
<pad> 100               53
<pad> remote            16
<pad> cat               12
<pad> not               10
<pad> commercial         8
<pad> tech               5
<pad> double             5
<pad> dream              4
<pad> vitamin            4
<pad> hybrid             3
<pad> treadmill          3
<pad> legal              3
<pad> bait               2
<pad> clutch             2
<pad> hunger             1
<pad> bus                1
<pad> 99                 1
<pad> union              1
<pad> tornado            1
<pad> blanket            1
<pad> blue               1
<pad> sub                1
<pad> standard           1
<pad> ski                1
<pad> silent             1
<pad> silence            1
<pad> rock               1
<pad> grape              1
<pad> candle             1
<pad> piano 

In [27]:
# Inspect the rows the model labelled as "interest" (labels still carry the '<pad>' prefix here).
df.loc[df['predicted_label'].eq('<pad> interest')]

Unnamed: 0,id,text,created_utc,likes,source,filtered_text,tokens,character_count,predicted_label
20792,20793,interested,2024-02-10 04:00:21,1,Reddit,interested,['interested'],10,<pad> interest
23121,23122,would you be interested in spaghetti tutoring?,2023-10-14 02:32:54,1,Reddit,interested spaghetti tutoring question,"['interested', 'spaghetti', 'tutoring', 'quest...",38,<pad> interest
44298,44299,interested,2024-07-15 17:30:10,0,Reddit,interested,['interested'],10,<pad> interest
46981,46982,id be incredibly interested in that,2023-04-10 13:40:55,3,Reddit,d interested,"['d', 'interested']",12,<pad> interest
47203,47204,interested,2024-06-12 11:17:29,2,Reddit,interested,['interested'],10,<pad> interest
...,...,...,...,...,...,...,...,...,...
176250,176251,for anyone who is interested in the zbd positions,2024-09-09 20:14:02,2,Reddit,interested zbd position,"['interested', 'zbd', 'position']",23,<pad> interest
176489,176490,hey i am interested,2024-04-23 21:32:56,1,Reddit,interested,['interested'],10,<pad> interest
176703,176704,i am interested,2023-10-08 09:02:48,1,Reddit,interested,['interested'],10,<pad> interest
176883,176884,i am interested,2023-01-12 05:55:34,1,Reddit,interested,['interested'],10,<pad> interest


In [29]:
# Build a new frame instead of aliasing `df`: with `df_filtered = df`, the
# replacement below would also rewrite df['predicted_label'] in place, so
# earlier cells that look for '<pad> ...' labels would stop matching on re-run.
df_filtered = df.assign(
    # Strip the leading '<pad>' token (and any whitespace after it) from each label.
    predicted_label=df['predicted_label'].str.replace(r'^<pad>\s*', '', regex=True)
)

As observed above, some labels do not correspond to real emotions: when there is not enough information to assign an emotion, the model assigns the most frequent non-stopword as the label. These observations usually do not provide emotional context, so they will be removed.

In [17]:
keywords = ['fear', 'joy', 'sadness', 'surprise', 'anger', 'love', 'interest' ] 
pattern = '|'.join(keywords)  # Alternation of the accepted emotion labels

# Keep a row only when its whole label is one of the keywords (case-insensitive).
# A substring search (str.contains) would also keep labels that merely embed a
# keyword, e.g. "enjoy" contains "joy".
is_emotion = (
    df_filtered['predicted_label']
    .str.strip()
    .str.fullmatch(pattern, case=False, na=False)
)
df_filtered = df_filtered[is_emotion]  # Retain only rows labeled as Parrott's emotions
df_filtered

Unnamed: 0,id,text,created_utc,likes,source,filtered_text,tokens,character_count,predicted_label
0,1,easy ways to stop video conference screen free...,2024-10-14 14:59:40,0,Twitter,easy way stop video conference screen freeze o...,"['easy', 'way', 'stop', 'video', 'conference',...",235,anger
1,2,in the age of remoteworking you can work from ...,2024-10-14 14:05:20,0,Twitter,age remoteworke work need start,"['age', 'remoteworke', 'work', 'need', 'start']",92,joy
2,3,after the pandemic of covid remote working is ...,2024-10-14 12:30:23,0,Twitter,pandemic covid remote working increase globe b...,"['pandemic', 'covid', 'remote', 'working', 'in...",258,fear
3,4,do employee monitoring tools have any place in...,2024-10-14 12:22:40,0,Twitter,employee monitoring tool have place remote wor...,"['employee', 'monitoring', 'tool', 'have', 'pl...",209,anger
4,5,a little paint lots of productivity and a whol...,2024-10-14 12:22:03,0,Twitter,little paint lot productivity whole new vibe c...,"['little', 'paint', 'lot', 'productivity', 'wh...",229,joy
...,...,...,...,...,...,...,...,...,...
177810,177811,call center jobs are mostly remote these days,2022-06-12 04:02:05,3,Reddit,call center job remote day,"['call', 'center', 'job', 'remote', 'day']",45,fear
177811,177812,i think data entry jobs also fit this criteria,2022-06-12 18:01:16,1,Reddit,think data entry job fit criterion,"['think', 'data', 'entry', 'job', 'fit', 'crit...",46,joy
177812,177813,it work not all of them require a degree i nev...,2022-06-22 18:22:45,1,Reddit,work require degree get have 20 year experience,"['work', 'require', 'degree', 'get', 'have', '...",88,joy
177813,177814,never done data entry what all does it involve?,2022-06-14 15:24:38,2,Reddit,do data entry involve question,"['do', 'data', 'entry', 'involve', 'question']",47,fear


In [None]:
csv_path = ''  # Destination of the labelled dataset; fill in before running
if not csv_path:
    # An empty path cannot be opened for writing, so fail with an actionable message.
    raise ValueError("csv_path is empty: set it to the destination CSV file before exporting.")
df_filtered.to_csv(csv_path, index=False)