In [183]:
import pandas as pd

df1 = pd.read_csv('../data/raw/goemotions_1.csv')
df2 = pd.read_csv('../data/raw/goemotions_2.csv')
df3 = pd.read_csv('../data/raw/goemotions_3.csv')

df = pd.concat([df1, df2, df3])

## Labels mapping

In [184]:
import json

with open('../data/mapping/sentiment_dict.json', mode='r', encoding='utf-8') as f:
    map_data = json.load(f)

In [185]:
map_data

{'positive': ['amusement',
  'excitement',
  'joy',
  'love',
  'desire',
  'optimism',
  'caring',
  'pride',
  'admiration',
  'gratitude',
  'relief',
  'approval'],
 'negative': ['fear',
  'nervousness',
  'remorse',
  'embarrassment',
  'disappointment',
  'sadness',
  'grief',
  'disgust',
  'anger',
  'annoyance',
  'disapproval'],
 'ambiguous': ['realization', 'surprise', 'curiosity', 'confusion']}

In [186]:
label_cols = df.drop(columns=['rater_id']).select_dtypes(int).columns

In [187]:
def map_to_3classes(row):
    labels_on = set(label for label in label_cols if row[label] == 1)

    has_pos = len(labels_on & set(map_data['positive'])) > 0
    has_neg = len(labels_on & set(map_data['negative'])) > 0

    if has_neg:
        return 2
    elif has_pos:
        return 0
    else:
        return 1

In [None]:
df['sentiment'] = df.apply(map_to_3classes, axis=1)

In [None]:
df.sentiment.value_counts()

sentiment
1    79349
0    77078
2    54798
Name: count, dtype: int64

In [None]:
group_label_count = (
    df.groupby('text')['sentiment']
      .nunique()
      .reset_index(name='n_unique_sentiments')
)

group_label_count

Unnamed: 0,text,n_unique_sentiments
0,"""If you don't wear BROWN AND ORANGE...YOU DON...",3
1,"""What do Scottish people look like?"" How I wo...",2
2,"### A surprise, to be sure, but a welcome one",1
3,"'*Pray*, v. To ask that the laws of the unive...",1
4,">it'll get invaded by tankie, unfortunately. ...",1
...,...,...
57727,ü§∑üèª‚Äç‚ôÄÔ∏è As a wise man once said: he was a bastar...,2
57728,ü§∑üèº‚Äç‚ôÄÔ∏è I was wondering the same thing. It looke...,2
57729,ü¶ÄMY BABYS DEADü¶Ä,2
57730,ü¶Äü¶Äü¶Äü¶Äü¶Äü¶Äü¶Äü¶Äü¶ÄI‚Äôm bad at this gameü¶Äü¶Äü¶Äü¶Äü¶Äü¶Äü¶Äü¶Äü¶Äü¶Äü¶Äü¶Äü¶Äü¶Äü¶Äü¶Äü¶Äü¶Äü¶Äü¶Ä,1


In [None]:
conflict_texts = group_label_count[group_label_count['n_unique_sentiments'] > 1]['text']
conflict_df = df[df['text'].isin(conflict_texts)]

In [None]:
len(conflict_texts) / df['text'].nunique()

0.5758504815353703

In [None]:
conflict_df.groupby('text')['sentiment'].apply(lambda x: sorted(x.unique())).value_counts()

sentiment
[0, 1]       12481
[1, 2]       10484
[0, 1, 2]     5933
[0, 2]        4347
Name: count, dtype: int64

## Deduplicate

In [None]:
from collections import Counter

def resolve_group(sentiments):
    c = Counter(sentiments)
    total = sum(c.values())

    labels = set(c.keys())

    ratios = {k: v / total for k, v in c.items()}
    max_label, max_ratio = max(ratios.items(), key=lambda x: x[1])

    # Case A: POS/NEU or NEU/NEG
    if labels in ({0, 1}, {1, 2}):
        if max_ratio > 0.5:
            return max_label, False, max_ratio
        else:
            return None, True, max_ratio

    # Case B: POS vs NEG
    if labels == {0, 2}:
        if max_ratio >= 0.7:
            return max_label, False, max_ratio
        else:
            return None, True, max_ratio

    # Case C: POS vs NEU vs NEG
    if labels == {0, 1, 2}:
        if max_ratio >= 0.5:
            return max_label, False, max_ratio
        else:
            return None, True, max_ratio

    # No conflict (single label)
    return max_label, False, 1.0

In [None]:
resolved = df.groupby('text')['sentiment'].apply(resolve_group).reset_index()
resolved[['sentiment', 'is_uncertain', 'confidence']] = pd.DataFrame(resolved['sentiment'].tolist(), index=resolved.index)

In [None]:
uncertain_df = resolved[resolved['is_uncertain']]

In [None]:
uncertain_df

Unnamed: 0,text,sentiment,is_uncertain,confidence
0,"""If you don't wear BROWN AND ORANGE...YOU DON...",,True,0.400000
8,Calm down and relax are the worst things to s...,,True,0.600000
19,"Luckily from him, there is no death penalty i...",,True,0.400000
22,"No way, man. We're gonna keep on rockin' fore...",,True,0.666667
37,"cute, new driver still cares. Come back in 4 ...",,True,0.400000
...,...,...,...,...
57704,üòÇüòÇüòÇüòÇ please [NAME] tell me that was in Titusvi...,,True,0.666667
57708,üòÖ sorry. Lol,,True,0.333333
57713,üò• I feel so sorry. He'd be proud of you. *Hugs*,,True,0.666667
57714,üò≠ I wanted her to be on the top 4; now we‚Äôve h...,,True,0.666667


In [None]:
resolved[resolved['sentiment'].notna()].sentiment.value_counts(normalize=True)

sentiment
1.0    0.407618
0.0    0.362323
2.0    0.230059
Name: proportion, dtype: float64

In [None]:
resolved.to_csv('../data/log/rule_labeled.csv', encoding='utf-8', index=False)

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from scipy.special import softmax
import urllib.request
import csv
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

labels = []
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.to(device)         
model.eval()

def predict(text):
  encoded_input = tokenizer(text, return_tensors='pt')
  encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

  with torch.no_grad():
      output = model(**encoded_input)

  scores = output.logits[0].cpu().numpy()
  scores = softmax(scores)

  return scores


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [None]:
MODEL_LABEL_TO_DATASET_LABEL = np.array([2, 1, 0])
MARGIN = 0.3

def auto_label(text: str) -> pd.Series:
    scores = np.asarray(predict(text), dtype=float)

    top1 = scores.argmax()
    top2 = np.partition(scores, -2)[-2:]
    gap = top2.max() - top2.min()

    if gap > MARGIN:
        return pd.Series(
            {
                "sentiment": int(MODEL_LABEL_TO_DATASET_LABEL[top1]),
                "is_uncertain": False,
            }
        )

    return pd.Series(
        {
            "sentiment": pd.NA,
            "is_uncertain": True,
        }
    )

In [None]:
from tqdm import tqdm
tqdm.pandas()

mask = resolved['is_uncertain']

resolved.loc[mask, ['sentiment', 'is_uncertain']] = (
    resolved.loc[mask, 'text']
    .progress_apply(auto_label)
)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6645/6645 [01:43<00:00, 64.21it/s]
  resolved.loc[mask, ['sentiment', 'is_uncertain']] = (


In [None]:
resolved.to_csv('../data/log/soft_labeled.csv', index=False, encoding='utf-8')

## Split

In [98]:
import pandas as pd

df = pd.read_csv('../data/labeled/total.csv')

In [100]:
from sklearn.model_selection import train_test_split

df_train, df_val_test, _, _ = train_test_split(df, df.sentiment, test_size=0.3, random_state=42)
df_val, df_test, _, _ = train_test_split(df_val_test, df_val_test.sentiment, test_size=0.5, random_state=42)

In [101]:
df_train.to_csv('../data/labeled/train.csv', encoding='utf-8', index=False)
df_val.to_csv('../data/labeled/val.csv', encoding='utf-8', index=False)
df_test.to_csv('../data/labeled/test.csv', encoding='utf-8', index=False)

## Pipeline

In [102]:
import pandas as pd

train_df = pd.read_csv('../data/labeled/train.csv', encoding='utf-8')
val_df = pd.read_csv('../data/labeled/val.csv', encoding='utf-8')

In [103]:
has_mention = train_df["text"].str.contains(r"(?<!\w)@[A-Za-z_][A-Za-z0-9_]{1,30}", regex=True, na=False)

train_df[has_mention].text.to_list()

['>@teamYouTube ok and thanks',
 'Replace that friend with a whole pile of desillusioned friends and you might as well @me',
 '@lies_about_flossing, sorry if I used the wrong pronoun.',
 'Reach out to @teamyoutube on twitter and explain what is going on.']

In [104]:
has_hashtag = train_df["text"].str.contains(r"#\w[\w-]*", regex=True, na=False)

train_df[has_hashtag].text.to_list()[:5]

["> Update they would've never been arrested or charged if they weren't black. Epitome of the disgusting systemic racism in this country #UBERPOOLtoPrisonPipeline",
 '#I BELIEVE IN [NAME]',
 '#YES',
 'no, mostly just spurred from mild boredom and thinking of the [NAME] future. Getting a gauge at the caliber of players chosen at #12.',
 'No thank you, we need to tank, and this moves us 1 gb from the #6 pick :)']

### Contraction mapping

In [105]:
import re

contraction_df = pd.read_csv("../data/mapping/contraction.csv", encoding='utf-8')
contraction_map = {key: value for key, value in zip(contraction_df.contraction, contraction_df.extension)}

contraction_pattern = re.compile(
    r"\b(" + "|".join(map(re.escape, contraction_map.keys())) + r")\b",
    flags=re.IGNORECASE
)

def replace_contraction(match):
    w = match.group(0).lower()
    if w in contraction_map:
        return contraction_map[match.group(0).lower()]
    else:
        return w

def extend(text):
    return re.sub(contraction_pattern, replace_contraction, text)


### Emoji mapping

In [106]:
import numpy as np
import emoji

emoji_df = pd.read_csv('../data/mapping/emoji.csv')
emoji_df['Score'] = np.tanh(
    np.log((emoji_df.Positive + 1) / (emoji_df.Negative + 1))
)

GENDER_EMOJI_MAP = {
    "\u2640": "[EMO_FEMALE]",  # ‚ôÄ
    "\u2642": "[EMO_MALE]",    # ‚ôÇ
    "\u26A7": "[EMO_TRANS]"    # ‚öß
}

def normalize_emoji(e):
    e = re.sub(r"\uFE0F", "", e)
    if e in GENDER_EMOJI_MAP:
        return GENDER_EMOJI_MAP[e]
    e = re.sub(r"\u200d", "", e)
    return e

emoji_df['Emoji_norm'] = emoji_df['Emoji'].apply(normalize_emoji)
emoji_df = emoji_df[emoji_df['Emoji_norm'].str.len() > 0]

emoji_list = sorted(
    emoji_df['Emoji_norm'].unique(),
    key=len,
    reverse=True
)

emoji_map = dict(
    zip(
        emoji_df['Emoji_norm'],
        zip(emoji_df['Unicode name'], emoji_df.Score)
    )
)

In [107]:
def extract_emoji(sentence, beta=1.0):
    emoji_scores = []
    new_text = sentence
    strongest = 0.0

    for e in emoji.emoji_list(sentence):
        em = e['emoji']
        norm_em = normalize_emoji(em)

        if not norm_em.startswith("[EMO_"):
            name, score = emoji_map.get(norm_em, ('[EMO]', 0.0))
            if score != 0.0:
                emoji_scores.append(score)
        else:
            name = norm_em

        new_text = new_text.replace(em, name)

    if emoji_scores:
        strongest = max(emoji_scores, key=lambda s: abs(s))

    return new_text, strongest * beta


### Markdown patterns

In [108]:
patterns = {
    "bold": r"\*\*.+?\*\*",
    "italic_star": r"\*(?!\*)(.+?)(?<!\*)\*",
    "italic_underscore": r"_(.+?)_",
    "bold_italic": r"\*\*\*.+?\*\*\*",
    "strikethrough": r"~~.+?~~",
    "inline_code": r"`.+?`",
    "code_block": r"```[\s\S]+?```",
    "quote": r"^>.+",
    "spoiler": r">!.+!<",
    "link": r"\[.+?\]\(.+?\)"
}

results = {}

for name, pattern in patterns.items():
    mask = train_df["text"].str.contains(pattern, regex=True, na=False)
    results[name] = {
        "exists": mask.any(),
        "count": mask.sum()
    }

results

  mask = train_df["text"].str.contains(pattern, regex=True, na=False)


{'bold': {'exists': np.True_, 'count': np.int64(103)},
 'italic_star': {'exists': np.True_, 'count': np.int64(566)},
 'italic_underscore': {'exists': np.True_, 'count': np.int64(27)},
 'bold_italic': {'exists': np.True_, 'count': np.int64(14)},
 'strikethrough': {'exists': np.True_, 'count': np.int64(42)},
 'inline_code': {'exists': np.True_, 'count': np.int64(1)},
 'code_block': {'exists': np.False_, 'count': np.int64(0)},
 'quote': {'exists': np.True_, 'count': np.int64(508)},
 'spoiler': {'exists': np.True_, 'count': np.int64(18)},
 'link': {'exists': np.False_, 'count': np.int64(0)}}

In [109]:
def extract_markdown(text):
    # spoiler
    text = re.sub(r">!(.+?)!<", r" <spoiler> \1 </spoiler> ", text)

    # bold + italic
    text = re.sub(r"\*\*\*(.+?)\*\*\*", r" <bi> \1 </bi> ", text)

    # bold
    text = re.sub(r"\*\*(.+?)\*\*", r" <b> \1 </b> ", text)

    # italic *
    text = re.sub(r"\*(?!\*)(.+?)(?<!\*)\*", r" <i> \1 </i> ", text)

    # strike
    text = re.sub(r"~~(.+?)~~", r" <s> \1 </s> ", text)

    # quote (line-based)
    text = re.sub(r"^>(.+)", r" <q> \1 </q> ", text, flags=re.M)

    # inline code
    text = re.sub(r"`(.+?)`", r" <code> \1 </code> ", text)

    # triple double quotes
    text = re.sub(r'"""\s*(.+?)\s*"""', r' <quote> \1 </quote> ', text)

    return text

### Mention

In [110]:
def normalize_mention(text):
    return re.sub(r"(?<!\w)@[A-Za-z_][A-Za-z0-9_]{1,30}", '[MENTION]', text)    

### URL

In [111]:
def normalize_url(text):
    return re.sub(r"https?://\S+|www\.\S+", '[URL]', text)

### Time

In [112]:
def normalize_time(text):
    return re.sub(r"\b(?:1[0-2]|0?[1-9]):[0-5][0-9]\s*(?i:am|pm)\b", '[TIME]', text)

### Date

In [113]:
def normalize_date(text):
    # ISO 8601 datetime: 2026-01-07T10:30:00
    text = re.sub(r"\b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\b", "[DATE]", text)
    # YYYY-MM-DD
    text = re.sub(r"\b\d{4}-\d{2}-\d{2}\b", "[DATE]", text)
    # MM/DD/YYYY
    text = re.sub(r"\b\d{1,2}/\d{1,2}/\d{4}\b", "[DATE]", text)
    # DD-MM-YYYY
    text = re.sub(r"\b\d{1,2}-\d{1,2}-\d{4}\b", "[DATE]", text)
    # Month Day, Year (Jan 7, 2026 or January 7, 2026)
    text = re.sub(
        r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|"
        r"January|February|March|April|May|June|July|August|September|October|November|December)"
        r"\s+\d{1,2},\s*\d{4}\b", "[DATE]", text
    )
    # Day Month Year (7 Jan 2026 or 7 January 2026)
    text = re.sub(
        r"\b\d{1,2}\s+"
        r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|"
        r"January|February|March|April|May|June|July|August|September|October|November|December)"
        r"\s+\d{4}\b", "[DATE]", text
    )
    # Compact numeric YYYYMMDD
    text = re.sub(r"\b\d{8}\b", "[DATE]", text)
    
    return text

### Hashtag

In [114]:
def normalize_hashtag(text):

    def repl(m):
        tag = m.group()[1:]
        tag = tag.lower()
        return f"[HASHTAG] {tag}"
    
    return re.sub(r'#\w+', repl, text)

### Whitespace

In [115]:
def normalize_whitespace(text):
    return re.sub(r'\s+', ' ', text).strip()

### Lowercase

In [116]:
def lowercase(text):
    token_pattern = r'(\[[A-Z_]+\])'

    parts = re.split(token_pattern, text)

    parts = [p.lower() if not re.fullmatch(token_pattern, p) else p for p in parts]

    return ''.join(parts)

### Punctuation

In [117]:
def normalize_punctuation(text: str) -> str:
    text = re.sub(r'\.{3,}', '...', text)

    text = re.sub(r'!{3,}', '!!', text)
    text = re.sub(r'\?{3,}', '??', text)

    text = re.sub(r'(!\?|\?!){2,}', '!?', text)

    text = re.sub(r'\s+([!?.,])', r'\1', text)
    text = re.sub(r'([!?.,])\s+', r'\1 ', text)

    text = re.sub(r'([,;:]){2,}', r'\1', text)

    return text

### Feature uppercase

In [118]:
def extract_is_all_uppercase(text):
    return text, int(text.isupper())

In [119]:
def extract_uppercase_ratio(text):
    clean_text = re.sub(r'\[[A-Z_]+\]', '', text)
    
    alphas = [c for c in clean_text if c.isalpha()]
    
    if not alphas:
        return text,0.0
    
    return text, sum(c.isupper() for c in alphas) / len(alphas)

### Feature exclaimination

In [120]:
def extract_exclamation_intensity(text, cap=5):
    max_run = 0
    cur = 0
    for c in text:
        if c == '!':
            cur += 1
            max_run = max(max_run, cur)
        else:
            cur = 0

    return text, min(max_run, cap) / cap

### Test

In [121]:
text = "***aAa*** HATE? @abc123 ijiqi aikks....... !!!! üòî this's http://localhost:8080/api https://google.com 2:00 AM 01/01/2026 ??? #NLP ###"

print(extract_exclamation_intensity(text, cap=5))

text = extract_markdown(text)
text = extend(text)
text, score = extract_emoji(text, beta=1.0)
text = normalize_mention(text)
text = normalize_url(text)
text = normalize_time(text)
text = normalize_date(text)
text = normalize_hashtag(text)
text = normalize_whitespace(text)

print(extract_uppercase_ratio(text))

text = lowercase(text)
text = normalize_punctuation(text)
print(text, score)

("***aAa*** HATE? @abc123 ijiqi aikks....... !!!! üòî this's http://localhost:8080/api https://google.com 2:00 AM 01/01/2026 ??? #NLP ###", 0.8)
('<bi> aAa </bi> HATE? [MENTION] ijiqi aikks....... !!!! [EMO_SAD] this is [URL] [URL] [TIME] [DATE] ??? [HASHTAG] nlp ###', 0.16666666666666666)
<bi> aaa </bi> hate? [MENTION] ijiqi aikks...!! [EMO_SAD] this is [URL] [URL] [TIME] [DATE]?? [HASHTAG] nlp ### -0.36035535813436975


### Tokenize

In [122]:
def tokenize(text):
    patterns = [
        r"\[[A-Z_]+\]",
        r"<\/?[\w_]+>",
        r"\w+",
        r"[?!]{2,}",
        r"\.{3,}",
        r"[^\w\s]"
    ]

    combined = re.compile("|".join(patterns), re.UNICODE)

    return combined.findall(text)

### Pipeline order

In [123]:
from functools import partial

preprocessing_pipeline = [
    (partial(extract_exclamation_intensity, cap=5), 'ex_intensity'),
    (extract_markdown),
    (extend),
    (partial(extract_emoji, beta=1.0), 'emoji_score'),
    (normalize_mention),
    (normalize_url),
    (normalize_time),
    (normalize_date),
    (normalize_hashtag),
    (normalize_whitespace),
    (extract_is_all_uppercase, 'all_uppercase'),
    (extract_uppercase_ratio, 'uppercase_ratio'),
    (lowercase),
    (normalize_punctuation)
]

In [124]:
def apply_preprocessing(text, pipeline=preprocessing_pipeline):
    res = {"text": text}

    for process in pipeline:

        if isinstance(process, tuple):
            func, key = process
            res['text'], res[key] = func(res['text'])
        else:
            func = process
            res['text'] = func(res['text'])

    return res

In [125]:
apply_preprocessing("***aAa*** HATE? @abc123 ijiqi aikks....... !!!! üòî this's http://localhost:8080/api https://google.com 2:00 AM 01/01/2026 ??? #NLP")

{'text': '<bi> aaa </bi> hate? [MENTION] ijiqi aikks...!! [EMO_SAD] this is [URL] [URL] [TIME] [DATE]?? [HASHTAG] nlp',
 'ex_intensity': 0.8,
 'emoji_score': -0.36035535813436975,
 'all_uppercase': 0,
 'uppercase_ratio': 0.16666666666666666}

## Apply pipeline

In [126]:
processed_train_df = pd.DataFrame(train_df.text.apply(apply_preprocessing).to_list())
processed_train_df['label'] = train_df.sentiment

In [127]:
processed_val_df = pd.DataFrame(val_df.text.apply(apply_preprocessing).to_list())
processed_val_df['label'] = val_df.sentiment

In [128]:
processed_train_df.to_csv("../data/preprocessed/train.csv", encoding='utf-8', index=False)
processed_val_df.to_csv("../data/preprocessed/val.csv", encoding='utf-8', index=False)