In [1]:
import numpy as np
import math
import json
import pandas as pd

In [2]:
# Read the raw JSON dump and flatten each record's nested post payload into a table.
path = './data/final_posts.json'

with open(path, 'r', encoding='utf-8') as file:
    data = json.loads(file.read())

# Every record keeps its post under root -> _source -> post.
item_to_flatten = list(
    map(lambda record: record['root']['_source']['post'], data)
)
main_df = pd.json_normalize(item_to_flatten)
main_df

Unnamed: 0,post_id,text,hashtags
0,0x61fc95,"We got the ranch, loaded our guns and sat up t...",[]
1,0x35663e,I bet there is an army of married couples who ...,[]
2,0xc78afe,This could only end badly.,[]
3,0x90089c,My sister squeezed a lime in her milk when she...,[]
4,0xaba820,and that got my head bobbing a little bit.,[]
...,...,...,...
64166,0x4afbe1,Guilty Gear actually did that before with Guil...,[]
64167,0xf5ba78,One of my favorite episodes.,[]
64168,0x8f758e,I got my first raspberry from a crowd surfer f...,[]
64169,0xb5a35a,Texans and Astros both shut out tonight. Houst...,"[texans, astros, sadness, losers]"


In [3]:
# Two lookup tables keyed by post_id: the emotion label and the train/test assignment.
df_emo, df_split = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/emotion.csv', './data/data_identification.csv')
)

df_emo.head()

Unnamed: 0,post_id,emotion
0,0x35663e,joy
1,0xc78afe,fear
2,0x90089c,joy
3,0x2ffb63,joy
4,0x989146,joy


In [4]:
# Attach the emotion label and the train/test split marker to every post.
# Both are left joins on post_id, so every post is kept; a post with no
# matching label row ends up with a missing emotion.
df_tmp = main_df.merge(df_emo, on='post_id', how='left')
df = df_tmp.merge(df_split, on='post_id', how='left')
df.head()

Unnamed: 0,post_id,text,hashtags,emotion,split
0,0x61fc95,"We got the ranch, loaded our guns and sat up t...",[],,test
1,0x35663e,I bet there is an army of married couples who ...,[],joy,train
2,0xc78afe,This could only end badly.,[],fear,train
3,0x90089c,My sister squeezed a lime in her milk when she...,[],joy,train
4,0xaba820,and that got my head bobbing a little bit.,[],,test


In [5]:
# Partition by the split marker. Copies are taken so that columns added later
# to either part do not write back into `df`.
model_df = df.loc[df['split'].eq('train')].copy()
test_df = df.loc[df['split'].eq('test')].copy()

# Number of test rows that carry a (non-missing) emotion label.
print(test_df['emotion'].count())
model_df

0


Unnamed: 0,post_id,text,hashtags,emotion,split
1,0x35663e,I bet there is an army of married couples who ...,[],joy,train
2,0xc78afe,This could only end badly.,[],fear,train
3,0x90089c,My sister squeezed a lime in her milk when she...,[],joy,train
7,0x2ffb63,Thank you so much❤️,[],joy,train
9,0x989146,Stinks because ive been in this program for a ...,[],joy,train
...,...,...,...,...,...
64164,0xd740f2,why is everybody seem sp serious?,[],joy,train
64165,0x99267e,"You can cross fuck off, its 10f all winter in ...",[],anger,train
64166,0x4afbe1,Guilty Gear actually did that before with Guil...,[],anger,train
64167,0xf5ba78,One of my favorite episodes.,[],joy,train


In [6]:
import os
from dotenv import load_dotenv
from google import genai
from google.genai import types

# Load environment variables from the project's dotenv file.
env_path = "./config/.env"
load_dotenv(dotenv_path=env_path)

# System instruction for the context-augmentation task. The continuation lines
# are part of the string literal, including their leading spaces.
SYSTEM_PROMPT = '''You are a helpful assistant. You give a response by strictly following the instructions, a little creativity is accepted. The task is to generate the previous and the next possible sentence for each Twitter comment in the input list
        to extend the contents of the input, combine them with the original comment, and return all of the augmented texts as a list of strings. The contents and the sentiments of both the virtual(generated) texts need to be the same as or similar to 
        that of the given input. Take special cases, e.g. analogy, metaphor, and sarcasm, into account. The augmented output should be keep as short as possible.'''

MAX_OUTPUT_TOKENS = 65535
MODEL_NAME = 'gemini-2.5-flash'

# One SafetySetting per harm category, each with its threshold set to "OFF".
SAFETY_SETTINGS = [
    types.SafetySetting(category=harm_category, threshold="OFF")
    for harm_category in (
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_HARASSMENT",
    )
]

# Fail fast with an actionable message when the key is missing: assigning None
# into os.environ would otherwise raise an opaque "str expected" TypeError.
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        f"GOOGLE_API_KEY is not set; define it in {env_path} or export it "
        "before running this cell."
    )

os.environ["GOOGLE_API_KEY"] = api_key
client = genai.Client(api_key=api_key)

# Mirror the key under GEMINI_API_KEY unless that variable is already defined.
os.environ.setdefault('GEMINI_API_KEY', api_key)

# Response schema used for structured output: an array of strings.
schema = types.Schema(
    type=types.Type.ARRAY,
    items=types.Schema(type=types.Type.STRING),
)

def gemini_api(
    input_prompt: list[str],
    schema = schema,
    temperature: float = 0.2,
    system_instruction: str = SYSTEM_PROMPT,
    max_output_tokens: int = MAX_OUTPUT_TOKENS,
    client: genai.Client = client,
    model_name: str = MODEL_NAME,
    new_config: types.GenerateContentConfig = None,
    with_tools: bool = False,
    with_parts: bool = False,
    with_tokens_info: bool = False
):
    """Issue one ``generate_content`` request and return the response text.

    Args:
        input_prompt: Prompt content forwarded to the API.
        schema: Response schema. When truthy, the request config additionally
            sets ``response_mime_type="application/json"`` and
            ``response_schema=schema``.
        temperature: Value placed in the request config.
        system_instruction: Value placed in the request config.
        max_output_tokens: Value placed in the request config.
        client: Client whose ``models.generate_content`` is called.
        model_name: Model identifier passed to the API.
        new_config: When truthy, used instead of the config built from the
            arguments above.
        with_tools: Accepted for compatibility; not read by this function.
        with_parts: When true, ``input_prompt`` is sent as
            ``types.Content(parts=input_prompt)`` instead of as-is.
        with_tokens_info: When true, a token-usage dict is returned as well.

    Returns:
        ``response.text``; or the tuple ``(response.text, log)`` when
        ``with_tokens_info`` is true, where ``log`` holds the model name and
        the prompt/candidates token counts from ``response.usage_metadata``.
        ``None`` when any exception occurs (the error is printed, not raised).
    """
    try:
        # Settings shared by both the structured and the free-text request.
        config_kwargs = dict(
            temperature=temperature,
            system_instruction=system_instruction,
            max_output_tokens=max_output_tokens,
            response_modalities=["TEXT"],
            safety_settings=SAFETY_SETTINGS,
        )
        if schema:
            config_kwargs.update(
                response_mime_type="application/json",
                response_schema=schema,
            )
        generate_content_config = types.GenerateContentConfig(**config_kwargs)

        # A caller-supplied config takes precedence over the one built above.
        if new_config:
            generate_content_config = new_config

        contents = types.Content(parts=input_prompt) if with_parts else input_prompt
        response = client.models.generate_content(
            model=model_name,
            contents=contents,
            config=generate_content_config,
        )

        completion = response.text
        if not with_tokens_info:
            return completion

        usage = response.usage_metadata
        log = {
            "model": model_name,
            "input_tokens": usage.prompt_token_count,
            "output_tokens": usage.candidates_token_count,
        }
        return completion, log

    except Exception as e:
        print(f"Error occurred when generating response, error: {e}")
        return None

In [7]:
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaTokenizer
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import emoji

In [8]:
# Bi-directional mapping between emotion names and integer class ids.
emo_to_int = {'joy': 0, 'surprise': 1, 'anger': 2, 'sadness': 3, 'fear': 4, 'disgust': 5}
# The inverse is derived from the forward map rather than typed by hand, so the
# two cannot disagree (its values are what gets written out as predictions).
int_to_emo = {class_id: emotion for emotion, class_id in emo_to_int.items()}

# Convert emoji through emoji.demojize and encode each emotion as its class id.
model_df['simple_text'] = model_df['text'].apply(emoji.demojize)
model_df['label'] = model_df['emotion'].apply(lambda emotion: emo_to_int[emotion])

# Generate virtual augmented text data (failed attempt)
# in_prompt = model_df['simple_text'].tolist()
# resp = gemini_api(input_prompt=in_prompt)
# model_df['augmented_text'] = json.loads(resp)

# Hold out 30% of the labelled posts for validation; the seed is fixed.
x_train, x_val, y_train, y_val = train_test_split(
    model_df['simple_text'], model_df['label'], test_size=0.3, random_state=42
)

# Re-pair texts with labels. reset_index() moves the original row labels into
# an 'index' column and renumbers the rows from 0.
df_train = pd.concat([x_train, y_train], axis=1).reset_index()
df_val = pd.concat([x_val, y_val], axis=1).reset_index()

In [9]:
# Tokenizer for the 'roberta-base' checkpoint, using ./cache/ as the cache directory.
tokenizer = RobertaTokenizer.from_pretrained(
    'roberta-base',
    cache_dir='./cache/',
)

# Define necessary functions
class Dataset(torch.utils.data.Dataset):
    """Dataset that tokenizes one text per item at access time.

    The base class is spelled with its fully qualified name because this class
    reuses the name ``Dataset``. Written as ``class Dataset(Dataset)``, a second
    run of the cell would subclass the previous definition of this very class
    instead of the PyTorch base class.
    """

    def __init__(self, texts, labels, tokenizer):
        """Keep references to the inputs; nothing is tokenized here.

        Args:
            texts: Indexable collection of input strings.
            labels: Indexable collection of class ids, parallel to ``texts``.
            tokenizer: Callable applied to a single text in ``__getitem__``.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of samples, taken from ``labels``."""
        return len(self.labels)

    def __getitem__(self, index):
        """Tokenize the text at ``index`` and return it with its label.

        Returns:
            Dict with flattened ``input_ids`` and ``attention_mask`` tensors
            (unpadded, truncated to at most 128 tokens) and the label as a
            ``torch.long`` tensor under ``labels``.
        """
        text = self.texts[index]
        label = self.labels[index]

        # No padding here: sequences are padded per batch by the collate function.
        encodes = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=128,
            truncation=True,
            padding=False,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # The tokenizer output is flattened to 1-D per sample.
            'input_ids': encodes['input_ids'].flatten(),
            'attention_mask': encodes['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def collate_batch(batch):
    """Combine per-sample dicts into one right-padded batch.

    Args:
        batch: List of dicts with ``input_ids``, ``attention_mask`` and
            ``labels`` tensors, one dict per sample.

    Returns:
        Dict with the same three keys. ``input_ids`` is padded to the longest
        sequence in the batch with the module-level tokenizer's pad token id,
        ``attention_mask`` is padded with 0, and ``labels`` are stacked.
    """
    pad = torch.nn.utils.rnn.pad_sequence

    return {
        'input_ids': pad(
            [sample['input_ids'] for sample in batch],
            batch_first=True,
            padding_value=tokenizer.pad_token_id,
        ),
        'attention_mask': pad(
            [sample['attention_mask'] for sample in batch],
            batch_first=True,
            padding_value=0,
        ),
        'labels': torch.stack([sample['labels'] for sample in batch]),
    }

In [17]:
# Hyperparameters
batch_size = 128
epochs = 4
learning_rate = 2e-5

# Training batches are reshuffled at every epoch so the optimizer does not see
# the same batches in the same order each time. The validation loader keeps a
# fixed order.
ds_train = Dataset(df_train['simple_text'].tolist(), df_train['label'].tolist(), tokenizer=tokenizer)
dl_train = DataLoader(
    dataset=ds_train,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=collate_batch,
)
ds_val = Dataset(df_val['simple_text'].tolist(), df_val['label'].tolist(), tokenizer=tokenizer)
dl_val = DataLoader(dataset=ds_val, batch_size=batch_size, num_workers=0, collate_fn=collate_batch)

In [12]:
class ModelRoberta(nn.Module):
    """RoBERTa encoder with a small feed-forward head producing 6 logits."""

    def __init__(self, *args, **kwargs):
        """Load the 'roberta-base' encoder and build the classification head."""
        super().__init__(*args, **kwargs)

        self.robert = RobertaModel.from_pretrained('roberta-base')

        # Head: 768 -> 128 -> 6 with ReLU and dropout in between. The layer
        # order determines the parameter names inside `dense`.
        head_layers = [
            nn.Linear(768, 128),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(128, 6),
        ]
        self.dense = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return the head's logits for the first-token hidden state.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.

        Returns:
            Output of the head applied to ``last_hidden_state[:, 0, :]``.
        """
        encoded = self.robert(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoded.last_hidden_state[:, 0, :]
        return self.dense(first_token_state)

In [13]:
# Use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = ModelRoberta().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from sklearn.metrics import f1_score

# Fine-tune for `epochs` passes over the training loader; after each pass,
# score the validation loader with weighted F1.
i = 0  # running step counter, not reset between epochs
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    progress = tqdm(dl_train, desc=f"Training epoch {epoch+1}")

    for batch in progress:
        labels = batch['labels'].to(device)
        output_y = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        loss = criterion(output_y, labels)

        # Clear old gradients, backpropagate, then apply the update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Refresh the displayed loss only on every 50th step.
        i += 1
        if i % 50 == 0:
            progress.set_postfix({'loss': f"{loss.item(): .3f}"})

    # ---- validation pass ----
    model.eval()
    PRED_LABEL, TRUE_LABEL = [], []
    evaluation = tqdm(dl_val, desc="Validation phase")

    with torch.no_grad():
        for batch in evaluation:
            labels = batch['labels'].to(device)
            pred_y = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            pred_labels = pred_y.argmax(dim=1)

            TRUE_LABEL.extend(labels.cpu().numpy())
            PRED_LABEL.extend(pred_labels.detach().cpu().numpy())

    f1 = f1_score(TRUE_LABEL, PRED_LABEL, average='weighted')
    print(f"F1-Score (Epoch {epoch+1}):\n{f1}\n")

Training epoch 1: 100%|██████████| 262/262 [05:05<00:00,  1.17s/it, loss=0.796]
Validation phase: 100%|██████████| 113/113 [00:53<00:00,  2.11it/s]


F1-Score (Epoch 1):
0.6578333269356351



Training epoch 2: 100%|██████████| 262/262 [05:20<00:00,  1.22s/it, loss=0.746]
Validation phase: 100%|██████████| 113/113 [00:58<00:00,  1.95it/s]


F1-Score (Epoch 2):
0.6768496085318025



Training epoch 3: 100%|██████████| 262/262 [03:47<00:00,  1.15it/s, loss=0.761]
Validation phase: 100%|██████████| 113/113 [01:13<00:00,  1.53it/s]

F1-Score (Epoch 3):
0.6808820402205179






In [15]:
# Prediction for test data: same emoji conversion as for training, then
# batched inference without gradient tracking.
test_df['simple_text'] = test_df['text'].apply(emoji.demojize)
test_raw = test_df['simple_text'].tolist()
final_preds = []
test_batch = 32

model.eval()
with torch.no_grad():
    # Consecutive slices of `test_batch` texts; the last slice may be shorter.
    for batch_start in range(0, len(test_raw), test_batch):
        x_test = tokenizer(
            test_raw[batch_start:batch_start + test_batch],
            truncation=True,
            padding=True,
            max_length=128,
            return_tensors='pt',
        )

        y_test = model(
            input_ids=x_test['input_ids'].to(device),
            attention_mask=x_test['attention_mask'].to(device),
        )
        pred_test = y_test.argmax(dim=1)
        final_preds.extend(pred_test.detach().cpu().numpy())

In [16]:
# Store the predicted class ids, decode them to emotion names, and write the
# id/emotion pairs to ./data/result.csv.
test_df['label'] = np.array(final_preds)
test_df['emotion'] = test_df['label'].apply(lambda class_id: int_to_emo[class_id])
test_df = test_df.rename(columns={'post_id': 'id'})

final_df = test_df.loc[:, ['id', 'emotion']]
final_df.to_csv('./data/result.csv', index=False)
final_df

Unnamed: 0,id,emotion
0,0x61fc95,fear
4,0xaba820,fear
5,0x66e44d,joy
6,0xc03cf5,joy
8,0x02f65a,anger
...,...,...
64146,0x0f273c,joy
64150,0xfc4c5d,sadness
64157,0xb318a3,sadness
64168,0x8f758e,fear
