In [128]:
import numpy as np
import math
import json
import pandas as pd

In [129]:
# Location of the raw posts dump.
path = './data/final_posts.json'

# Parse the whole file; the context manager closes the handle afterwards.
with open(path, mode='r', encoding='utf-8') as fh:
    data = json.load(fh)

# Every record keeps its payload under root -> _source -> post. Collect those
# payloads and flatten them into one row per post.
item_to_flatten = [
    entry['root']['_source']['post']
    for entry in data
]
main_df = pd.json_normalize(item_to_flatten)
main_df

Unnamed: 0,post_id,text,hashtags
0,0x61fc95,"We got the ranch, loaded our guns and sat up t...",[]
1,0x35663e,I bet there is an army of married couples who ...,[]
2,0xc78afe,This could only end badly.,[]
3,0x90089c,My sister squeezed a lime in her milk when she...,[]
4,0xaba820,and that got my head bobbing a little bit.,[]
...,...,...,...
64166,0x4afbe1,Guilty Gear actually did that before with Guil...,[]
64167,0xf5ba78,One of my favorite episodes.,[]
64168,0x8f758e,I got my first raspberry from a crowd surfer f...,[]
64169,0xb5a35a,Texans and Astros both shut out tonight. Houst...,"[texans, astros, sadness, losers]"


In [130]:
# Per-post emotion labels (columns shown in the preview: post_id, emotion).
df_emo = pd.read_csv('./data/emotion.csv')
# Per-post split assignment; joined onto the posts via post_id in a later cell.
df_split = pd.read_csv('./data/data_identification.csv')

# Preview the first rows of the emotion labels.
df_emo.head()

Unnamed: 0,post_id,emotion
0,0x35663e,joy
1,0xc78afe,fear
2,0x90089c,joy
3,0x2ffb63,joy
4,0x989146,joy


In [131]:
# Attach the emotion label and the train/test split marker to every post.
# Left joins keep all rows of main_df; posts without an emotion row get NaN.
df_tmp = main_df.merge(df_emo, on='post_id', how='left')
df = df_tmp.merge(df_split, on='post_id', how='left')
df.head()

Unnamed: 0,post_id,text,hashtags,emotion,split
0,0x61fc95,"We got the ranch, loaded our guns and sat up t...",[],,test
1,0x35663e,I bet there is an army of married couples who ...,[],joy,train
2,0xc78afe,This could only end badly.,[],fear,train
3,0x90089c,My sister squeezed a lime in her milk when she...,[],joy,train
4,0xaba820,and that got my head bobbing a little bit.,[],,test


In [132]:
# Partition the merged frame by its split marker. .copy() gives each part its
# own data, so later column assignments do not touch df.
is_train = df['split'].eq('train')
is_test = df['split'].eq('test')
model_df = df.loc[is_train].copy()
test_df = df.loc[is_test].copy()

# Sanity check: number of test rows that carry an emotion label.
print(test_df['emotion'].count())
model_df

0


Unnamed: 0,post_id,text,hashtags,emotion,split
1,0x35663e,I bet there is an army of married couples who ...,[],joy,train
2,0xc78afe,This could only end badly.,[],fear,train
3,0x90089c,My sister squeezed a lime in her milk when she...,[],joy,train
7,0x2ffb63,Thank you so much❤️,[],joy,train
9,0x989146,Stinks because ive been in this program for a ...,[],joy,train
...,...,...,...,...,...
64164,0xd740f2,why is everybody seem sp serious?,[],joy,train
64165,0x99267e,"You can cross fuck off, its 10f all winter in ...",[],anger,train
64166,0x4afbe1,Guilty Gear actually did that before with Guil...,[],anger,train
64167,0xf5ba78,One of my favorite episodes.,[],joy,train


In [133]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import emoji

In [134]:
# Bidirectional mapping between emotion names and integer class ids.
emo_to_int = {'joy': 0, 'surprise': 1, 'anger': 2, 'sadness': 3, 'fear': 4, 'disgust': 5}
# The inverse is derived from emo_to_int so the two directions cannot drift
# apart (a hand-maintained copy is easy to misspell).
int_to_emo = {class_id: emotion for emotion, class_id in emo_to_int.items()}

In [135]:
# Define necessary functions
class Dataset(Dataset):
    """Map-style dataset pairing a feature row with its integer class label.

    Note: the class reuses (and therefore shadows) the name of the base class
    it subclasses.

    Args:
        features: indexable collection of numeric feature rows.
        labels: indexable collection of integer labels; its length defines
            the dataset size.
    """

    def __init__(self, features, labels):
        self.features, self.labels = features, labels

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return ``(features, label)`` at ``index`` as float32 / int64 tensors."""
        feature_tensor = torch.tensor(self.features[index], dtype=torch.float32)
        label_tensor = torch.tensor(self.labels[index], dtype=torch.long)
        return feature_tensor, label_tensor

def collate_batch(batch):
    """Combine a list of ``(features, label)`` samples into two batch tensors.

    Args:
        batch: sequence of ``(feature_tensor, label_tensor)`` pairs.

    Returns:
        Tuple ``(features, labels)`` where each element is the samples'
        tensors stacked along a new leading dimension.
    """
    # zip(*batch) regroups the pairs into (all features, all labels); each
    # group is then stacked into a single tensor.
    stacked_features, stacked_labels = map(torch.stack, zip(*batch))
    return stacked_features, stacked_labels

In [144]:
# Replace emoji with their text aliases (emoji.demojize) and encode the
# emotion names as integer class ids.
model_df['simple_text'] = model_df['text'].map(emoji.demojize)
model_df['label'] = model_df['emotion'].map(lambda emotion: emo_to_int[emotion])

# Hold out 30% of the labelled posts for validation (fixed seed).
x_train, x_val, y_train, y_val = train_test_split(
    model_df['simple_text'], model_df['label'], test_size=0.3, random_state=42,
)

# TF-IDF over unigrams and bigrams, capped at 3000 features. The vocabulary is
# fitted on the training texts only and reused for the validation texts.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=3000,
)
train_features = vectorizer.fit_transform(x_train).toarray()
val_features = vectorizer.transform(x_val).toarray()

In [170]:
# Hyperparameters
batch_size = 256  # samples per mini-batch for the train and validation loaders
epochs = 10  # full passes over the training loader
learning_rate = 0.002  # step size handed to the Adam optimizer

In [145]:
# Training set: reshuffled at the start of every epoch so mini-batches are not
# served in the same fixed order each time. The seeded generator keeps the
# shuffle order reproducible across runs.
ds_train = Dataset(train_features, y_train.values)
dl_train = DataLoader(
    dataset=ds_train,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
    collate_fn=collate_batch,
)

# Held-out validation set (named *_test because the training loop refers to
# it as dl_test). Order does not matter for evaluation, so it is not shuffled.
ds_test = Dataset(val_features, y_val.values)
dl_test = DataLoader(dataset=ds_test, batch_size=batch_size, collate_fn=collate_batch)

In [157]:
class FeedForward(nn.Module):
    """Fully connected classifier with three hidden layers.

    Layout: Linear -> ReLU -> Dropout(0.5) -> Linear -> ReLU -> Dropout(0.3)
    -> Linear -> ReLU -> Linear. The final layer emits raw, unnormalised
    scores of size ``output_dim``.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim1: width of the first hidden layer.
        hidden_dim2: width of the second hidden layer.
        hidden_dim3: width of the third hidden layer.
        output_dim: number of output scores per sample.
    """

    def __init__(self, input_dim, hidden_dim1, hidden_dim2, hidden_dim3, output_dim):
        super().__init__()

        # Block 1: input -> hidden_dim1, with the heavier dropout
        self.ff1 = nn.Linear(input_dim, hidden_dim1)
        self.re1 = nn.ReLU()
        self.drop1 = nn.Dropout(p=0.5)

        # Block 2: hidden_dim1 -> hidden_dim2
        self.ff2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.re2 = nn.ReLU()
        self.drop2 = nn.Dropout(p=0.3)

        # Block 3: hidden_dim2 -> hidden_dim3 (no dropout)
        self.ff3 = nn.Linear(hidden_dim2, hidden_dim3)
        self.re3 = nn.ReLU()

        # Output projection: hidden_dim3 -> output_dim
        self.ff4 = nn.Linear(hidden_dim3, output_dim)

    def forward(self, x):
        """Return the raw class scores for a batch of feature vectors."""
        hidden = self.drop1(self.re1(self.ff1(x)))
        hidden = self.drop2(self.re2(self.ff2(hidden)))
        hidden = self.re3(self.ff3(hidden))
        return self.ff4(hidden)

In [171]:
from sklearn.utils.class_weight import compute_class_weight

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Layer sizes: the input width follows the TF-IDF feature matrix, and there
# is one output per emotion class.
input_dim = train_features.shape[1]
hidden_dim1, hidden_dim2, hidden_dim3 = 1000, 128, 32
output_dim = 6

# Optional weight penalty: 'balanced' per-class weights from the training
# labels. NOTE: weight_tensors is computed here but is not handed to the
# criterion below, so the loss is currently unweighted.
train_labels = y_train.values
label_weights = compute_class_weight(
    class_weight='balanced', classes=np.unique(train_labels), y=train_labels,
)
weight_tensors = torch.tensor(label_weights, dtype=torch.float32).to(device)

model = FeedForward(input_dim, hidden_dim1, hidden_dim2, hidden_dim3, output_dim)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [172]:
from sklearn.metrics import f1_score

# Train for `epochs` passes; after each pass, score the held-out split.
global_step = 0  # optimizer steps taken so far, counted across epochs
for epoch in range(epochs):
    # --- training pass ---
    model.train()
    progress = tqdm(dl_train, desc=f"Training epoch {epoch+1}")

    for batch_features, batch_labels in progress:
        inputs = batch_features.to(device)
        targets = batch_labels.to(device)

        # Forward pass and loss
        loss = criterion(model(inputs), targets)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Refresh the displayed loss on every 20th step only
        global_step += 1
        if global_step % 20 == 0:
            progress.set_postfix({'loss': f"{loss.item(): .3f}"})

    # --- validation pass ---
    model.eval()
    PRED_LABEL, TRUE_LABEL = [], []
    evaluation = tqdm(dl_test, desc="Validation phase")

    with torch.no_grad():
        for batch_x, batch_y in evaluation:
            scores = model(batch_x.to(device))
            TRUE_LABEL.extend(batch_y.cpu().numpy())
            PRED_LABEL.extend(scores.argmax(dim=1).cpu().numpy())

    # Support-weighted F1 over the whole validation split for this epoch
    f1 = f1_score(TRUE_LABEL, PRED_LABEL, average='weighted')
    print(f"F1-Score (Epoch {epoch+1}):\n{f1}\n")

Training epoch 1: 100%|██████████| 131/131 [00:00<00:00, 173.28it/s, loss=1.235]
Validation phase: 100%|██████████| 57/57 [00:00<00:00, 374.48it/s]


F1-Score (Epoch 1):
0.47355311497730246



Training epoch 2: 100%|██████████| 131/131 [00:00<00:00, 259.86it/s, loss=1.194]
Validation phase: 100%|██████████| 57/57 [00:00<00:00, 380.94it/s]


F1-Score (Epoch 2):
0.5347840397476398



Training epoch 3: 100%|██████████| 131/131 [00:00<00:00, 263.09it/s, loss=0.844]
Validation phase: 100%|██████████| 57/57 [00:00<00:00, 370.63it/s]


F1-Score (Epoch 3):
0.5640640721231066



Training epoch 4: 100%|██████████| 131/131 [00:00<00:00, 258.72it/s, loss=0.835]
Validation phase: 100%|██████████| 57/57 [00:00<00:00, 361.99it/s]


F1-Score (Epoch 4):
0.564654544490492



Training epoch 5: 100%|██████████| 131/131 [00:00<00:00, 256.28it/s, loss=0.626]
Validation phase: 100%|██████████| 57/57 [00:00<00:00, 362.54it/s]


F1-Score (Epoch 5):
0.5565093330099304



Training epoch 6: 100%|██████████| 131/131 [00:00<00:00, 248.33it/s, loss=0.499]
Validation phase: 100%|██████████| 57/57 [00:00<00:00, 343.70it/s]


F1-Score (Epoch 6):
0.5554179927583391



Training epoch 7: 100%|██████████| 131/131 [00:00<00:00, 244.41it/s, loss=0.420]
Validation phase: 100%|██████████| 57/57 [00:00<00:00, 363.28it/s]


F1-Score (Epoch 7):
0.5485397405451209



Training epoch 8: 100%|██████████| 131/131 [00:00<00:00, 245.88it/s, loss=0.390]
Validation phase: 100%|██████████| 57/57 [00:00<00:00, 362.96it/s]


F1-Score (Epoch 8):
0.5493713891034754



Training epoch 9: 100%|██████████| 131/131 [00:00<00:00, 242.17it/s, loss=0.332]
Validation phase: 100%|██████████| 57/57 [00:00<00:00, 361.21it/s]


F1-Score (Epoch 9):
0.5509086746354449



Training epoch 10: 100%|██████████| 131/131 [00:00<00:00, 240.66it/s, loss=0.137]
Validation phase: 100%|██████████| 57/57 [00:00<00:00, 361.14it/s]

F1-Score (Epoch 10):
0.5487619314547532






In [173]:
# Prediction for test data
test_df['simple_text'] = test_df['text'].map(emoji.demojize)

# Vectorize with the already-fitted TF-IDF vocabulary and convert to a tensor.
test_features = torch.tensor(
    vectorizer.transform(test_df['simple_text']).toarray(),
    dtype=torch.float32,
)

final_preds = []
test_batch = 32

# Run inference in fixed-size chunks; the last chunk may be shorter.
model.eval()
with torch.no_grad():
    for start in range(0, len(test_df), test_batch):
        chunk = test_features[start:start + test_batch]
        logits = model(chunk.to(device))
        final_preds.extend(logits.argmax(dim=1).cpu().numpy())

In [174]:
# Attach the predicted class ids and translate them back to emotion names.
test_df['label'] = np.array(final_preds)
test_df['emotion'] = test_df['label'].map(lambda label: int_to_emo[label])
test_df = test_df.rename(columns={'post_id': 'id'})

# Keep only the id and emotion columns and write them without the index.
final_df = test_df[['id', 'emotion']]
final_df.to_csv('./data/result.csv', index=False)
final_df

Unnamed: 0,id,emotion
0,0x61fc95,anger
4,0xaba820,joy
5,0x66e44d,joy
6,0xc03cf5,joy
8,0x02f65a,joy
...,...,...
64146,0x0f273c,joy
64150,0xfc4c5d,anger
64157,0xb318a3,anger
64168,0x8f758e,anger
