In [110]:
import pandas as pd
import numpy as np
from torch_geometric.data import Data
import torch

In [113]:
from sentence_transformers import SentenceTransformer

# Sentence encoder that turns input text into embedding vectors
embedding_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [467]:
# Read in training data
df = pd.read_csv('../public_data/train/track_a/eng.csv')

# Encode every text in one batched call instead of one encode() call per row,
# then store one embedding vector per row in the frame.
text_embeddings = embedding_model.encode(df['text'].tolist())
df['text_embedding'] = pd.Series(list(text_embeddings), index=df.index)

In [579]:
# Create normalized co-occurrence matrix (3.1)
# Entry [i][j] is the summed label j over the samples where emotion i is
# present, divided by the total of label i. For 0/1 labels this is the
# fraction of emotion-i samples that also carry emotion j.
emotions_list = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']

label_matrix = df[emotions_list].to_numpy(dtype=np.float64)  # (n_samples, n_emotions)
is_present = (label_matrix == 1).astype(np.float64)

# Raw counts: row i accumulates the label vectors of all samples with emotion i.
# Kept separate from the normalized result so the counts stay inspectable.
cooccurance_matrix = is_present.T @ label_matrix

# Each row is divided by the column total of its own emotion.
emotion_totals = label_matrix.sum(axis=0)
if (emotion_totals == 0).any():
    empty = [name for name, total in zip(emotions_list, emotion_totals) if total == 0]
    raise ZeroDivisionError(f"no positive samples for emotions: {empty}")

normalized_cooccurance_matrix = torch.tensor(
    cooccurance_matrix / emotion_totals[:, None], dtype=torch.float32
)
normalized_cooccurance_matrix

tensor([[1.0000, 0.7177, 0.0210, 0.4625, 0.3363],
        [0.1484, 1.0000, 0.0646, 0.4233, 0.3631],
        [0.0104, 0.1543, 1.0000, 0.0697, 0.2329],
        [0.1754, 0.7768, 0.0535, 1.0000, 0.2278],
        [0.1335, 0.6973, 0.1871, 0.2384, 1.0000]])

In [593]:
# Binarize the co-occurrence matrix: entries strictly above `mu` become
# edges (1), everything else becomes 0. Works for any matrix shape and
# keeps the original floating-point dtype.
mu = 0.35
normalized_cooccurance_matrix = (normalized_cooccurance_matrix > mu).to(
    normalized_cooccurance_matrix.dtype
)
normalized_cooccurance_matrix

tensor([[1., 1., 0., 1., 0.],
        [0., 1., 0., 1., 1.],
        [0., 0., 1., 0., 0.],
        [0., 1., 0., 1., 0.],
        [0., 1., 0., 0., 1.]])

In [594]:
# Row sums of the adjacency matrix (the diagonal of the degree matrix D)
degrees = torch.sum(normalized_cooccurance_matrix, dim=1)

# D^(-1/2) as a diagonal matrix, then qq = D^(-1/2) A D^(-1/2)
d_inv_sqrt = torch.diag(degrees.pow(-0.5))
qq = torch.matmul(torch.matmul(d_inv_sqrt, normalized_cooccurance_matrix), d_inv_sqrt)

In [None]:
# Embed the label names in one batched call and wrap the result as a tensor,
# instead of building a tensor from a Python list of per-label arrays.
# Reuses `emotions_list` so the label order matches the target columns.
emotion_embeddings = torch.as_tensor(embedding_model.encode(emotions_list))
emotion_embeddings

In [563]:
# Normalize matrix with original gcn method: q = D^(-1/2) A D^(-1/2)
adj = normalized_cooccurance_matrix

# Row sums of the adjacency matrix (the diagonal of the degree matrix D)
degrees = torch.sum(adj, dim=1)

# D^(-1/2) as a diagonal matrix; the small epsilon guards against a
# division by zero when a row sums to zero
d_inv_sqrt = torch.diag((degrees + 1e-7).pow(-0.5))

# Apply the normalization on both sides of the adjacency matrix
q = torch.matmul(torch.matmul(d_inv_sqrt, adj), d_inv_sqrt)
q

tensor([[0.3941, 0.3186, 0.0109, 0.1943, 0.1406],
        [0.0659, 0.5002, 0.0377, 0.2003, 0.1710],
        [0.0054, 0.0901, 0.6815, 0.0385, 0.1280],
        [0.0737, 0.3676, 0.0296, 0.4477, 0.1015],
        [0.0558, 0.3283, 0.1028, 0.1062, 0.4432]])

In [595]:
class TweetsDataset(torch.utils.data.Dataset):
    """Dataset view over a DataFrame yielding (features, targets) tensor pairs.

    Args:
        df: source frame; a private copy is stored.
        target_columns: column names whose values form the target vector.
        feature_column: name of the column holding the per-row feature data.
    """

    def __init__(self, df, target_columns, feature_column):
        # Copy so later edits to the caller's frame do not leak in.
        self.df = df.copy()
        self.target_columns = target_columns
        self.feature_column = feature_column

    def __len__(self) -> int:
        """Return the number of rows in the stored frame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the feature tensor and float32 target tensor for row `idx`."""
        record = self.df.iloc[idx]
        features = torch.tensor(record[self.feature_column])
        target_values = list(record[self.target_columns])
        targets = torch.tensor(target_values, dtype=torch.float32)
        return features, targets

In [596]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np

In [597]:
# Wrap the training frame: sentence embeddings as features, emotion flags as targets
dataset = TweetsDataset(df, emotions_list, 'text_embedding')

# 70/30 split; the second part takes whatever the first leaves over
train_size = int(.7 * len(dataset))
test_size = len(dataset) - train_size

# Seeded generator so the split is reproducible
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

# Batched loaders for the two splits (test order is left unshuffled)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Loader over the complete, unsplit dataset.
# NOTE(review): the training loop iterates `dataloader`, not `train_loader`,
# so the held-out split is also trained on - confirm this is intended.
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [None]:
import math
class EmoGraph(torch.nn.Module):
    """Two graph-convolution layers over label embeddings, used as a classifier.

    The label embeddings `ee` are propagated twice through the normalized
    adjacency `m` (each step followed by a learned linear map and ReLU). The
    resulting per-label vectors are dotted with the input features and passed
    through a sigmoid, giving one probability per label.

    Args:
        input_dim: size of the input feature vectors passed to `forward`;
            also the width of both graph layers.
        m: (n_labels, n_labels) normalized adjacency matrix.
        ee: (n_labels, embed_dim) label embedding matrix.
    """

    def __init__(self, input_dim, m, ee):
        super().__init__()
        m = torch.as_tensor(m, dtype=torch.float32)
        ee = torch.as_tensor(ee, dtype=torch.float32)

        # Weight shapes follow the arguments rather than a hard-coded 384.
        self.w1 = torch.nn.Parameter(torch.empty(ee.size(1), input_dim))
        self.w2 = torch.nn.Parameter(torch.empty(input_dim, input_dim))

        # Non-persistent buffers: they follow .to()/.cuda() with the module but
        # stay out of state_dict, which still contains only w1 and w2.
        self.register_buffer("ee", ee, persistent=False)
        self.register_buffer("m", m, persistent=False)

        self.reset_parameters()

    def reset_parameters(self):
        """Fill both weight matrices uniformly in [-1/sqrt(fan), 1/sqrt(fan)]."""
        with torch.no_grad():
            for weight in (self.w1, self.w2):
                stdv = 1. / math.sqrt(weight.size(1))
                weight.uniform_(-stdv, stdv)

    def forward(self, x):
        """Return per-label probabilities for `x` of shape (..., input_dim)."""
        h = torch.relu(self.m @ self.ee @ self.w1)
        h = torch.relu(self.m @ h @ self.w2)

        # Score each input against every label vector.
        logits = torch.matmul(x, h.t())
        return torch.sigmoid(logits)

In [None]:
# Scratch cell: displays a 5x5 identity matrix; the result is not assigned.
torch.eye(5)

In [None]:
# Build the classifier from the normalized adjacency `qq` and the label embeddings
model = EmoGraph(input_dim=384, m=qq, ee=emotion_embeddings)

In [None]:
# Binary cross-entropy on the model's sigmoid outputs, optimized with AdamW
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-3, weight_decay=1e-2)

In [None]:
# Train for a fixed number of passes over `dataloader`. The mean batch loss is
# reported once per epoch rather than printing every batch's loss tensor.
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    num_batches = 0

    for inputs, labels in dataloader:
        optimizer.zero_grad()

        output = model(inputs)
        loss = loss_fn(output, labels)
        loss.backward()

        optimizer.step()

        # .item() gives a plain Python float, detached from the autograd graph
        running_loss += loss.item()
        num_batches += 1

    # Skip the report if the loader produced no batches (avoids dividing by zero)
    if num_batches:
        print(f"epoch {epoch + 1}/{num_epochs} - mean loss: {running_loss / num_batches:.4f}")

In [559]:
# Predict emotion flags for the dev set and collect them as one row per id.
thresh = .4
test_df = pd.read_csv('../public_data/dev/track_a/eng_a.csv')

# Encode every text in one batched call rather than one call per row
dev_embeddings = torch.as_tensor(embedding_model.encode(test_df['text'].tolist()))

# Inference only: switch to eval mode and skip autograd bookkeeping
model.eval()
with torch.no_grad():
    probabilities = model(dev_embeddings)

# A label is predicted when its probability is strictly above the threshold
predictions = (probabilities > thresh).long().tolist()

# Same layout as before: [id, flag_1, ..., flag_n] per sample
all_results = [[row_id, *flags] for row_id, flags in zip(test_df['id'], predictions)]
output_df = pd.DataFrame(all_results, columns=['id', *emotions_list])

In [361]:
# Persist the predictions as CSV, leaving out the DataFrame index
output_df.to_csv(path_or_buf='../outputs/1.csv', index=False)

In [362]:
# Display the prediction table: an id column followed by one 0/1 column per emotion
output_df

Unnamed: 0,id,Anger,Fear,Joy,Sadness,Surprise
0,eng_dev_track_a_00001,0,1,0,0,0
1,eng_dev_track_a_00002,0,1,0,0,0
2,eng_dev_track_a_00003,0,1,0,0,1
3,eng_dev_track_a_00004,0,1,0,0,0
4,eng_dev_track_a_00005,0,1,0,0,0
...,...,...,...,...,...,...
111,eng_dev_track_a_00112,0,0,0,0,0
112,eng_dev_track_a_00113,0,1,1,0,1
113,eng_dev_track_a_00114,0,1,0,0,0
114,eng_dev_track_a_00115,0,1,0,0,0
