In [421]:
from __future__ import annotations

import json

import collections
import functools as fts
import itertools as its

import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn, optim

from sklearn.model_selection import StratifiedGroupKFold

from project import load_dataset

## Data Processing

### Stratified and grouped training and testing

In [422]:
DATASET = load_dataset("dataset.csv")
AGGREGATED = pd.read_csv("aggregated.csv", index_col=0, converters={"Document": json.loads})

In [423]:
dataset = pd.merge(DATASET, AGGREGATED[["ID", "Document"]], on = "ID", how="left")
DATASET = dataset.dropna()

In [424]:
def get_pos_list(document: dict) -> list[str]:
    return [word["upos"] for sentence in document for word in sentence]

In [425]:
DATASET.loc[:, "POS"] = DATASET["Document"].apply(get_pos_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DATASET.loc[:, "POS"] = DATASET["Document"].apply(get_pos_list)


In [426]:
DATASET

Unnamed: 0,ID,Issue,Stance,Argument,Annotator,Argumentative,CO,LA,LR,LS,...,CL,AP,AR,RE,GA,GR,GS,OV,Document,POS
0,arg219250,ban-plastic-water-bottles,no-bad-for-the-economy,"it is true that bottled water is a waste, but ...",1,y,1,1,1,1,...,2,1,1,1,1,1,1,1,"[[{'id': 1, 'text': 'it', 'lemma': 'it', 'upos...","[PRON, AUX, ADJ, SCONJ, VERB, NOUN, AUX, DET, ..."
1,arg219250,ban-plastic-water-bottles,no-bad-for-the-economy,"it is true that bottled water is a waste, but ...",2,y,1,3,2,1,...,3,2,2,2,3,1,1,1,"[[{'id': 1, 'text': 'it', 'lemma': 'it', 'upos...","[PRON, AUX, ADJ, SCONJ, VERB, NOUN, AUX, DET, ..."
2,arg219250,ban-plastic-water-bottles,no-bad-for-the-economy,"it is true that bottled water is a waste, but ...",3,y,2,2,3,2,...,2,2,2,2,2,2,2,2,"[[{'id': 1, 'text': 'it', 'lemma': 'it', 'upos...","[PRON, AUX, ADJ, SCONJ, VERB, NOUN, AUX, DET, ..."
3,arg219293,ban-plastic-water-bottles,no-bad-for-the-economy,Most Americans on average recycle 86-88% of th...,1,y,2,3,3,2,...,2,2,2,2,2,3,2,2,"[[{'id': 1, 'text': 'Most', 'lemma': 'most', '...","[ADJ, PROPN, ADP, NOUN, VERB, NUM, SYM, NUM, S..."
4,arg219293,ban-plastic-water-bottles,no-bad-for-the-economy,Most Americans on average recycle 86-88% of th...,2,y,1,2,2,1,...,2,1,2,1,2,1,1,1,"[[{'id': 1, 'text': 'Most', 'lemma': 'most', '...","[ADJ, PROPN, ADP, NOUN, VERB, NUM, SYM, NUM, S..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
929,arg168822,william-farquhar-ought-to-be-honoured-as-the-r...,yes-of-course,Raffles neglected Singapore when he went aroun...,2,y,1,2,2,1,...,2,3,2,2,3,2,1,2,"[[{'id': 1, 'text': 'Raffles', 'lemma': 'Raffl...","[PROPN, VERB, PROPN, ADV, PRON, VERB, SCONJ, V..."
930,arg168822,william-farquhar-ought-to-be-honoured-as-the-r...,yes-of-course,Raffles neglected Singapore when he went aroun...,3,y,2,2,3,2,...,2,2,2,2,2,3,2,2,"[[{'id': 1, 'text': 'Raffles', 'lemma': 'Raffl...","[PROPN, VERB, PROPN, ADV, PRON, VERB, SCONJ, V..."
931,arg168834,william-farquhar-ought-to-be-honoured-as-the-r...,yes-of-course,"Raffles doesn't care about the citizens, doesn...",1,y,2,2,3,2,...,1,2,2,2,2,3,2,2,"[[{'id': 1, 'text': 'Raffles', 'lemma': 'Raffl...","[PROPN, AUX, PART, VERB, ADP, DET, NOUN, PUNCT..."
932,arg168834,william-farquhar-ought-to-be-honoured-as-the-r...,yes-of-course,"Raffles doesn't care about the citizens, doesn...",2,y,1,2,2,1,...,3,3,2,2,2,2,1,2,"[[{'id': 1, 'text': 'Raffles', 'lemma': 'Raffl...","[PROPN, AUX, PART, VERB, ADP, DET, NOUN, PUNCT..."


In [427]:
identifier = 2**DATASET['CO'] * 3**DATASET['EF'] * 5**DATASET['RE']
train, test = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=300).split(DATASET, identifier, DATASET["ID"]).__next__()

TRAIN = DATASET.iloc[train].reset_index(drop=True)
TEST = DATASET.iloc[test].reset_index(drop=True)



### Building the Vocabulary and the Vectorizer

In [428]:
stopwords = nltk.corpus.stopwords.words("english")

In [429]:
corpus = set(DATASET["Argument"])
tokenizer = nltk.tokenize.casual.TweetTokenizer()
tokens = tokenizer.tokenize(" ".join(corpus).lower())

tokens = [token for token in tokens if token not in stopwords]

len(tokens)

11000

In [430]:
corpus_stance = set(DATASET['Stance'])
    
tokens_stance = tokenizer.tokenize(" ".join(i.replace("-", " ") for i in corpus_stance).lower())

tokens_stance = [token for token in tokens_stance if token not in stopwords]

tokens.extend(tokens_stance)


In [431]:
len(tokens)

11051

In [432]:
vocab = sorted({token for token, freq in collections.Counter(tokens).most_common(500)})
vocab = dict(enumerate(vocab, start=2))
vocab[0] = "<pad>"
vocab[1] = "<unk>"
vocab[2] = "ADJ"
vocab[3] = "ADV"
vocab[4] = "INTJ"
vocab[5] = "NOUN"
vocab[6] = "PROPN"
vocab[7] = "VERB"
vocab[8] = "ADP"
vocab[9] = "AUX"
vocab[10] = "CCONJ"
vocab[11] = "DET"
vocab[12] = "NUM"
vocab[13] = "PART"
vocab[14] = "PRON"
vocab[15] = "SCONJ"
vocab[16] = "PUNCT"
vocab[17] = "SYM"
vocab[18] = "X"

vocab = {tok: idx for idx, tok in vocab.items()}

In [433]:
def vectorize(tokens: list[str]) -> list[int]:
    return [vocab.get(token, 1) for token in tokens]

### Building the PyTorch Dataset

In [434]:
from torch.utils.data import Dataset, DataLoader

In [435]:
class QualityDataset(Dataset):
    def __init__(self, dataframe: pd.DataFrame, dimension: list) -> None:
        self.dataset = [] # (ids: list[int], label: list)
        
        for _, instance in dataframe.iterrows():
            tokens = tokenizer.tokenize(instance["Argument"])
            ids = vectorize(tokens)+[0]
            tokens_stance = tokenizer.tokenize(instance["Stance"])
            ids_stance = vectorize(tokens_stance)
            pos = vectorize((instance["POS"]))
            label = []
            for i in range(len(dimension)):
                label.append(instance[dimension[i]] - 1)
            
            self.dataset.append((torch.cat((torch.tensor(ids, dtype=torch.int, device="cuda"), torch.tensor(ids_stance, dtype=torch.int, device="cuda"), 
                                            torch.tensor(pos, dtype=torch.int, device="cuda"))), label))
            
    def __len__(self) -> int:
        return len(self.dataset)
    
    def __getitem__(self, index: int) -> tuple[torch.tensor, int]:
        return self.dataset[index]

In [436]:
TRAIN_DATASET = QualityDataset(TRAIN, ["OV", "CO", "EF", "RE"])
TEST_DATASET  = QualityDataset(TEST, ["OV", "CO", "EF", "RE"])

In [437]:
#for X, Y in TRAIN_DATASET:
    #print(X)
    #print(Y)

## Modeling

In [438]:
class BaselineGRU(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int, gru_hidden_dim: int) -> None:
        super().__init__()
        
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.gru_hidden_dim = gru_hidden_dim
        
        self.embedder = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.encoder = nn.GRU(embedding_dim, gru_hidden_dim, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(gru_hidden_dim * 2, 3)
        self.fc2 = nn.Linear(gru_hidden_dim * 2, 3)
        self.fc3 = nn.Linear(gru_hidden_dim * 2, 3)
        self.fc4 = nn.Linear(gru_hidden_dim * 2, 3)
        
    def forward(self, x: torch.tensor, softmax: bool = False) -> torch.tensor:
        embeddings = self.embedder(x)
        _, outputs = self.encoder(embeddings)
        logits1 = self.fc1(outputs.view(-1, self.gru_hidden_dim * 2))
        logits2 = self.fc2(outputs.view(-1, self.gru_hidden_dim * 2))
        logits3 = self.fc3(outputs.view(-1, self.gru_hidden_dim * 2))
        logits4 = self.fc4(outputs.view(-1, self.gru_hidden_dim * 2))
        
        if len(x.shape) == 1:
            logits1 = logits1.squeeze()
            logits2 = logits2.squeeze()
            logits3 = logits3.squeeze()
            logits4 = logits4.squeeze()
        
        
        return (logits1.softmax(axis=-1), logits2.softmax(axis=-1), 
               logits3.softmax(axis=-1), logits4.softmax(axis=-1)) if softmax else (
            logits1, logits2, logits3, logits4) 

In [443]:
model = BaselineGRU(len(vocab), 5, 4).to("cuda")
model

BaselineGRU(
  (embedder): Embedding(502, 5, padding_idx=0)
  (encoder): GRU(5, 4, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=8, out_features=3, bias=True)
  (fc2): Linear(in_features=8, out_features=3, bias=True)
  (fc3): Linear(in_features=8, out_features=3, bias=True)
  (fc4): Linear(in_features=8, out_features=3, bias=True)
)

In [444]:
lr = 1e-2
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [445]:
def train_model(model, loss, optimizer, n_epochs = 100):
    # initialize tracker for minimum test loss
    test_loss_min = np.Inf
    for epoch in range(1, n_epochs):
        train_loss = 0.0
        test_loss = 0.0
        
        # training stage
        model.train()
        for X, Y in TRAIN_DATASET:
            P = model(X, softmax=False)
            
            loss1 = loss(P[0].unsqueeze(dim=0), torch.tensor([Y[0]], dtype=torch.long, device="cuda"))
            loss2 = loss(P[1].unsqueeze(dim=0), torch.tensor([Y[1]], dtype=torch.long, device="cuda"))
            loss3 = loss(P[2].unsqueeze(dim=0), torch.tensor([Y[2]], dtype=torch.long, device="cuda"))
            loss4 = loss(P[3].unsqueeze(dim=0), torch.tensor([Y[3]], dtype=torch.long, device="cuda"))
            
            loss_total = loss1 + loss2 + loss3 + loss4
            
            train_loss += loss_total.detach().item()
            
            loss_total.backward()
        
        optimizer.step()
        optimizer.zero_grad()
        
        train_loss /= len(TRAIN_DATASET)
        
        model.eval()
        
        accuracy1 = 0
        accuracy2 = 0
        accuracy3 = 0
        accuracy4 = 0
        
        for X, Y in TEST_DATASET:
            P = model(X, softmax=False)
            
            loss1 = loss(P[0].unsqueeze(dim=0), torch.tensor([Y[0]], dtype=torch.long, device="cuda"))
            loss2 = loss(P[1].unsqueeze(dim=0), torch.tensor([Y[1]], dtype=torch.long, device="cuda"))
            loss3 = loss(P[2].unsqueeze(dim=0), torch.tensor([Y[2]], dtype=torch.long, device="cuda"))
            loss4 = loss(P[3].unsqueeze(dim=0), torch.tensor([Y[3]], dtype=torch.long, device="cuda"))
            
            test_loss += loss1.detach().item() + loss2.detach().item() + loss3.detach().item() + loss4.detach().item()
            
            accuracy1 += 1 if P[0].softmax(axis=-1).argmax().detach().item() == Y[0] else 0
            accuracy2 += 1 if P[0].softmax(axis=-1).argmax().detach().item() == Y[1] else 0
            accuracy3 += 1 if P[0].softmax(axis=-1).argmax().detach().item() == Y[2] else 0
            accuracy4 += 1 if P[0].softmax(axis=-1).argmax().detach().item() == Y[3] else 0
            
        
        test_loss /= len(TEST_DATASET)
        accuracy1 /= len(TEST_DATASET)
        accuracy2 /= len(TEST_DATASET)
        accuracy3 /= len(TEST_DATASET)
        accuracy4 /= len(TEST_DATASET)
        
        
        
        if epoch % 5 == 0:
            print(f"{epoch:5d} - {train_loss:8.6f} {test_loss:8.6f} {accuracy1:10.6%} {accuracy2:10.6%} {accuracy3:10.6%} {accuracy4:10.6%}")

        if test_loss < test_loss_min:
            torch.save(model, 'model.pt')
            print('Testing loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            test_loss_min,
            test_loss))
            test_loss_min = test_loss
    # return trained model
    return model    
                                                          

In [446]:
model_GRU = train_model(model, loss_fn, optimizer, 100)

Testing loss decreased (inf --> 3.972430).  Saving model ...
Testing loss decreased (3.972430 --> 3.906049).  Saving model ...
Testing loss decreased (3.906049 --> 3.850346).  Saving model ...
Testing loss decreased (3.850346 --> 3.803603).  Saving model ...
    5 - 3.904187 3.764029 51.366120% 52.459016% 49.180328% 47.540984%
Testing loss decreased (3.803603 --> 3.764029).  Saving model ...
Testing loss decreased (3.764029 --> 3.730601).  Saving model ...
Testing loss decreased (3.730601 --> 3.702734).  Saving model ...
Testing loss decreased (3.702734 --> 3.680078).  Saving model ...
Testing loss decreased (3.680078 --> 3.662392).  Saving model ...
   10 - 3.801156 3.649312 55.191257% 56.830601% 60.109290% 49.726776%
Testing loss decreased (3.662392 --> 3.649312).  Saving model ...
Testing loss decreased (3.649312 --> 3.640173).  Saving model ...
Testing loss decreased (3.640173 --> 3.633980).  Saving model ...
Testing loss decreased (3.633980 --> 3.629569).  Saving model ...
Testing

KeyboardInterrupt: 