In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import TensorDataset
from transformers import BertTokenizer
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import Trainer, TrainingArguments


In [2]:
# Restore the `data` and `df` variables from IPython's %store persistence.
# NOTE(review): nothing in this notebook creates or stores them — presumably a
# separate preprocessing notebook does; confirm and reference it here.
%store -r data
%store -r df

In [3]:
# Preview the restored text frame (columns: Argument_ID, processed_text).
data

Unnamed: 0,Argument_ID,processed_text
0,A01002,we should ban human cloning as it will only ca...
1,A01005,fast food should be banned because it is reall...
2,A01006,sometimes economic sanctions are the only thin...
3,A01007,capital punishment is sometimes the only optio...
4,A01008,factory farming allows for the production of c...
...,...,...
5387,E08016,on the one hand we have russia killing countle...
5388,E08017,the subsidies were originally intended to ensu...
5389,E08018,these products come mainly from large enterpri...
5390,E08019,subsidies often make farmers in recipient coun...


In [4]:
# Preview the restored label frame: one integer column per value category plus
# a `labels` column (which looks like those integers normalised to sum to 1 —
# TODO confirm against the notebook that built it).
df

Unnamed: 0,Argument ID,Openness to change,Self-Enhancement,Conservation,Self-Transcendence,labels
0,A01002,0,0,1,0,"[0.0, 0.0, 1.0, 0.0]"
1,A01005,0,0,1,0,"[0.0, 0.0, 1.0, 0.0]"
2,A01006,0,1,1,0,"[0.0, 0.5, 0.5, 0.0]"
3,A01007,0,0,2,1,"[0.0, 0.0, 0.6666666666666666, 0.3333333333333..."
4,A01008,0,0,1,2,"[0.0, 0.0, 0.3333333333333333, 0.6666666666666..."
...,...,...,...,...,...,...
5383,E08005,0,1,2,1,"[0.0, 0.25, 0.5, 0.25]"
5384,E08008,0,1,1,1,"[0.0, 0.3333333333333333, 0.3333333333333333, ..."
5385,E08009,0,1,1,0,"[0.0, 0.5, 0.5, 0.0]"
5387,E08016,0,2,1,1,"[0.0, 0.5, 0.25, 0.25]"


In [5]:
# Give the key column the same name it has in `data` so both frames can be
# merged on it.
df = df.rename({'Argument ID': 'Argument_ID'}, axis='columns')


In [6]:
# Inner-join the texts with their labels on 'Argument_ID'; arguments that
# appear in only one of the two frames are dropped.
df_combined = data.merge(df, how='inner', on='Argument_ID')

print(df_combined)

     Argument_ID                                     processed_text  \
0         A01002  we should ban human cloning as it will only ca...   
1         A01005  fast food should be banned because it is reall...   
2         A01006  sometimes economic sanctions are the only thin...   
3         A01007  capital punishment is sometimes the only optio...   
4         A01008  factory farming allows for the production of c...   
...          ...                                                ...   
2765      E08005  absolutely right europe needs an army for cont...   
2766      E08008  in principle i would be in favor of a european...   
2767      E08009  i agree with your idea but it is a long and ar...   
2768      E08016  on the one hand we have russia killing countle...   
2769      E08020  the eu can not endlessly lean on america or na...   

      Openness to change  Self-Enhancement  Conservation  Self-Transcendence  \
0                      0                 0             1           

In [7]:
# Spot-check the first rows of the merged frame (text plus label columns).
df_combined.head()

Unnamed: 0,Argument_ID,processed_text,Openness to change,Self-Enhancement,Conservation,Self-Transcendence,labels
0,A01002,we should ban human cloning as it will only ca...,0,0,1,0,"[0.0, 0.0, 1.0, 0.0]"
1,A01005,fast food should be banned because it is reall...,0,0,1,0,"[0.0, 0.0, 1.0, 0.0]"
2,A01006,sometimes economic sanctions are the only thin...,0,1,1,0,"[0.0, 0.5, 0.5, 0.0]"
3,A01007,capital punishment is sometimes the only optio...,0,0,2,1,"[0.0, 0.0, 0.6666666666666666, 0.3333333333333..."
4,A01008,factory farming allows for the production of c...,0,0,1,2,"[0.0, 0.0, 0.3333333333333333, 0.6666666666666..."


In [8]:
# Inspect the `labels` column of the merged frame (stored as Python objects,
# one list-like vector per argument).
df_combined["labels"]

0                                    [0.0, 0.0, 1.0, 0.0]
1                                    [0.0, 0.0, 1.0, 0.0]
2                                    [0.0, 0.5, 0.5, 0.0]
3       [0.0, 0.0, 0.6666666666666666, 0.3333333333333...
4       [0.0, 0.0, 0.3333333333333333, 0.6666666666666...
                              ...                        
2765                               [0.0, 0.25, 0.5, 0.25]
2766    [0.0, 0.3333333333333333, 0.3333333333333333, ...
2767                                 [0.0, 0.5, 0.5, 0.0]
2768                               [0.0, 0.5, 0.25, 0.25]
2769                             [0.25, 0.25, 0.25, 0.25]
Name: labels, Length: 2770, dtype: object

In [11]:
import torch

In [17]:
# Spot-check the frame that the training cell consumes.
# NOTE(review): the columns shown here (input_ids, attention_mask, 0-3) differ
# from the merged frame displayed earlier, and the execution counts jump from 8
# to 17 — the cells that tokenised the text and binarised the labels are not in
# this notebook. Restore them so the notebook survives Restart & Run All.
df_combined.head()

Unnamed: 0,Argument_ID,processed_text,input_ids,attention_mask,0,1,2,3
0,A01002,we should ban human cloning as it will only ca...,"[tensor(101), tensor(2057), tensor(2323), tens...","[tensor(1), tensor(1), tensor(1), tensor(1), t...",0,0,1,0
1,A01005,fast food should be banned because it is reall...,"[tensor(101), tensor(3435), tensor(2833), tens...","[tensor(1), tensor(1), tensor(1), tensor(1), t...",0,0,1,0
2,A01006,sometimes economic sanctions are the only thin...,"[tensor(101), tensor(2823), tensor(3171), tens...","[tensor(1), tensor(1), tensor(1), tensor(1), t...",0,1,1,0
3,A01007,capital punishment is sometimes the only optio...,"[tensor(101), tensor(3007), tensor(7750), tens...","[tensor(1), tensor(1), tensor(1), tensor(1), t...",0,0,1,1
4,A01008,factory farming allows for the production of c...,"[tensor(101), tensor(4713), tensor(7876), tens...","[tensor(1), tensor(1), tensor(1), tensor(1), t...",0,0,1,1


In [22]:
# List the columns of the frame used for training; the label columns are named
# with the integers 0-3 rather than strings.
df_combined.columns

Index(['Argument_ID', 'processed_text', 'input_ids', 'attention_mask', 0, 1, 2,
       3],
      dtype='object')

BERT MODEL

In [24]:


# Fine-tune `bert-base-uncased` as a multi-label classifier: one output logit
# per label column of `df_combined`, trained with binary cross-entropy.
#
# NOTE(review): this cell expects `df_combined` to contain the columns named in
# NON_LABEL_COLUMNS plus one 0/1 column per label. The cells that build those
# columns are not visible in this notebook — confirm they exist before relying
# on Restart & Run All.

# --- Configuration -----------------------------------------------------------
SEED = 42
MAX_LENGTH = 64        # token budget per argument; longer texts are truncated
BATCH_SIZE = 8
LEARNING_RATE = 1e-5   # here we start from a small learning rate
PRETRAINED_NAME = 'bert-base-uncased'
NON_LABEL_COLUMNS = ['Argument_ID', 'processed_text', 'input_ids', 'attention_mask']
num_epochs = 5 # probably should be decreased to 3 because it takes too long

# Seed torch as well as the split: the classifier head is freshly initialised
# and the training DataLoader shuffles, so unseeded runs are not comparable.
torch.manual_seed(SEED)

# Every column that is not an id / text / tokenizer column is treated as a
# label. Selecting by name (instead of the positional `iloc[:, 4:]`) keeps the
# label matrix and `num_labels` in sync even if columns are reordered.
label_columns = [column for column in df_combined.columns if column not in NON_LABEL_COLUMNS]


def _encode_texts(texts):
    """Tokenize `texts` into fixed-length `input_ids` / `attention_mask` tensors."""
    return tokenizer(
        list(texts),
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
        return_tensors='pt',
    )


def _binary_label_tensor(frame):
    """Return the label columns of `frame` as a float32 tensor.

    Raises:
        ValueError: if any label is not exactly 0 or 1. The thresholded
            accuracy computed during validation is only meaningful for
            binary targets.
    """
    labels = torch.tensor(frame[label_columns].to_numpy(dtype='float32'))
    if not bool(((labels == 0) | (labels == 1)).all()):
        raise ValueError('label columns must contain only 0/1 values')
    return labels


# Split the data into training and validation sets
train_df, val_df = train_test_split(df_combined, test_size=0.2, random_state=SEED)

# BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)

# Tokenize text for training and validation sets. The text is re-tokenized
# here; the `input_ids` / `attention_mask` columns already present in
# `df_combined` are not used by this cell.
train_encodings = _encode_texts(train_df['processed_text'])
val_encodings = _encode_texts(val_df['processed_text'])

# Convert labels to tensors with binary values (0 or 1) instead of probability distributions
train_labels = _binary_label_tensor(train_df)
val_labels = _binary_label_tensor(val_df)

# Create datasets
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_labels)

# Model: one output logit per label column
model = BertForSequenceClassification.from_pretrained(PRETRAINED_NAME, num_labels=len(label_columns))

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimizer and loss function. torch.optim.AdamW is used in place of the
# AdamW class exported by transformers, which that library has deprecated.
# eps and weight_decay are pinned to what are believed to be the transformers
# class's defaults (1e-6 and 0.0) so the update rule stays the same —
# TODO confirm against the installed transformers version.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)
criterion = torch.nn.BCEWithLogitsLoss()

# DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for input_ids, attention_mask, labels in train_dataloader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{num_epochs} - Training Loss: {average_loss:.4f}')

    # Validation loop with accuracy calculation.
    # The accuracy is element-wise: each (argument, label) decision counts
    # once, so it is not the share of arguments whose label set is fully right.
    model.eval()
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for input_ids, attention_mask, labels in val_dataloader:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Threshold the sigmoid probabilities to get binary predictions,
            # cast to the labels' dtype so the comparison is like-for-like.
            predictions = (torch.sigmoid(logits) > 0.5).to(labels.dtype)

            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.numel()

        accuracy = correct_predictions / total_predictions
        print(f'Epoch {epoch + 1}/{num_epochs} - Validation Accuracy: {accuracy:.4f}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5 - Training Loss: 0.6137
Epoch 1/5 - Validation Accuracy: 0.7247
Epoch 2/5 - Training Loss: 0.5334
Epoch 2/5 - Validation Accuracy: 0.7509
Epoch 3/5 - Training Loss: 0.4651
Epoch 3/5 - Validation Accuracy: 0.7662
Epoch 4/5 - Training Loss: 0.3941
Epoch 4/5 - Validation Accuracy: 0.7532
Epoch 5/5 - Training Loss: 0.3205
Epoch 5/5 - Validation Accuracy: 0.7631


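The BERT classifier performed better than the CNN classifier. One difference in the setup that may contribute to this: the BERT model here is trained with the Binary Cross-Entropy with Logits loss (`BCEWithLogitsLoss`), used as a multi-label objective with one binary target per label, so the previous probability-distribution labels had to be transformed into binary vectors first. Note that the validation accuracy reported above is computed over individual label decisions, not over whole arguments.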

to-do:
- confusion matrix
- classification report
- plots