In [230]:
from transformers import DistilBertTokenizer, DistilBertModel
import pandas as pd
import torch
import numpy as np
import shutil
from timeit import default_timer as timer
from torchmetrics.classification import MultilabelF1Score, MultilabelAccuracy

# https://huggingface.co/docs/transformers/v4.17.0/en/tasks/sequence_classification
# https://huggingface.co/docs/transformers/en/training
# https://www.youtube.com/watch?v=TmT-sKxovb0
# https://www.youtube.com/watch?v=f-86-HcYYi8
# https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb#scrollTo=zHxRRzqpBf76
# https://huggingface.co/docs/transformers/model_doc/distilbert#usage-tips

# Read the dataset

In [231]:
file_path = '04_comments_annotated-values.csv'
data = pd.read_csv(file_path, delimiter=';')

In [232]:
data.head()

Unnamed: 0,c_id,c_text,date,conv_id,reaction1,reaction2,Generalisation,Ambiguous,Objective,Subjective,Disputed,Generalisation0,Generalisation1,Generalisation2,Generalisation3
0,1391717608802631681,Wer so ein Profilbild wie sie hochlädt kann nu...,10.05.2021 11:31,,,,3.0,0.0,0.0,1.0,0.0,0,0,0,1
1,1389188826799673345,Willst du jetzt etwa behaupten das Querdenker ...,03.05.2021 12:03,,,,3.0,1.0,0.0,1.0,0.0,0,0,0,1
2,1385241285645291521,Aber Bild hat doch gerade deswegen und diesbez...,22.04.2021 14:37,,,,1.0,0.0,1.0,1.0,0.0,0,1,0,0
3,1385240437988986887,Das sehe ich auch genau so. Dieser Brinkhaus i...,22.04.2021 14:33,,,,0.0,1.0,0.0,1.0,0.0,1,0,0,0
4,1389640445790199809,Mit den Milliarden Unterstützungsgeldern die s...,04.05.2021 17:57,,,,2.0,0.0,0.0,1.0,0.0,0,0,1,0


In [233]:
data.drop(labels=['c_id', 'date', 'conv_id'], axis=1, inplace=True)

In [234]:
data.head()

Unnamed: 0,c_text,reaction1,reaction2,Generalisation,Ambiguous,Objective,Subjective,Disputed,Generalisation0,Generalisation1,Generalisation2,Generalisation3
0,Wer so ein Profilbild wie sie hochlädt kann nu...,,,3.0,0.0,0.0,1.0,0.0,0,0,0,1
1,Willst du jetzt etwa behaupten das Querdenker ...,,,3.0,1.0,0.0,1.0,0.0,0,0,0,1
2,Aber Bild hat doch gerade deswegen und diesbez...,,,1.0,0.0,1.0,1.0,0.0,0,1,0,0
3,Das sehe ich auch genau so. Dieser Brinkhaus i...,,,0.0,1.0,0.0,1.0,0.0,1,0,0,0
4,Mit den Milliarden Unterstützungsgeldern die s...,,,2.0,0.0,0.0,1.0,0.0,0,0,1,0


# Process the data

In [235]:
target_list = ['Generalisation0', 'Generalisation1', 'Generalisation2', 'Generalisation3', 'Ambiguous', 'Objective', 'Subjective', 'Disputed']

In [236]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 3
LEARNING_RATE = 1e-05

In [237]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")

def get_encodings(text, reaction1, reaction2):
    text = str(text)
    text = " ".join(text.split())
    
    reaction1 = str(reaction1)
    reaction1 = " ".join(reaction1.split())
    
    reaction2 = str(reaction2)
    reaction2 = " ".join(reaction2.split())
    
    encodings = tokenizer.encode_plus(
            text + ' [SEP] ' + reaction1 + ' ' + reaction2, 
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )
    
    return encodings['input_ids'], encodings['attention_mask'], encodings['token_type_ids']

In [238]:
class CustomDataset (torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.title = self.df['c_text']
        self.reaction1 = self.df['reaction1']
        self.reaction2 = self.df['reaction2']
        self.targets = self.df[target_list].values
        
    def __len__(self):
        return len(self.title)
    
    def __getitem__(self, index):        
        input_ids, attention_mask, token_type_ids = get_encodings(self.title[index], self.reaction1[index], self.reaction2[index])
        
        return {
            'input_ids': input_ids.flatten(),
            'attention_mask': attention_mask.flatten(),
            'token_type_ids': token_type_ids.flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

# Generate the dataset

In [239]:
train_size = 0.8
data = data[0:200]
print(data.shape)
train_data = data.sample(frac=train_size, random_state=200).reset_index(drop=True)
validation_data = data.drop(train_data.index).reset_index(drop=True)

(200, 12)


In [240]:
train_data = CustomDataset(train_data, tokenizer, MAX_LEN)
validation_data = CustomDataset(validation_data, tokenizer, MAX_LEN)

In [241]:
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
validation_data_loader = torch.utils.data.DataLoader(validation_data, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

# Create model

In [242]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [243]:
def load_checkpoint(filepath, model, optimizer):
    checkpoint = torch.load(filepath)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, checkpoint['epoch'], valid_loss_min

In [244]:
def save_checkpoint(state, is_best, file_path, best_model_path):
    torch.save(state, file_path)
    if is_best:
        shutil.copy(file_path, best_model_path)

In [245]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-multilingual-cased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 8)
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        pooler = self.classifier(pooler)
        output = self.sigmoid(pooler)
        
        return output

In [246]:
model = BERTClass()
model.to(device)

BERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [247]:
def loss_function(output, target):
    return torch.nn.BCEWithLogitsLoss()(output, target)

def accuracy_function(output, target):
    mlaNone = MultilabelAccuracy(num_labels=8, average='none', threshold=0.5)
    mlaMiro = MultilabelAccuracy(num_labels=8, average='micro', threshold=0.5)
    mlaMacro = MultilabelAccuracy(num_labels=8, average='macro', threshold=0.5)
    mlaWeighted = MultilabelAccuracy(num_labels=8, average='weighted', threshold=0.5)
    return f'Accuracy: \tNone: {mlaNone(output, target)} \tMicro: {mlaMiro(output, target)} \tMacro: {mlaMacro(output, target)} \tWeighted: {mlaWeighted(output, target)}'

def f1_score_function(output, target):
    mlf1None = MultilabelF1Score(num_labels=8, average='none', threshold=0.5)
    mlf1Miro = MultilabelF1Score(num_labels=8, average='micro', threshold=0.5)
    mlf1Macro = MultilabelF1Score(num_labels=8, average='macro', threshold=0.5)
    mlf1Weighted = MultilabelF1Score(num_labels=8, average='weighted', threshold=0.5)
    return f'F1 Score: \tNone: {mlf1None(output, target)} \tMicro: {mlf1Miro(output, target)} \tMacro: {mlf1Macro(output, target)} \tWeighted: {mlf1Weighted(output, target)}'

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [248]:
def train_model(n_epochs, training_loader, validation_loader, model, optimizer, checkpoint_path, best_model_path):
    valid_loss_min = np.Inf
    model_start_time = timer()
    for epoch in range(1, n_epochs+1):
        epoch_start_time = timer()
        train_loss = 0.0
        valid_loss = 0.0
        
        # Training Loop
        model.train()
        
        train_output = []
        train_target = []
        
        for index, batch in enumerate(training_loader, 0):
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)
            
            outputs = model(input_ids, attention_mask, token_type_ids)
            
            optimizer.zero_grad()
            loss = loss_function(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += (1/(index+1))*(loss.item()-train_loss)
            
            train_output.append(outputs)
            train_target.append(targets)
            
        train_accuracy = accuracy_function(torch.cat(train_output), torch.cat(train_target))
        train_f1_score = f1_score_function(torch.cat(train_output), torch.cat(train_target))
            
        # Validation Loop
        model.eval()
        
        validation_output = []
        validation_target = []
        
        with torch.no_grad():
            for index, batch in enumerate(validation_loader, 0):
                input_ids = batch['input_ids'].to(device, dtype=torch.long)
                attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
                targets = batch['targets'].to(device, dtype=torch.float)
                
                outputs = model(input_ids, attention_mask, token_type_ids)
                
                loss = loss_function(outputs, targets)
                valid_loss += (1/(index+1))*(loss.item()-valid_loss)
                
                validation_output.append(outputs)
                validation_target.append(targets)
                
        valid_accuracy = accuracy_function(torch.cat(validation_output), torch.cat(validation_target))
        valid_f1_score = f1_score_function(torch.cat(validation_output), torch.cat(validation_target))
                
        checkpoint = {
            'epoch': epoch,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        
        if valid_loss < valid_loss_min:
            valid_loss_min = valid_loss
            save_checkpoint(checkpoint, True, checkpoint_path, best_model_path)
        else:
            save_checkpoint(checkpoint, False, checkpoint_path, best_model_path)
        
        print(f'\n\nEpoch: {epoch} \tTime: {int(timer()-epoch_start_time)}s \tTraining Loss: {train_loss} \tValidation Loss: {valid_loss}')
        print(f'\tTraining: \n\t\t{train_accuracy} \n\t\t{train_f1_score}')
        print(f'\tValidation: \n\t\t{valid_accuracy} \n\t\t{valid_f1_score}')
        
    print(f'\n\nTraining time: {int(timer()-model_start_time)}s')
    return model

# Train and evaluate

In [249]:
CHECKPOINT_PATH = './model/checkpoint.pth'
BEST_MODEL_PATH = './model/best_model.pth'
trained_model = train_model(EPOCHS, train_data_loader, validation_data_loader, model, optimizer, CHECKPOINT_PATH, BEST_MODEL_PATH)



Epoch: 1 	Time: 276s 	Training Loss: 0.8081438899040222 	Validation Loss: 0.796354204416275
	Training: 
		Accuracy: 	None: tensor([0.6750, 0.4563, 0.5500, 0.8250, 0.5312, 0.3688, 0.9187, 0.8813]) 	Micro: 0.6507812738418579 	Macro: 0.6507812738418579 	Weighted: 0.6744427680969238 
		F1 Score: 	None: tensor([0.0714, 0.3256, 0.5000, 0.0667, 0.3590, 0.3567, 0.9577, 0.0000]) 	Micro: 0.5338894724845886 	Macro: 0.329624205827713 	Weighted: 0.5487957000732422
	Validation: 
		Accuracy: 	None: tensor([0.5750, 0.6500, 0.7250, 0.9750, 0.5250, 0.7250, 0.9500, 1.0000]) 	Micro: 0.765625 	Macro: 0.7656249403953552 	Weighted: 0.738084077835083 
		F1 Score: 	None: tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.9744, 0.0000]) 	Micro: 0.503311276435852 	Macro: 0.12179487198591232 	Weighted: 0.34603404998779297


Epoch: 2 	Time: 288s 	Training Loss: 0.7997676372528076 	Validation Loss: 0.7854457497596741
	Training: 
		Accuracy: 	None: tensor([0.7063, 0.6812, 0.6687, 0.9375, 0.5312, 0.7375, 0.9

# Read the model

In [250]:
pred_model = BERTClass()
pred_model.to(device)
model = load_checkpoint(BEST_MODEL_PATH, pred_model, optimizer)

In [251]:
pred_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")

# Get Predictions

In [252]:
def get_prediction(text): 
    input_ids, attention_mask, token_type_ids = get_encodings(text, '', '')
    
    pred_model.eval()
    with torch.no_grad():
        input_ids = input_ids.to(device, dtype=torch.long)
        attention_mask = attention_mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        
        outputs = pred_model(input_ids, attention_mask, token_type_ids)
        
        print(outputs)

In [253]:
get_prediction("Seit 1960 steigt jedes Jahr die Zahl der Eisbären, Ozonloch über Arktis weg. Tote von Naturkatastrophen so wenig wie nie. Nie soviel Wald wie jetzt. Der Welt geht's saugut. Nur einzelne schieben Panik und die Medien steigen ein, verkauft sich eben gut.")
get_prediction("Der Impf-Apartheider? Hattest du Whisky zum Frühstück?")

tensor([[0.4407, 0.4253, 0.4599, 0.4223, 0.4672, 0.4386, 0.5555, 0.4122]])
tensor([[0.4390, 0.4483, 0.4492, 0.4221, 0.4617, 0.4304, 0.5604, 0.4117]])
