In [134]:
from transformers import DistilBertTokenizer, DistilBertModel
import pandas as pd
import torch
import numpy as np
import shutil
from timeit import default_timer as timer
from torchmetrics.classification import MultilabelF1Score, MultilabelAccuracy

# Read the dataset

In [135]:
file_path = 'dataset/04_comments_annotated-values.csv'
data = pd.read_csv(file_path, delimiter=';')

In [136]:
data.head()

Unnamed: 0,c_id,c_text,date,conv_id,reaction1,reaction2,Ambiguous,Objective,Subjective,Disputed,Generalisation0,Generalisation1,Generalisation2,Generalisation3
0,1391717608802631681,Wer so ein Profilbild wie sie hochlädt kann nu...,10.05.2021 11:31,,,,0.0,0.0,1.0,0.0,0,0,0,1
1,1389188826799673345,Willst du jetzt etwa behaupten das Querdenker ...,03.05.2021 12:03,,,,1.0,0.0,1.0,0.0,0,0,0,1
2,1385241285645291521,Aber Bild hat doch gerade deswegen und diesbez...,22.04.2021 14:37,,,,0.0,1.0,1.0,0.0,0,1,0,0
3,1385240437988986887,Das sehe ich auch genau so. Dieser Brinkhaus i...,22.04.2021 14:33,,,,1.0,0.0,1.0,0.0,1,0,0,0
4,1389640445790199809,Mit den Milliarden Unterstützungsgeldern die s...,04.05.2021 17:57,,,,0.0,0.0,1.0,0.0,0,0,1,0


## Remove test-data

In [137]:
test_data_ids = [
    1405047474310553601,
    1388783647403220994,
    1387373196916281344,
    1405544796559601673,
    1401594561835773957,
    1405219910960156675,
    1394285068944818176,
    1405106839642062848,
    1403073727332560902,
    1406556368241369090,
    1384598185356767235,
    1398590532167864320,
    1393517517927223301,
    1404378018928267264,
    1387505263159218179
]

test_data = data[data['c_id'].isin(test_data_ids)]
data = data[~data['c_id'].isin(test_data_ids)]

In [138]:
test_data.head(15)

Unnamed: 0,c_id,c_text,date,conv_id,reaction1,reaction2,Ambiguous,Objective,Subjective,Disputed,Generalisation0,Generalisation1,Generalisation2,Generalisation3
1485,1405047474310553601,"Nein, nicht in einen Tiopf. Die Mitesser-NGO G...",16.06.2021 06:19,1.40488e+18,Ich werte das als impulsgetiebene Antwort. Bit...,Jeder Mensch mit klarem Verstand erkennt bei d...,1.0,0.0,1.0,1.0,1,0,0,0
1487,1393517517927223301,"Ääähm... da müsste man halt jetzt wissen, dass...",15.05.2021 10:44,1.393101e+18,100%. Ich hoffe er schreibt es sich hinter sei...,Es ist immer wieder schön zu sehen und zu höre...,0.0,1.0,1.0,0.0,0,1,0,0
1492,1388783647403220994,"Wer liest das? Mal ehrlich jetzt, wenn dieses ...",02.05.2021 09:13,1.38821e+18,"Sie fragen, weshalb er ein Faschist sei. Ich b...",Dann sagen Sie es mir?,1.0,0.0,1.0,1.0,0,1,0,0
1493,1387373196916281344,"Im Gegensatz etwa zum ""Flügel"" der AfD damals....",28.04.2021 11:48,1.387325e+18,Jetzt wird's aber sehr beliebig. Sind denn der...,Das negieren des staatlichen Gewaltmonopols is...,0.0,1.0,1.0,1.0,1,0,0,0
1505,1404378018928267264,"Biergarten ist gesellschaftlich notwendiger, g...",14.06.2021 09:59,1.403864e+18,Du scheinst dich sehr auf Herrn Reitschuster e...,Dafür hattet ihr ne Ausgangssperre 😉,0.0,1.0,1.0,1.0,0,0,1,0
1526,1405544796559601673,ich sage nicht dass wissenschaft gekauft ist. ...,17.06.2021 15:16,1.40488e+18,das problem ist dass die industrielle landwirt...,"Warum ist es ""das Übel""? Mit vorindustriellen ...",0.0,1.0,1.0,1.0,0,1,0,0
1529,1387505263159218179,"Corona gibt es sehr wohl, aber wo stecken sich...",28.04.2021 20:33,1.387325e+18,"Ja, aber genau das ist doch das Problem. Die L...",Ja aber warum müssen denn alle seit einem Jahr...,0.0,1.0,1.0,1.0,0,0,1,0
1545,1401594561835773957,Die brauen Rattenfänger bleiben außen vor und ...,06.06.2021 17:39,1.401583e+18,"Jo, das stimmt. Aber dass die Bearbock Crew im...",Btw..wieviel hat die Afd verloren?,1.0,0.0,1.0,1.0,1,0,0,0
1556,1405219910960156675,Dann waren es wohl die falschen Medien..,16.06.2021 17:45,1.405213e+18,"Meinen Sie verantwortungslose Eltern, die den ...",Welche sind denn die richtigen Medien?,0.0,0.0,1.0,1.0,0,1,0,0
1561,1394285068944818176,Aber das macht doch die Situation für Juden HI...,17.05.2021 13:33,1.393929e+18,"Ja, aber nur auf dem Papier!","Ja, Deutsche auf dem ""Papier""!",0.0,0.0,1.0,1.0,0,0,1,0


## Prepare data structure

In [139]:
data.drop(labels=['c_id', 'date', 'conv_id'], axis=1, inplace=True)

In [140]:
data.head()

Unnamed: 0,c_text,reaction1,reaction2,Ambiguous,Objective,Subjective,Disputed,Generalisation0,Generalisation1,Generalisation2,Generalisation3
0,Wer so ein Profilbild wie sie hochlädt kann nu...,,,0.0,0.0,1.0,0.0,0,0,0,1
1,Willst du jetzt etwa behaupten das Querdenker ...,,,1.0,0.0,1.0,0.0,0,0,0,1
2,Aber Bild hat doch gerade deswegen und diesbez...,,,0.0,1.0,1.0,0.0,0,1,0,0
3,Das sehe ich auch genau so. Dieser Brinkhaus i...,,,1.0,0.0,1.0,0.0,1,0,0,0
4,Mit den Milliarden Unterstützungsgeldern die s...,,,0.0,0.0,1.0,0.0,0,0,1,0


# Process the data

In [141]:
target_list = ['Generalisation0', 'Generalisation1', 'Generalisation2', 'Generalisation3', 'Ambiguous', 'Objective', 'Subjective', 'Disputed']

In [142]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 15
LEARNING_RATE = 1e-05

In [143]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")

def get_encodings(text, reaction1, reaction2):
    text = str(text)
    text = " ".join(text.split())
    
    reaction1 = str(reaction1)
    reaction1 = " ".join(reaction1.split())
    
    reaction2 = str(reaction2)
    reaction2 = " ".join(reaction2.split())
    
    encodings = tokenizer.__call__(
            text + ' [SEP] ' + reaction1 + ' ' + reaction2, 
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )
    
    return encodings['input_ids'], encodings['attention_mask'], encodings['token_type_ids']

In [144]:
class CustomDataset (torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.title = self.df['c_text']
        self.reaction1 = self.df['reaction1']
        self.reaction2 = self.df['reaction2']
        self.targets = self.df[target_list].values
        
    def __len__(self):
        return len(self.title)
    
    def __getitem__(self, index):        
        input_ids, attention_mask, token_type_ids = get_encodings(self.title[index], self.reaction1[index], self.reaction2[index])
        
        return {
            'input_ids': input_ids.flatten(),
            'attention_mask': attention_mask.flatten(),
            'token_type_ids': token_type_ids.flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

# Generate the dataset

In [145]:
train_size = 0.8
print(data.shape)
train_data = data.sample(frac=train_size, random_state=200)
validation_data = data.drop(train_data.index)

train_data = train_data.reset_index(drop=True)
validation_data = validation_data.reset_index(drop=True)

(200, 11)


In [146]:
train_data = CustomDataset(train_data, tokenizer, MAX_LEN)
validation_data = CustomDataset(validation_data, tokenizer, MAX_LEN)

In [147]:
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
validation_data_loader = torch.utils.data.DataLoader(validation_data, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

# Create model

In [148]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [149]:
def load_checkpoint(filepath, model, optimizer):
    checkpoint = torch.load(filepath)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, checkpoint['epoch'], valid_loss_min

In [150]:
def save_checkpoint(state, is_best, file_path, best_model_path):
    torch.save(state, file_path)
    if is_best:
        shutil.copy(file_path, best_model_path)

In [151]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-multilingual-cased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 8)
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        pooler = self.classifier(pooler)
        output = self.sigmoid(pooler)
        
        return output

In [152]:
model = BERTClass()
model.to(device)

BERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [153]:
def loss_function(output, target):
    return torch.nn.BCEWithLogitsLoss()(output, target)

def accuracy_function(output, target):
    mlaNone = MultilabelAccuracy(num_labels=8, average='none', threshold=0.5)
    mlaMiro = MultilabelAccuracy(num_labels=8, average='micro', threshold=0.5)
    mlaMacro = MultilabelAccuracy(num_labels=8, average='macro', threshold=0.5)
    mlaWeighted = MultilabelAccuracy(num_labels=8, average='weighted', threshold=0.5)
    return f'Accuracy: \tNone: {mlaNone(output, target)} \tMicro: {mlaMiro(output, target)} \tMacro: {mlaMacro(output, target)} \tWeighted: {mlaWeighted(output, target)}'

def f1_score_function(output, target):
    mlf1None = MultilabelF1Score(num_labels=8, average='none', threshold=0.5)
    mlf1Miro = MultilabelF1Score(num_labels=8, average='micro', threshold=0.5)
    mlf1Macro = MultilabelF1Score(num_labels=8, average='macro', threshold=0.5)
    mlf1Weighted = MultilabelF1Score(num_labels=8, average='weighted', threshold=0.5)
    return f'F1 Score: \tNone: {mlf1None(output, target)} \tMicro: {mlf1Miro(output, target)} \tMacro: {mlf1Macro(output, target)} \tWeighted: {mlf1Weighted(output, target)}'

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [154]:
def train_model(n_epochs, training_loader, validation_loader, model, optimizer, checkpoint_path, best_model_path):
    valid_loss_min = np.Inf
    model_start_time = timer()
    for epoch in range(1, n_epochs+1):
        epoch_start_time = timer()
        train_loss = 0.0
        valid_loss = 0.0
        
        # Training Loop
        model.train()
        
        train_output = []
        train_target = []
        
        for index, batch in enumerate(training_loader, 0):
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)
            
            outputs = model(input_ids, attention_mask, token_type_ids)
            
            optimizer.zero_grad()
            loss = loss_function(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += (1/(index+1))*(loss.item()-train_loss)
            
            train_output.append(outputs)
            train_target.append(targets)
            
        train_accuracy = accuracy_function(torch.cat(train_output), torch.cat(train_target))
        train_f1_score = f1_score_function(torch.cat(train_output), torch.cat(train_target))
            
        # Validation Loop
        model.eval()
        
        validation_output = []
        validation_target = []
        
        with torch.no_grad():
            for index, batch in enumerate(validation_loader, 0):
                input_ids = batch['input_ids'].to(device, dtype=torch.long)
                attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
                targets = batch['targets'].to(device, dtype=torch.float)
                
                outputs = model(input_ids, attention_mask, token_type_ids)
                
                loss = loss_function(outputs, targets)
                valid_loss += (1/(index+1))*(loss.item()-valid_loss)
                
                validation_output.append(outputs)
                validation_target.append(targets)
                
        valid_accuracy = accuracy_function(torch.cat(validation_output), torch.cat(validation_target))
        valid_f1_score = f1_score_function(torch.cat(validation_output), torch.cat(validation_target))
                
        checkpoint = {
            'epoch': epoch,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        
        if valid_loss < valid_loss_min:
            valid_loss_min = valid_loss
            save_checkpoint(checkpoint, True, checkpoint_path, best_model_path)
        else:
            save_checkpoint(checkpoint, False, checkpoint_path, best_model_path)
        
        print(f'\n\nEpoch: {epoch} \tTime: {int(timer()-epoch_start_time)}s \tTraining Loss: {train_loss} \tValidation Loss: {valid_loss}')
        print(f'\tTraining: \n\t\t{train_accuracy} \n\t\t{train_f1_score}')
        print(f'\tValidation: \n\t\t{valid_accuracy} \n\t\t{valid_f1_score}')
        
    print(f'\n\nTraining time: {int(timer()-model_start_time)}s')
    return model

# Train and evaluate

In [155]:
CHECKPOINT_PATH = './model/checkpoint.pth'
BEST_MODEL_PATH = './model/best_model.pth'
trained_model = train_model(EPOCHS, train_data_loader, validation_data_loader, model, optimizer, CHECKPOINT_PATH, BEST_MODEL_PATH)



Epoch: 1 	Time: 337s 	Training Loss: 0.8070988893508911 	Validation Loss: 0.7886082231998444
	Training: 
		Accuracy: 	None: tensor([0.7063, 0.4875, 0.6812, 0.3063, 0.4563, 0.6187, 0.9375, 0.7250]) 	Micro: 0.6148437261581421 	Macro: 0.6148437261581421 	Weighted: 0.699698805809021 
		F1 Score: 	None: tensor([0.0000, 0.3881, 0.0000, 0.0826, 0.6201, 0.2278, 0.9677, 0.0000]) 	Micro: 0.5142857432365417 	Macro: 0.28579771518707275 	Weighted: 0.5257154107093811
	Validation: 
		Accuracy: 	None: tensor([0.5000, 0.6500, 0.9000, 0.9500, 0.5500, 0.7000, 0.9250, 1.0000]) 	Micro: 0.7718750238418579 	Macro: 0.7718750238418579 	Weighted: 0.7146396636962891 
		F1 Score: 	None: tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.7097, 0.0000, 0.9610, 0.0000]) 	Micro: 0.6178010702133179 	Macro: 0.2088395357131958 	Weighted: 0.4610031247138977


Epoch: 2 	Time: 328s 	Training Loss: 0.7996280908584594 	Validation Loss: 0.781091719865799
	Training: 
		Accuracy: 	None: tensor([0.7063, 0.6562, 0.6812, 0.8250, 0.4688, 

# Read the model

In [156]:
pred_model = BERTClass()
pred_model.to(device)
model = load_checkpoint(BEST_MODEL_PATH, pred_model, optimizer)

In [157]:
pred_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")

# Get Predictions

In [158]:
def get_prediction(row): 
    input_ids, attention_mask, token_type_ids = get_encodings(row["c_text"], row["reaction1"], row["reaction2"])
    
    pred_model.eval()
    with torch.no_grad():
        input_ids = input_ids.to(device, dtype=torch.long)
        attention_mask = attention_mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        
        outputs = pred_model(input_ids, attention_mask, token_type_ids)
        
        generalisation_pred_value = 0
        generalisation_pred_prop = 0
        if outputs[0][0].item() > generalisation_pred_prop:
            generalisation_pred_value = 0
            generalisation_pred_prop = outputs[0][0].item()
        
        if outputs[0][1].item() > generalisation_pred_prop:
            generalisation_pred_value = 1
            generalisation_pred_prop = outputs[0][1].item()
            
        if outputs[0][2].item() > generalisation_pred_prop:
            generalisation_pred_value = 2
            generalisation_pred_prop = outputs[0][2].item()
        
        if outputs[0][3].item() > generalisation_pred_prop:
            generalisation_pred_value = 3
        
        row['Generalisation_Pred'] = generalisation_pred_value
        row['Ambiguous_Pred'] = 1 if outputs[0][4].item() > 0.5 else 0
        row['Objective_Pred'] = 1 if outputs[0][5].item() > 0.5 else 0
        row['Subjective_Pred'] = 1 if outputs[0][6].item() > 0.5 else 0
        row['Disputed_Pred'] = 1 if outputs[0][7].item() > 0.5 else 0
        
        return row

In [159]:
data_pred = []
for index, row in test_data.iterrows():
    data_pred.append(get_prediction(row))
    
data_pred = pd.DataFrame(data_pred)
data_pred.head()

Unnamed: 0,c_id,c_text,date,conv_id,reaction1,reaction2,Ambiguous,Objective,Subjective,Disputed,Generalisation0,Generalisation1,Generalisation2,Generalisation3,Generalisation_Pred,Ambiguous_Pred,Objective_Pred,Subjective_Pred,Disputed_Pred
1485,1405047474310553601,"Nein, nicht in einen Tiopf. Die Mitesser-NGO G...",16.06.2021 06:19,1.40488e+18,Ich werte das als impulsgetiebene Antwort. Bit...,Jeder Mensch mit klarem Verstand erkennt bei d...,1.0,0.0,1.0,1.0,1,0,0,0,1,1,0,1,0
1487,1393517517927223301,"Ääähm... da müsste man halt jetzt wissen, dass...",15.05.2021 10:44,1.393101e+18,100%. Ich hoffe er schreibt es sich hinter sei...,Es ist immer wieder schön zu sehen und zu höre...,0.0,1.0,1.0,0.0,0,1,0,0,1,1,0,1,0
1492,1388783647403220994,"Wer liest das? Mal ehrlich jetzt, wenn dieses ...",02.05.2021 09:13,1.38821e+18,"Sie fragen, weshalb er ein Faschist sei. Ich b...",Dann sagen Sie es mir?,1.0,0.0,1.0,1.0,0,1,0,0,3,1,0,1,0
1493,1387373196916281344,"Im Gegensatz etwa zum ""Flügel"" der AfD damals....",28.04.2021 11:48,1.387325e+18,Jetzt wird's aber sehr beliebig. Sind denn der...,Das negieren des staatlichen Gewaltmonopols is...,0.0,1.0,1.0,1.0,1,0,0,0,1,1,0,1,0
1505,1404378018928267264,"Biergarten ist gesellschaftlich notwendiger, g...",14.06.2021 09:59,1.403864e+18,Du scheinst dich sehr auf Herrn Reitschuster e...,Dafür hattet ihr ne Ausgangssperre 😉,0.0,1.0,1.0,1.0,0,0,1,0,1,1,0,1,0


In [160]:
data_pred.to_csv('dataset/05_predictions-for-test-comments.csv', sep=';', index=False, header=True, encoding='utf-8-sig') 