In [943]:
from transformers import DistilBertTokenizer, DistilBertModel
import pandas as pd
import torch
import numpy as np
import shutil

# https://huggingface.co/docs/transformers/v4.17.0/en/tasks/sequence_classification
# https://huggingface.co/docs/transformers/en/training
# https://www.youtube.com/watch?v=TmT-sKxovb0
# https://www.youtube.com/watch?v=f-86-HcYYi8
# https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb#scrollTo=zHxRRzqpBf76
# https://huggingface.co/docs/transformers/model_doc/distilbert#usage-tips

# Read the dataset

In [944]:
file_path = '04_comments_annotated-values.csv'
data = pd.read_csv(file_path, delimiter=';')
print(data.head())

                  c_id                                             c_text  \
0  1391717608802631681  Wer so ein Profilbild wie sie hochlädt kann nu...   
1  1389188826799673345  Willst du jetzt etwa behaupten das Querdenker ...   
2  1385241285645291521  Aber Bild hat doch gerade deswegen und diesbez...   
3  1385240437988986887  Das sehe ich auch genau so. Dieser Brinkhaus i...   
4  1389640445790199809  Mit den Milliarden Unterstützungsgeldern die s...   

               date  conv_id reaction1 reaction2  Generalisation  Ambiguous  \
0  10.05.2021 11:31      NaN       NaN       NaN             3.0        0.0   
1  03.05.2021 12:03      NaN       NaN       NaN             3.0        1.0   
2  22.04.2021 14:37      NaN       NaN       NaN             1.0        0.0   
3  22.04.2021 14:33      NaN       NaN       NaN             0.0        1.0   
4  04.05.2021 17:57      NaN       NaN       NaN             2.0        0.0   

   Objective  Subjective  Disputed  
0        0.0         1.0 

In [945]:
# TODO: Correctly handle reactions
data.drop(labels=['c_id', 'date', 'conv_id', 'reaction1', 'reaction2'], axis=1, inplace=True)

In [946]:
data.head()

Unnamed: 0,c_text,Generalisation,Ambiguous,Objective,Subjective,Disputed
0,Wer so ein Profilbild wie sie hochlädt kann nu...,3.0,0.0,0.0,1.0,0.0
1,Willst du jetzt etwa behaupten das Querdenker ...,3.0,1.0,0.0,1.0,0.0
2,Aber Bild hat doch gerade deswegen und diesbez...,1.0,0.0,1.0,1.0,0.0
3,Das sehe ich auch genau so. Dieser Brinkhaus i...,0.0,1.0,0.0,1.0,0.0
4,Mit den Milliarden Unterstützungsgeldern die s...,2.0,0.0,0.0,1.0,0.0


# Process the data

In [947]:
target_list = ['Generalisation', 'Ambiguous', 'Objective', 'Subjective', 'Disputed']

In [948]:
MAX_LEN = 300
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 3
LEARNING_RATE = 1e-05

In [949]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")

def get_encodings(text):
    text = str(text)
    text = " ".join(text.split())
    
    encodings = tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )
    
    return encodings['input_ids'], encodings['attention_mask'], encodings['token_type_ids']

In [950]:
class CustomDataset (torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.title = self.df['c_text']
        self.targets = self.df[target_list].values
        
    def __len__(self):
        return len(self.title)
    
    def __getitem__(self, index):        
        input_ids, attention_mask, token_type_ids = get_encodings(self.title[index])
        
        return {
            'input_ids': input_ids.flatten(),
            'attention_mask': attention_mask.flatten(),
            'token_type_ids': token_type_ids.flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

# Generate the dataset

In [951]:
train_size = 0.8
data = data[0:200]
print(data.shape)
train_data = data.sample(frac=train_size, random_state=200).reset_index(drop=True)
validation_data = data.drop(train_data.index).reset_index(drop=True)

(200, 6)


In [952]:
train_data = CustomDataset(train_data, tokenizer, MAX_LEN)
validation_data = CustomDataset(validation_data, tokenizer, MAX_LEN)

In [953]:
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
validation_data_loader = torch.utils.data.DataLoader(validation_data, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

# Create model

In [954]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [955]:
def load_checkpoint(filepath, model, optimizer):
    checkpoint = torch.load(filepath)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, checkpoint['epoch'], valid_loss_min

In [956]:
def save_checkpoint(state, is_best, file_path, best_model_path):
    torch.save(state, file_path)
    if is_best:
        shutil.copy(file_path, best_model_path)

In [957]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-multilingual-cased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 5)
        
    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        
        return output

In [958]:
model = BERTClass()
model.to(device)

BERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [959]:
def loss_function(output, target):
    return torch.nn.BCEWithLogitsLoss()(output, target)

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [960]:
def train_model(n_epochs, training_loader, validation_loader, model, optimizer, checkpoint_path, best_model_path):
    valid_loss_min = np.Inf
    for epoch in range(1, n_epochs+1):
        train_loss = 0.0
        valid_loss = 0.0
        
        # Training Loop
        model.train()
        for index, batch in enumerate(training_loader, 0):
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)
            
            outputs = model(input_ids, attention_mask, token_type_ids)
            
            optimizer.zero_grad()
            loss = loss_function(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += (1/(index+1))*(loss.item()-train_loss)
            
        # Validation Loop
        model.eval()
        with torch.no_grad():
            for index, batch in enumerate(validation_loader, 0):
                input_ids = batch['input_ids'].to(device, dtype=torch.long)
                attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
                targets = batch['targets'].to(device, dtype=torch.float)
                
                outputs = model(input_ids, attention_mask, token_type_ids)
                
                loss = loss_function(outputs, targets)
                valid_loss += (1/(index+1))*(loss.item()-valid_loss)
                
        checkpoint = {
            'epoch': epoch,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        
        if valid_loss < valid_loss_min:
            valid_loss_min = valid_loss
            save_checkpoint(checkpoint, True, checkpoint_path, best_model_path)
        else:
            save_checkpoint(checkpoint, False, checkpoint_path, best_model_path)
        
        print(f'Epoch: {epoch} \tTraining Loss: {train_loss} \tValidation Loss: {valid_loss}')
    return model

# Train and evaluate

In [961]:
CHECKPOINT_PATH = './model/checkpoint.pth'
BEST_MODEL_PATH = './model/best_model.pth'
trained_model = train_model(EPOCHS, train_data_loader, validation_data_loader, model, optimizer, CHECKPOINT_PATH, BEST_MODEL_PATH)

Epoch: 1 	Training Loss: 0.6599182605743408 	Validation Loss: 0.6414114236831665
Epoch: 2 	Training Loss: 0.6209587097167969 	Validation Loss: 0.6079378128051758
Epoch: 3 	Training Loss: 0.574800455570221 	Validation Loss: 0.569420576095581


# Read the model

In [962]:
pred_model = BERTClass()
pred_model.to(device)
model = load_checkpoint(BEST_MODEL_PATH, pred_model, optimizer)

In [963]:
pred_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")

# Get Predictions

In [964]:
def get_prediction(text): 
    input_ids, attention_mask, token_type_ids = get_encodings(text)
    
    pred_model.eval()
    with torch.no_grad():
        input_ids = input_ids.to(device, dtype=torch.long)
        attention_mask = attention_mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        
        outputs = pred_model(input_ids, attention_mask, token_type_ids)
        
        print(outputs)

In [965]:
get_prediction("Seit 1960 steigt jedes Jahr die Zahl der Eisbären, Ozonloch über Arktis weg. Tote von Naturkatastrophen so wenig wie nie. Nie soviel Wald wie jetzt. Der Welt geht's saugut. Nur einzelne schieben Panik und die Medien steigen ein, verkauft sich eben gut.")
get_prediction("Der Impf-Apartheider? Hattest du Whisky zum Frühstück?")

tensor([[ 0.5186,  0.0028, -0.2233,  0.3565, -0.5389]])
tensor([[ 0.4010,  0.0471, -0.2812,  0.3705, -0.4758]])
