In [55]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [87]:
# Path of the pickled dataset that the loading step reads.
data_file = 'data/dataset.pkl'

In [88]:
# Integer class id assigned to each party name.
party_dict = {
    'SPÖ': 0,
    'ÖVP': 1,
    'FPÖ': 2,
    'Grüne': 3,
    'LIF': 4,
    'BZÖ': 5,
    'NEOS': 6,
    'STRONACH': 7,
    'PILZ': 8,
    'independent': 9,
}

In [89]:
def load_data(data_file, label_map=None):
    """Load the pickled dataset and split it into labelled and unlabelled texts.

    Rows whose ``party`` is ``'independent'`` are held out as unlabelled test
    texts; every other row becomes a (text, integer label) training pair.

    Args:
        data_file: Path to a pickled DataFrame with ``party`` and ``text`` columns.
        label_map: Optional mapping from party name to integer class id.
            Defaults to the module-level ``party_dict``.

    Returns:
        A tuple ``(texts, labels, test_texts)`` of plain Python lists.

    Raises:
        ValueError: If a non-independent row has a party missing from the
            label map (it would otherwise silently become a NaN label).
    """
    if label_map is None:
        label_map = party_dict

    # NOTE(security): unpickling can execute arbitrary code - only load trusted files.
    raw_df = pd.read_pickle(data_file)

    is_independent = raw_df['party'] == 'independent'
    test_texts = raw_df.loc[is_independent, 'text'].tolist()

    # Work on an explicit copy so the column assignment below neither triggers
    # SettingWithCopyWarning nor touches the frame that was read from disk.
    labelled_df = raw_df.loc[~is_independent].copy()

    party_ids = labelled_df['party'].map(label_map)
    unmapped = party_ids.isna()
    if unmapped.any():
        unknown = sorted(labelled_df.loc[unmapped, 'party'].astype(str).unique())
        raise ValueError(f"Parties without a label id: {unknown}")
    labelled_df['party'] = party_ids.astype(int)

    # Show the class distribution (by integer label) as a quick sanity check.
    print(labelled_df.value_counts('party'))

    texts = labelled_df['text'].tolist()
    labels = labelled_df['party'].tolist()
    return texts, labels, test_texts

In [90]:
# Labelled texts/labels for training plus the held-out 'independent' texts.
(texts, labels, test_texts) = load_data(data_file)

party
0    24747
1    24063
2    17308
3    12455
5     4170
4     1942
6     1866
7     1322
8      346
dtype: int64


[24747/87883,24063/87883,17308/87883,12455/87883,1942/87883,4170/87883,1866/87883,1322/87883,346/87883]

In [91]:
class TextClassificationDataset(Dataset):
    """Pairs texts with integer labels and tokenizes each text on access.

    Indexing returns a dict with the flattened ``input_ids`` and
    ``attention_mask`` produced by ``tokenizer`` (padded / truncated to
    ``max_length``) and the label as a ``torch.long`` tensor under ``label``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # The dataset size is driven by the number of texts.
        return len(self.texts)

    def __getitem__(self, idx):
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            sample_text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # Flatten each encoded tensor into one 1-D tensor per sample.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(sample_label, dtype=torch.long)
        return item

In [92]:
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear output layer.

    ``forward`` returns raw, unnormalised logits with one column per class.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(p=0.1)
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Classify from the pooled output of the encoder.
        return self.fc(self.dropout(encoded.pooler_output))

In [93]:
def train(model, data_loader, optimizer, scheduler, device, weights):
    """Run one training epoch with a class-weighted cross-entropy loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning logits.
        data_loader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batches (and the loss weights) are moved to.
        weights: Per-class loss weights tensor, or ``None`` for an unweighted loss.

    Returns:
        Mean training loss over the epoch as a float (``0.0`` when the loader
        yields no batches).
    """
    model.train()

    # The weights must live on the same device as the logits, otherwise the
    # loss raises a device-mismatch error when training on CUDA.
    if weights is not None:
        weights = weights.to(device)
    # Build the criterion once instead of re-creating it for every batch.
    criterion = nn.CrossEntropyLoss(weight=weights)

    total_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches if num_batches else 0.0

In [94]:
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader`` without gradients.

    Returns a tuple ``(accuracy, report, probabilities)``: the accuracy score,
    the text classification report, and one list of softmax class
    probabilities per evaluated sample.
    """
    model.eval()
    y_true, y_pred, prob_rows = [], [], []

    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            logits = model(input_ids=ids, attention_mask=mask)
            # Predicted class = index of the largest logit in each row.
            batch_preds = torch.max(logits, dim=1).indices
            batch_probs = nn.functional.softmax(logits, dim=1)

            y_pred += batch_preds.cpu().tolist()
            y_true += targets.cpu().tolist()
            prob_rows += batch_probs.cpu().tolist()

    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    return accuracy, report, prob_rows

In [95]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Return the predicted class index for a single ``text``.

    The text is tokenized (padded / truncated to ``max_length``), run through
    ``model`` in eval mode without gradients, and the index of the largest
    logit is returned as a Python int.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    model_inputs = {key: encoded[key].to(device) for key in ('input_ids', 'attention_mask')}

    with torch.no_grad():
        logits = model(**model_inputs)
        best_class = torch.max(logits, dim=1).indices

    return best_class.item()

In [105]:
# Set up parameters
bert_model_name = 'bert-base-german-cased'
# The label map has 10 entries, but 'independent' rows are held out as
# unlabelled test data, so the classifier is trained on the other 9 parties.
num_classes = 9
max_length = 128
batch_size = 10
num_epochs = 2
learning_rate = 2e-5

# The dataset is heavily imbalanced, so the loss is weighted per class.
# Weights must be INVERSELY proportional to class frequency: weighting by the
# frequency itself (as before) would make the majority classes dominate even
# more. The counts are listed in label-id order (0..8) and mirror the
# value_counts output of the loading step.
# NOTE(review): ideally derive these counts from the training labels instead
# of hard-coding them, so they cannot drift from the data.
class_counts = torch.tensor(
    [24747, 24063, 17308, 12455, 1942, 4170, 1866, 1322, 346],
    dtype=torch.float,
)
# "Balanced" heuristic: n_samples / (n_classes * count_per_class).
class_weights = class_counts.sum() / (num_classes * class_counts)

In [100]:
# Stratify on the labels so every party, including the rare ones, keeps the
# same share in the training and validation sets (the classes are heavily
# imbalanced). Stratification needs at least two samples per class.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [101]:
# One tokenizer, loaded for the chosen checkpoint, is shared by both datasets.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

train_dataset = TextClassificationDataset(
    texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_length=max_length
)
val_dataset = TextClassificationDataset(
    texts=val_texts, labels=val_labels, tokenizer=tokenizer, max_length=max_length
)

# Only the training batches are shuffled.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size)

In [102]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = BERTClassifier(bert_model_name, num_classes)
model = model.to(device)

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [103]:
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# weight_decay=0.0 keeps the previous transformers default (torch.optim.AdamW
# would otherwise apply 0.01).
# Open question from the original author: would SGD work better here?
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.0)
# The scheduler is stepped once per batch, so it needs the total batch count.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)



In [None]:
# The loss weights must be on the same device as the model outputs,
# otherwise the weighted loss fails when training on CUDA.
loss_weights = class_weights.to(device)

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device, loss_weights)
    accuracy, report, probabilities = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)
    # `probabilities` holds one row per validation sample; show only a small
    # preview instead of flooding the cell output with the whole list.
    print(probabilities[:5])

Epoch 1/2


In [71]:
# Persist only the learned parameters (state dict), not the whole module.
torch.save(obj=model.state_dict(), f="data/bert_classifier.pth")

In [72]:
# Smoke test: predict the class id for a single short text.
test_text = "hallo."
sentiment = predict_sentiment(
    text=test_text,
    model=model,
    tokenizer=tokenizer,
    device=device,
)
print(test_text)
print(f"Predicted party: {sentiment}")

hallo.
Predicted party: 0
