In [None]:
import re
from enum import Enum

import torch
from transformers import BertModel, BertTokenizer, TFBertModel

In [None]:
class Party(Enum):
    """Parties handled by this notebook; each member's value is a 0-based integer id."""

    # Unpacking assigns consecutive ids 0..5 in declaration order.
    AFD, CDU, FDP, GRUENE, LINKE, SPD = range(6)

In [None]:
# Load one text file per party from resources/<PARTY_NAME>.txt.
# Undecodable bytes are dropped (errors='ignore') rather than raising.
party_text = {}
for party in Party:
    source_path = 'resources/' + party.name + '.txt'
    with open(source_path, encoding='utf-8', errors='ignore') as handle:
        # Iterating the handle yields the file's lines (newline kept);
        # they are concatenated with a single space between them.
        party_text[party] = " ".join(handle)

In [None]:
# Split each party's text into sentence fragments on '.'.
# Form feeds (\x0c) and newlines are removed first. Note that inside a regex
# character class '|' is a literal pipe, not alternation, so it is left out
# of the class to avoid deleting pipe characters from the text.
# Fragments that are empty or whitespace-only (e.g. the one after the final
# '.') are dropped so they are not tokenized as pure-padding "sentences".
party_sentences = {
    party: [
        fragment.strip()
        for fragment in re.sub('[\x0c\n]', '', text).split('.')
        if fragment.strip()
    ]
    for party, text in party_text.items()
}

In [None]:
# Pretrained checkpoint shared by the tokenizer and the model.
MODEL_NAME = "dbmdz/bert-base-german-uncased"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# Load the PyTorch model class: the inputs in this notebook are built as
# torch tensors (return_tensors='pt', torch.stack), which the TensorFlow
# TFBertModel cannot consume.
model = BertModel.from_pretrained(MODEL_NAME)

In [None]:
# Tokenize every sentence of every party and stack the per-sentence tensors
# into one 'input_ids' and one 'attention_mask' tensor per party.
tokens = {}
for party in Party:
    # Each sentence is truncated/padded to the tokenizer's max length, so all
    # per-sentence tensors have the same length and can be stacked.
    encoded = [
        tokenizer.encode_plus(sentence, truncation=True, padding='max_length', return_tensors='pt')
        for sentence in party_sentences[party]
    ]
    tokens[party] = {
        # encode_plus returns a batch of one; [0] takes the single row.
        'input_ids': torch.stack([enc['input_ids'][0] for enc in encoded]),
        'attention_mask': torch.stack([enc['attention_mask'][0] for enc in encoded]),
    }

In [None]:
# Run the model once per party. `tokens` is keyed by Party members, so it
# cannot be unpacked directly with ** (keyword names must be strings); the
# model's keyword arguments are the inner 'input_ids' / 'attention_mask'.
# NOTE(review): assumes `model` is a PyTorch module, since the inputs are
# torch tensors — confirm. Each party's sentences go through in one batch;
# split into smaller batches if this runs out of memory.
with torch.no_grad():  # inference only, no gradients needed
    outputs = {party: model(**tokens[party]) for party in Party}

# Per-party token-level embeddings taken from the model's last hidden state.
embeddings = {party: output.last_hidden_state for party, output in outputs.items()}