# Political parties

In [None]:
import pandas as pd
from tqdm import tqdm
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
import torch
from cleantext import clean

In [None]:
# Load the scraped YouTube comments.
comments_df = pd.read_csv("data/youtube_comments_500.csv")
# read_csv parses empty cells as NaN, and astype(str) would turn those into the
# literal text "nan". Map missing comments to "" first so they can be
# recognised as empty instead of being treated as real comment text.
comments_df["Comments"] = comments_df["Comments"].fillna("").astype(str)

def clean_text(text: str) -> str:
    """Normalise a comment and drop mention and link tokens.

    The text is first run through ``clean`` with ``no_emoji=True`` and
    ``lang="de"``. The result is split on single spaces, every token that
    starts with ``@`` or ``http`` is discarded, and the remaining tokens are
    joined back together with single spaces.
    """
    normalised = clean(text, no_emoji=True, lang="de")
    kept_tokens = [
        word
        for word in normalised.split(" ")
        if not word.startswith(("@", "http"))
    ]
    return " ".join(kept_tokens)

# Clean every comment, then drop the ones that are empty after cleaning.
comments_df["Comments"] = comments_df["Comments"].apply(clean_text)
# .copy() makes the filtered frame independent of the unfiltered one, so
# assigning new columns to it later does not trigger pandas'
# SettingWithCopyWarning.
comments_df = comments_df[comments_df["Comments"] != ""].copy()
comments_df

In [29]:
# Load tokenizer, config and classifier for the German political model.
# from_tf=True loads the weights from the TensorFlow checkpoint.
model_name = "UHH-CI/GermanPolitical-Gelectra-base"
sentiment_tokenizer = AutoTokenizer.from_pretrained(model_name)
sentiment_model_config = AutoConfig.from_pretrained(model_name)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_name, from_tf=True)

# One entry per comment: the party with the highest consent score, and that score.
label_list = []
score_list = []

political_parties = ["cdu/csu","spd","fdp","grüne","die linke","afd"]

# Inference only: no_grad() skips building the autograd graph, which saves
# memory and time on every forward pass.
with torch.no_grad():
    for text in tqdm(comments_df.Comments.to_list()):
        # Score the comment against all parties in a single padded batch
        # instead of one tokenizer/model call per party.
        party_inputs = [party + ": " + text for party in political_parties]
        tokenized_input = sentiment_tokenizer(party_inputs, padding=True, truncation=True, return_tensors='pt', max_length=512)
        output = sentiment_model(**tokenized_input)
        prediction = torch.nn.functional.softmax(output.logits, dim=-1)
        # label[0] indicates the value for the consent; one row per party.
        consent_scores = prediction[:, 0].numpy()
        # argmax returns the first maximum, so ties resolve to the party
        # listed first in political_parties.
        best_index = int(consent_scores.argmax())

        label_list.append(political_parties[best_index])
        score_list.append(consent_scores[best_index])

comments_df["political_party"] = label_list
comments_df["political_party_score"] = score_list
comments_df.to_csv("data/political_parties.csv")

All TF 2.0 model weights were used when initializing ElectraForSequenceClassification.

All the weights of ElectraForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use ElectraForSequenceClassification for predictions without further training.
  5%|▌         | 20280/401282 [2:45:15<51:44:52,  2.05it/s] 


KeyboardInterrupt: 