In [1]:
import os
from huggingface_hub import hf_hub_download
import torch
import pandas as pd
from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import time
from tqdm import tqdm

pd.set_option("display.max_columns", 400)
pd.set_option("display.max_rows", 400)
pd.set_option("display.max_colwidth", None)

In [2]:
with open('HF_token.txt') as f:
    contents = f.readline()
    HUGGING_FACE_API_KEY = contents.rstrip()

In [3]:
import json
with open("speechIds.txt") as f:
    contents = f.readline()
    selected_speeches = json.loads(contents.rstrip())

In [4]:
df = pd.read_feather("data/sentences_climate.feather")
df = df[df["electoral_term"] == 19]
df.head(1)

Unnamed: 0,level_0,sent_id,speech_id,name,electoral_term,party,role,date,session,sentence_no,sentence_length,sentence
0,1048590,1539199,63255,Sebastian Brehm,19,CSU,mp,2021-03-04,215,24,13,"Für Sie gibt es keine Klimakrise, also wollen Sie alle diese Mehrkosten abschaffen."


In [5]:
df.shape

(10989, 12)

In [6]:
# Agenda points with more than 15 or 16 items: 7296 sentences
# more than 80% relevancy, all speeches in 42357 sentences
df.shape[0]

10989

In [7]:
# load model
tokenizer = AutoTokenizer.from_pretrained("luerhard/PopBERT")
model = AutoModelForSequenceClassification.from_pretrained("luerhard/PopBERT")

In [9]:
try:
    preds = pd.read_csv("popbert_predictions.csv", header=None)
except:
    preds = [[]]
df_selected = df[~(df["sent_id"].isin(preds[0]))]
df_selected = df_selected[["sent_id","speech_id", "sentence"]]
print("already in csv file: ", df[(df["sent_id"].isin(preds[0]))].shape[0])
print("df size:", len(df_selected))
print("df.head(1):", df_selected.head(1))

def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))
    
with torch.inference_mode():    
    for batch in tqdm(chunker(df_selected,10), total=int(df_selected.shape[0]/10)):
        ids = batch["sent_id"].to_list()
        speech_ids = batch["speech_id"].to_list()
        text = tuple(batch["sentence"].to_list())
        #truncation=True, padding='max_length', max_length=512, 
        encodings = tokenizer(text, truncation=True, padding='max_length', max_length=512, return_tensors="pt")

        out = model(**encodings)
        proba_tensor = torch.nn.functional.sigmoid(out.logits)
        probas = proba_tensor.cpu().detach().numpy()

        new_preds = []
        for (
            id_,
            speech_id_,
            pred,
        ) in zip(ids, speech_ids, probas):
            row = {
                "sent_id": id_,
                "speech_id": speech_id_,
                "elite": pred[0].astype(float),
                "pplcentr": pred[1].astype(float),
                "left": pred[2].astype(float),
                "right": pred[3].astype(float),
            }
            new_preds.append(row)
        pd.DataFrame(new_preds).to_csv("popbert_predictions.csv", mode="a", index=False, header=False)
        time.sleep(4)

already in csv file:  280
df size: 10709
df.head(1):      sent_id  speech_id  \
445  1340310      54554   

                                                                                                                                                           sentence  
445  Wir müssen unsere Häfen zu Energiehäfen weiterentwickeln, vor allem, wenn wir klimafreundlichen Wasserstoff aus Nordafrika oder Australien importieren wollen.  


 99%|█████████████████████████████████████████████████████████████████████████████████████████▋ | 1055/1070 [11:50:15<10:05, 40.39s/it]


KeyboardInterrupt: 