In [1]:
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

  from .autonotebook import tqdm as notebook_tqdm


### Let's create 300 examples at random based on pv debates

In [4]:
# Read the sampled statements from disk and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved index being read back as data — confirm whether later cells rely on it
# before switching to `index_col=0`.
STATEMENTS_CSV = 'sampled_statements.csv'
df = pd.read_csv(STATEMENTS_CSV)

df.head()

Unnamed: 0,statement
0,"Thank you, Mr. Vice President."
1,He got paid $1.6 million to do that. He said h...
2,So this was simply a question of endorsement p...
3,"And so I did everything I could, including bri..."
4,But on Iran -- on Iran -- on Iran what she sai...


In [5]:
# Both the tokenizer and the classifier come from the same Hub checkpoint,
# so the identifier is kept in one place.
MODEL_NAME = "mlburnham/Political_DEBATE_large_v1.0"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Switch to inference mode (disables dropout); as the last expression this
# also displays the module tree.
model.eval()

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNo

In [10]:
# Catch-all bucket for statements that carry no policy topic.
OTHER_LABEL = "Other (e.g., greetings, procedural statements, moderator acknowledgments, transitions)"

# Candidate topics; each one is later turned into a hypothesis sentence and
# scored against a statement. The parenthesised keywords are part of the
# label text the model sees.
labels = [
    # --- policy topics ---
    "Foreign Policy and National Security",
    "Economy (jobs, growth, trade)",
    "Healthcare (health insurance, hospitals, medical care)",
    "Civil Rights and Social Issues (equality, voting rights, race, gender)",
    "Education (schools, universities, student loans)",
    "Environment and Energy (climate change, renewable energy, pollution)",
    "Supreme Court and Judicial Appointments",
    "Gun Control (firearm laws, background checks)",
    "National Infrastructure (roads, bridges, transportation)",
    "Social Welfare Programs (social security, unemployment benefits)",
    # --- everything else ---
    OTHER_LABEL,
]

In [11]:
def _entailment_index(model, default=0):
    """Return the logit column that the model's config maps to entailment.

    Looks through ``model.config.label2id`` for a label whose name starts with
    "entail" (case-insensitive). Falls back to ``default`` when the model has
    no config, no mapping, or no such label.
    """
    config = getattr(model, "config", None)
    label2id = getattr(config, "label2id", None) or {}
    for name, index in label2id.items():
        if str(name).lower().startswith("entail"):
            return int(index)
    return default


def classify_single_statement(statement, labels, model, tokenizer, top_k=2):
    """Rank candidate topic labels for one statement via NLI-style scoring.

    Each label is turned into the hypothesis "This statement is about
    {label}." and paired with ``statement``; the labels whose hypotheses get
    the highest entailment logit are returned.

    Args:
        statement: The text to classify.
        labels: Candidate topic labels (any iterable of strings).
        model: Sequence-classification model; called as ``model(**inputs)``
            and expected to return an object with a ``logits`` tensor of one
            row per hypothesis.
        tokenizer: Callable that encodes (premise, hypothesis) pairs into the
            keyword tensors the model accepts.
        top_k: Maximum number of labels to return. Values larger than the
            number of labels are clamped instead of raising.

    Returns:
        Up to ``top_k`` labels, best match first. Empty when ``labels`` is
        empty or ``top_k`` is 0.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    labels = list(labels)
    # topk() raises if asked for more entries than exist, so clamp first.
    k = min(top_k, len(labels))
    if k == 0:
        return []

    hypotheses = [f"This statement is about {label}." for label in labels]

    inputs = tokenizer(
        [statement] * len(hypotheses),
        hypotheses,
        return_tensors='pt',
        truncation=True,
        padding=True
    )

    # Keep inputs on the same device as the model (no-op when both are on CPU
    # or when the model does not expose a device).
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Read the entailment column from the model config rather than assuming
    # it is class 0; falls back to 0 when the config does not say.
    entailment_scores = outputs.logits[:, _entailment_index(model)]
    top_indices = entailment_scores.topk(k).indices.tolist()

    return [labels[i] for i in top_indices]

In [12]:
# Sanity check on a clearly economic statement.
example = "We need to bring back American jobs and support small businesses."
predicted_topics = classify_single_statement(
    statement=example,
    labels=labels,
    model=model,
    tokenizer=tokenizer,
)
print(predicted_topics)

['Economy (jobs, growth, trade)', 'Civil Rights and Social Issues (equality, voting rights, race, gender)']


In [13]:
# Sanity check on a purely procedural statement with no policy content.
example = "Thank you, Mr. Vice President."
predicted_topics = classify_single_statement(
    statement=example,
    labels=labels,
    model=model,
    tokenizer=tokenizer,
)
print(predicted_topics)

['Foreign Policy and National Security', 'Other (e.g., greetings, procedural statements, moderator acknowledgments, transitions)']
