In [None]:
from tqdm import tqdm
import pandas as pd
from transformers import pipeline
import os

# Zero-shot topic classifier backed by an NLI checkpoint.
# device=0 selects the first CUDA device; batch_size=8 is the pipeline's
# internal inference batch size.
classifier = pipeline(
    task="zero-shot-classification",
    model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
    device=0,
    batch_size=8,
)

# Topic labels offered to the zero-shot classifier; each note is assigned
# exactly one of these. Order is preserved from the original definition.
candidate_labels = [
    "women rights", "climate", "labor rights", "health care",
    "farmers", "environment", "public services",
    "palestine-israel conflict", "immigration",
    "unjust law enforcement", "ukraine-russia war",
    "discrimination", "education", "housing", "culture",
    "policies", "animal welfare", "pandemic", "lgbtq",
]

# Load the events table and pull out the free-text notes to classify.
df = pd.read_csv('../data/filtered_events_country_code.csv')
texts = df['notes'].tolist()

batch_size = 8
predictions = []

output_path = '../data/predicted_topics_progress.csv'

# If a progress file from an earlier run exists, pick up after its last row;
# otherwise start from the beginning with empty state.
resuming = os.path.exists(output_path)
df_done = pd.read_csv(output_path) if resuming else pd.DataFrame()
start_idx = len(df_done)  # an empty DataFrame has length 0
predicted_topics = df_done['predicted_topic'].tolist() if resuming else []
if resuming:
    print(f"Resuming from row {start_idx}...")


# Classify the remaining notes batch by batch, appending every finished batch
# to `output_path` so that an interrupted run can be resumed from the file.
for i in tqdm(range(start_idx, len(texts), batch_size), desc="Classifying", unit="batch"):
    batch_texts = texts[i:i + batch_size]
    results = classifier(batch_texts, candidate_labels, multi_label=False)

    # The pipeline may return a bare dict instead of a list; normalise to a
    # list so both shapes are handled the same way.
    if isinstance(results, dict):
        results = [results]
    # Keep the first entry of each result's 'labels' list as the prediction.
    batch_preds = [r['labels'][0] for r in results]

    predicted_topics.extend(batch_preds)

    # Rows of `df` that correspond to this batch (positional, like `texts`).
    df_batch = df.iloc[i:i + len(batch_preds)].copy()
    df_batch['predicted_topic'] = batch_preds

    # Always append. The header is written only when the file does not exist
    # yet, so earlier batches (including those from a previous, resumed run)
    # are never overwritten. The previous `i == start_idx` check was true on
    # every iteration because `start_idx` is advanced in lock-step with `i`,
    # which rewrote the file for each batch.
    write_header = not os.path.exists(output_path)
    df_batch.to_csv(output_path, mode='a', index=False, header=write_header)

    # Number of rows classified so far.
    start_idx = i + len(batch_preds)

# Expose the predictions on `df` for the cells below, which read
# df['predicted_topic']. Only attach when every row has a prediction.
if len(predicted_topics) == len(df):
    df['predicted_topic'] = predicted_topics
else:
    print(
        f"Warning: {len(predicted_topics)} predictions for {len(df)} rows; "
        "'predicted_topic' was not attached to df."
    )

Device set to use cuda:0


Resuming from row 8...


Classifying:   0%|          | 0/22884 [00:00<?, ?batch/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Classifying:  75%|███████▌  | 17240/22884 [2:25:48<57:17,  1.64batch/s]  

In [None]:
import plotly.graph_objects as go
from plotly.offline import iplot, init_notebook_mode
import plotly.express as px
import plotly.io as pio
init_notebook_mode(True)

# Count how many rows fall under each predicted topic; the same Series feeds
# the bar positions, heights, colours and text labels.
topic_counts = df["predicted_topic"].value_counts()

fig = px.bar(
    x=topic_counts.index,
    y=topic_counts,
    color=topic_counts.index,
    text=topic_counts,
)
fig.update_traces(hovertemplate="Category:'%{x}' Counted: %{y}")
fig.update_layout(
    title={"text": "Category Counts", "x": 0.5, "font": {"size": 35}},
    xaxis={"title": "Category", "showgrid": False},
    yaxis={"title": "Value", "showgrid": False},
    plot_bgcolor="white",
    width=800,
    height=500,
    showlegend=False,
)
iplot(fig)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Load the evaluation table. The path previously carried a stray pair of
# single quotes inside the string literal, so the file could never be found.
df_true = pd.read_csv('../data/topics_evaluation.csv')

# Attach the model's prediction to each evaluation row by matching on the
# cleaned note text; rows without a match on either side are dropped.
df_merged = pd.merge(df_true, df[['clean_notes', 'predicted_topic']], on='clean_notes', how='inner')

# Exclude rows whose reference topic is 'unknown' from scoring.
df_eval = df_merged[df_merged['true_topic'] != 'unknown']

# NOTE(review): rows are filtered on 'true_topic' but scored against 'class'.
# Presumably both columns exist in the evaluation file; confirm that 'class'
# is the intended ground-truth column.
y_true = df_eval['class']
y_pred = df_eval['predicted_topic']

print("Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:\n", classification_report(y_true, y_pred))