In [1]:
from tqdm import tqdm
import pandas as pd
from transformers import pipeline


df_true = pd.read_csv("../data/labeled.csv")

df = df_true[df_true['class'] != 'unknown']
print(df)
print(df["class"].value_counts())

        Unnamed: 0 event_id_cnty       event_date  year  time_precision  \
0                0       BEL4179      23 May 2025  2025               1   
1                1       BEL4183      23 May 2025  2025               1   
2                2       BGR4378      23 May 2025  2025               1   
3                3       BGR4379      23 May 2025  2025               1   
4                4       BGR4380      23 May 2025  2025               1   
...            ...           ...              ...   ...             ...   
183058      183058          ROU2  08 January 2018  2018               1   
183059      183059       GRC2059  08 January 2018  2018               1   
183063      183063          HRV1  07 January 2018  2018               1   
183069      183069          BGR2  04 January 2018  2018               2   
183077      183077          BGR1  03 January 2018  2018               1   

         disorder_type event_type             sub_event_type  \
0       Demonstrations   Protests  

  df_true = pd.read_csv("../data/labeled.csv")


In [2]:
classifier = pipeline("zero-shot-classification", 
                      model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli", 
                      device=0,
                      batch_size=8)

candidate_labels = [
    "women rights",
    "climate",
    "labor rights",
    "health care",
    "farmers",
    "environment",
    "public services",
    "palestine-israel conflict",
    "immigration",
    "unjust law enforcement",
    "ukraine-russia war",
    "discrimination",
    "education",
    "housing",
    "culture",
    "policies",
    "animal welfare",
    "pandemic",
    "lgbtq"
]

texts = df['notes'].tolist()

batch_size = 8
predictions = []

for i in tqdm(range(0, len(texts), batch_size)):
    batch = texts[i:i + batch_size]
    results = classifier(batch, candidate_labels)
    predictions.extend([res['labels'][0] for res in results])

df['predicted_topic'] = predictions
df['predicted_topic'] = df['predicted_topic'].str.split(':').str[0]
df.to_csv('../data/topics_zsc.csv', index=False)

Device set to use cuda:0
  0%|          | 0/15312 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  0%|          | 10/15312 [00:06<2:27:19,  1.73it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 15312/15312 [2:09:50<00:00,  1.97it/s]  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predicted_topic'] = predictions
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pred

In [3]:
import plotly.graph_objects as go
from plotly.offline import iplot, init_notebook_mode
import plotly.express as px
import plotly.io as pio
init_notebook_mode(True)

fig = px.bar(x=df["predicted_topic"].value_counts().index,y=df["predicted_topic"].value_counts(),color=df["predicted_topic"].value_counts().index,text=df["predicted_topic"].value_counts())
fig.update_traces(hovertemplate="Category:'%{x}' Counted: %{y}")
fig.update_layout(title={"text":"Category Counts","x":0.5,"font":{"size":35}},xaxis={"title":"Category","showgrid":False},yaxis={"title":"Value","showgrid":False},plot_bgcolor="white",width=800,height=500,showlegend=False)
iplot(fig)

In [4]:
df_true = pd.read_csv("../data/labeled.csv")

df_merged = pd.merge(df_true, df[['notes', 'predicted_topic']], on='notes', how='inner')


from sklearn.metrics import classification_report, accuracy_score

df_eval = df_merged[df_merged['class'] != 'unknown']

y_true = df_eval['class']
y_pred = df_eval['predicted_topic']

print("Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:\n", classification_report(y_true, y_pred))


Columns (12) have mixed types. Specify dtype option on import or set low_memory=False.



Accuracy: 0.4991591424886117

Classification Report:
                            precision    recall  f1-score   support

           animal welfare       0.82      0.81      0.82      1336
                  climate       0.56      0.02      0.04      5569
                  culture       0.29      0.43      0.35      1508
           discrimination       0.11      0.63      0.18       763
                education       0.61      0.68      0.64      8468
              environment       0.33      0.81      0.47      4483
                  farmers       0.95      0.91      0.93      9668
              health care       0.25      0.63      0.36      5174
                  housing       0.17      0.92      0.29       820
              immigration       0.21      0.32      0.25       720
             labor rights       0.91      0.30      0.46     36309
                    lgbtq       0.70      0.52      0.60      1681
palestine-israel conflict       0.96      0.71      0.82      7425
       

In [5]:
print(df_eval)

        Unnamed: 0 event_id_cnty       event_date  year  time_precision  \
0                0       BEL4179      23 May 2025  2025               1   
1                1       BEL4183      23 May 2025  2025               1   
2                2       BGR4378      23 May 2025  2025               1   
3                3       BGR4379      23 May 2025  2025               1   
4                4       BGR4380      23 May 2025  2025               1   
...            ...           ...              ...   ...             ...   
122489      183058          ROU2  08 January 2018  2018               1   
122490      183059       GRC2059  08 January 2018  2018               1   
122491      183063          HRV1  07 January 2018  2018               1   
122492      183069          BGR2  04 January 2018  2018               2   
122493      183077          BGR1  03 January 2018  2018               1   

         disorder_type event_type             sub_event_type  \
0       Demonstrations   Protests  