In [1]:
from tqdm import tqdm
import pandas as pd
from transformers import pipeline

classifier = pipeline("zero-shot-classification", 
                      model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli", 
                      device=0,
                      batch_size=8)

candidate_labels = [
    "women rights",
    "climate",
    "labor rights",
    "health care",
    "farmers",
    "environment",
    "public services",
    "palestine-israel conflict",
    "immigration",
    "unjust law enforcement",
    "ukraine-russia war",
    "discrimination",
    "education",
    "housing",
    "culture",
    "policies",
    "animal welfare",
    "pandemic",
    "lgbtq"
]

df = pd.read_csv('../data/filtered_events_country_code.csv')
texts = df['notes'].tolist()

batch_size = 8
predictions = []

for i in tqdm(range(0, len(texts), batch_size)):
    batch = texts[i:i + batch_size]
    results = classifier(batch, candidate_labels)
    predictions.extend([res['labels'][0] for res in results])

df['predicted_topic'] = predictions
df.to_csv('../data/topics_zsc.csv', index=False)

Device set to use cuda:0
  0%|          | 0/22885 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  0%|          | 10/22885 [00:07<4:27:57,  1.42it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 22885/22885 [3:01:31<00:00,  2.10it/s]  


In [2]:
import plotly.graph_objects as go
from plotly.offline import iplot, init_notebook_mode
import plotly.express as px
import plotly.io as pio
init_notebook_mode(True)

fig = px.bar(x=df["predicted_topic"].value_counts().index,y=df["predicted_topic"].value_counts(),color=df["predicted_topic"].value_counts().index,text=df["predicted_topic"].value_counts())
fig.update_traces(hovertemplate="Category:'%{x}' Counted: %{y}")
fig.update_layout(title={"text":"Category Counts","x":0.5,"font":{"size":35}},xaxis={"title":"Category","showgrid":False},yaxis={"title":"Value","showgrid":False},plot_bgcolor="white",width=800,height=500,showlegend=False)
iplot(fig)

In [10]:
df_true = pd.read_csv("../data/topics_evaluation.csv")

df_merged = pd.merge(df_true, df[['notes', 'predicted_topic']], on='notes', how='inner')


from sklearn.metrics import classification_report, accuracy_score

df_eval = df_merged[df_merged['class'] != 'unknown']

y_true = df_eval['class']
y_pred = df_eval['predicted_topic']

print("Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:\n", classification_report(y_true, y_pred))

Accuracy: 0.49194729136163984

Classification Report:
                            precision    recall  f1-score   support

           animal welfare       0.75      0.86      0.80         7
                  climate       0.00      0.00      0.00        29
                  culture       0.31      0.44      0.36         9
           discrimination       0.05      0.33      0.08         3
                education       0.69      0.64      0.67        53
              environment       0.30      0.75      0.43        24
                  farmers       0.91      0.92      0.92        53
              health care       0.25      0.70      0.37        27
                  housing       0.18      0.88      0.30         8
              immigration       0.00      0.00      0.00         2
             labor rights       0.90      0.34      0.49       205
                    lgbtq       0.50      0.33      0.40        12
palestine-israel conflict       0.91      0.75      0.82        40
      


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [11]:
print(df_eval)

                                                 notes               class  \
0    On 17 November 2022, doctors and nurses affili...         health care   
1    On 13 January 2022, at the call of eight teach...           education   
2    On 24 August 2022, Ukrainian activists gathere...  ukraine-russia war   
3    On 17 November 2020, students gathered outside...            pandemic   
4    On 2 May 2020, members of the newly-born movem...         health care   
..                                                 ...                 ...   
678  On 24 June 2022, an unknown number of farmers ...             farmers   
679  On 27 November 2024, Diageo workers staged a p...        labor rights   
680  On 20 June 2023, cleaning, catering, security ...        labor rights   
681  On 17 October 2024, around 30 undocumented mig...             housing   
682  On 25 May 2020, law enforcers in Slupia stoppe...            pandemic   

    real class     predicted_topic  
0      unknown         hea