In [None]:
from tqdm import tqdm
import pandas as pd
from transformers import pipeline

classifier = pipeline("zero-shot-classification", 
                      model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli", 
                      device=0,
                      batch_size=8)
'''
candidate_labels = [
    "women rights",
    "climate",
    "labor rights",
    "health care",
    "farmers",
    "environment",
    "public services",
    "palestine-israel conflict",
    "immigration",
    "unjust law enforcement",
    "ukraine-russia war",
    "discrimination",
    "education",
    "housing",
    "culture",
    "policies",
    "animal welfare",
    "pandemic",
    "lgbtq"
]
'''
candidate_labels =[
    "women rights: Protests advocating for gender equality, reproductive rights, protection from gender-based violence, equal pay, and representation in politics and leadership roles.",
    "climate: Demonstrations focused on climate change, demanding urgent action to reduce carbon emissions, transition to renewable energy, and hold governments or corporations accountable for environmental damage.",
    "labor rights: Protests by workers or labor unions for fair wages, better working conditions, protection from exploitation, the right to organize, and opposition to layoffs or union-busting.",
    "health care: Protests demanding access to affordable and quality health care, opposition to privatization, or calling attention to deficiencies in hospitals, medical services, or health insurance systems.",
    "farmers: Protests by agricultural workers or rural communities over land rights, crop pricing, subsidies, agricultural reforms, or environmental impacts on farming.",
    "environment: Protests against pollution, deforestation, mining, industrial projects, or government inaction that harms natural ecosystems or biodiversity.",
    "public services: Demonstrations in support of better public transport, utilities, postal services, sanitation, or opposition to budget cuts or privatization of essential services.",
    "palestine-israel conflict: Protests addressing the Israeli-Palestinian conflict, including calls for ceasefire, condemnation of military actions, support for Palestinian or Israeli civilians, or broader geopolitical stances.",
    "immigration: Protests concerning immigration policies, refugee treatment, deportation, border enforcement, asylum rights, and xenophobia or anti-immigrant rhetoric.",
    "unjust law enforcement: Demonstrations against police brutality, racial profiling, excessive use of force, lack of accountability, or systemic abuse by law enforcement.",
    "ukraine-russia war: Protests related to the conflict in Ukraine, including opposition to Russian military aggression, support for Ukrainian sovereignty, calls for peace, or criticism of international responses.",
    "discrimination: Protests against racial, religious, ethnic, or disability-based discrimination, including civil rights marches, anti-hate movements, and demands for equality and justice.",
    "education: Demonstrations for accessible, equitable, and quality education; opposition to budget cuts, tuition hikes, or curriculum changes; or support for teachers and students.",
    "housing: Protests calling for affordable housing, rent control, opposition to evictions or gentrification, or action on homelessness and housing insecurity.",
    "culture: Protests about cultural preservation, opposition to censorship or erasure of indigenous or minority identities, or resistance to cultural appropriation or defunding of arts programs.",
    "policies: Protests targeting government or institutional policies, including new laws, reforms, or administrative decisions perceived as harmful, unjust, or controversial.",
    "animal welfare: Demonstrations advocating for animal rights, opposing factory farming, animal testing, abuse, or calling for stronger animal protection laws and ethical treatment.",
    "pandemic: Protests related to COVID-19 or other public health crises, including lockdowns, vaccine mandates, health measures, or economic relief demands.",
    "lgbtq: Protests for LGBTQ rights, including marriage equality, anti-discrimination protections, trans rights, and opposition to anti-LGBTQ legislation or rhetoric."
]

df = pd.read_csv('../data/filtered_events_country_code.csv')
texts = df['notes'].tolist()

batch_size = 8
predictions = []

for i in tqdm(range(0, len(texts), batch_size)):
    batch = texts[i:i + batch_size]
    results = classifier(batch, candidate_labels)
    predictions.extend([res['labels'][0] for res in results])

df['predicted_topic'] = predictions
df['predicted_topic'] = df['predicted_topic'].str.split(':').str[0]
df.to_csv('../data/topics_zsc.csv', index=False)

Device set to use cuda:0
  0%|          | 0/22885 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  0%|          | 10/22885 [00:07<4:20:31,  1.46it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 22885/22885 [3:45:18<00:00,  1.69it/s]  


In [2]:
import plotly.graph_objects as go
from plotly.offline import iplot, init_notebook_mode
import plotly.express as px
import plotly.io as pio
init_notebook_mode(True)

fig = px.bar(x=df["predicted_topic"].value_counts().index,y=df["predicted_topic"].value_counts(),color=df["predicted_topic"].value_counts().index,text=df["predicted_topic"].value_counts())
fig.update_traces(hovertemplate="Category:'%{x}' Counted: %{y}")
fig.update_layout(title={"text":"Category Counts","x":0.5,"font":{"size":35}},xaxis={"title":"Category","showgrid":False},yaxis={"title":"Value","showgrid":False},plot_bgcolor="white",width=800,height=500,showlegend=False)
iplot(fig)

In [3]:
df_true = pd.read_csv("../data/topics_evaluation.csv")

df_merged = pd.merge(df_true, df[['notes', 'predicted_topic']], on='notes', how='inner')


from sklearn.metrics import classification_report, accuracy_score

df_eval = df_merged[df_merged['class'] != 'unknown']

y_true = df_eval['class']
y_pred = df_eval['predicted_topic']

print("Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:\n", classification_report(y_true, y_pred))

Accuracy: 0.47144948755490484

Classification Report:
                            precision    recall  f1-score   support

           animal welfare       0.11      0.86      0.19         7
                  climate       0.81      0.86      0.83        29
                  culture       0.29      0.78      0.42         9
           discrimination       0.00      0.00      0.00         3
                education       0.39      0.85      0.54        53
              environment       0.64      0.29      0.40        24
                  farmers       0.97      0.62      0.76        53
              health care       0.38      0.52      0.44        27
                  housing       0.62      0.62      0.62         8
              immigration       0.07      1.00      0.14         2
             labor rights       0.81      0.62      0.70       205
                    lgbtq       0.10      0.67      0.17        12
palestine-israel conflict       0.72      0.57      0.64        40
      


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [4]:
print(df_eval)

                                                 notes               class  \
0    On 17 November 2022, doctors and nurses affili...         health care   
1    On 13 January 2022, at the call of eight teach...           education   
2    On 24 August 2022, Ukrainian activists gathere...  ukraine-russia war   
3    On 17 November 2020, students gathered outside...            pandemic   
4    On 2 May 2020, members of the newly-born movem...         health care   
..                                                 ...                 ...   
678  On 24 June 2022, an unknown number of farmers ...             farmers   
679  On 27 November 2024, Diageo workers staged a p...        labor rights   
680  On 20 June 2023, cleaning, catering, security ...        labor rights   
681  On 17 October 2024, around 30 undocumented mig...             housing   
682  On 25 May 2020, law enforcers in Slupia stoppe...            pandemic   

    real class         predicted_topic  
0      unknown        