In [None]:
import pandas as pd
# Load the labelled events, keep only the label and text columns, and drop the
# rows whose label is the literal string 'NoN'.
# NOTE(review): 'NoN' is presumably a placeholder for unlabelled rows — confirm;
# rows whose class is a real NaN are NOT removed by this comparison.
# The trailing .copy() makes `dataset` an independent frame, so later column
# assignments on it do not raise pandas' SettingWithCopyWarning.
dataset = (
    pd.read_csv('../data/filtered_events_class.csv')[['class', 'clean_notes']]
    .loc[lambda frame: frame['class'] != 'NoN']
    .copy()
)

In [37]:
# Tally how many rows carry each class label; the bare last expression makes
# the notebook display the resulting Series.
class_counts = dataset["class"].value_counts()
class_counts

class
labor rights                 29841
education                    22662
culture                       7754
environment                   6881
palestine-israel conflict     6147
ukraine-russia war            5585
infrastructure                5427
climate                       4699
health care                   4007
political                     3952
women rights                  3408
farmers                       3266
unjust law enforcement        2217
democracy                     1924
animal welfare                1353
public services               1067
discrimination                 962
eviction                       828
immigration                    603
tourism                        444
social welfare                 330
anti crime                     116
skip                            91
local policy                    35
housing                         33
cyprus conflict                 31
sudan conflict                  26
youth violence                   8
economic justi

In [38]:
import plotly.graph_objects as go
from plotly.offline import iplot, init_notebook_mode
import plotly.express as px
import plotly.io as pio

# connected=True: plotly.js is pulled from the CDN instead of being embedded
# in the notebook.
init_notebook_mode(connected=True)

# Count the rows per class once and reuse the result for every chart argument.
category_counts = dataset["class"].value_counts()

# One bar per class, coloured by class and labelled with its count.
fig = px.bar(
    x=category_counts.index,
    y=category_counts,
    color=category_counts.index,
    text=category_counts,
)
fig.update_traces(hovertemplate="Category:'%{x}' Counted: %{y}")
fig.update_layout(
    title=dict(text="Category Counts", x=0.5, font=dict(size=35)),
    xaxis=dict(title="Category", showgrid=False),
    yaxis=dict(title="Value", showgrid=False),
    plot_bgcolor="white",
    width=800,
    height=500,
    showlegend=False,
)
iplot(fig)

In [39]:
# Bar chart of rows per class (same chart definition as the preceding cell).
counts_by_class = dataset["class"].value_counts()

fig = px.bar(
    x=counts_by_class.index,
    y=counts_by_class,
    color=counts_by_class.index,
    text=counts_by_class,
)
fig.update_traces(hovertemplate="Category:'%{x}' Counted: %{y}")
fig.update_layout(
    title={"text": "Category Counts", "x": 0.5, "font": {"size": 35}},
    xaxis={"title": "Category", "showgrid": False},
    yaxis={"title": "Value", "showgrid": False},
    plot_bgcolor="white",
    width=800,
    height=500,
    showlegend=False,
)
iplot(fig)

In [None]:
# Maps each fine-grained class label found in the data to the merged,
# title-cased category it is folded into. Labels that are not listed here are
# left unchanged by Series.replace.
class_mapping = {
    # --- one-to-one renames ---
    'labor rights': 'Labor Rights',
    'education': 'Education',
    'culture': 'Culture',
    # The data label is lowercase ('environment'); the key used to be spelled
    # 'Environment', so this class was never renamed.
    'environment': 'Environment',
    'infrastructure': 'Infrastructure',

    # --- armed conflicts ---
    'palestine-israel conflict': 'Palestine-Israel Conflict',
    # NOTE(review): the Cyprus and Sudan conflicts are folded into the
    # Palestine-Israel category — confirm this is intentional.
    'cyprus conflict': 'Palestine-Israel Conflict',
    'sudan conflict': 'Palestine-Israel Conflict',
    'ukraine-russia war': 'Ukraine-Russia War',
    "military": "Ukraine-Russia War",

    # --- politics and governance ---
    'political rights': 'Political & Democratic Governance',
    'political': 'Political & Democratic Governance',
    'political criticism': 'Political & Democratic Governance',
    'anti right wing': 'Political & Democratic Governance',
    'democracy': 'Political & Democratic Governance',
    'local policy': 'Political & Democratic Governance',

    # --- public services ---
    'health care': 'Public Services & Social Welfare',
    'public services': 'Public Services & Social Welfare',
    'social welfare': 'Public Services & Social Welfare',
    'housing': 'Public Services & Social Welfare',
    'eviction': 'Public Services & Social Welfare',

    # --- justice and civil rights ---
    'women rights': 'Justice & Civil Rights',
    'discrimination': 'Justice & Civil Rights',
    'unjust law enforcement': 'Justice & Civil Rights',
    'economic justice': 'Justice & Civil Rights',
    'anti crime': 'Justice & Civil Rights',
    'youth violence': 'Justice & Civil Rights',

    # --- climate ---
    # NOTE(review): two distinct "Climate Action & ..." categories exist below;
    # confirm they are meant to stay separate.
    'climate': 'Climate Action & Animal Welfare',
    'animal welfare': 'Climate Action & Animal Welfare',
    'farmers': 'Climate Action & Resource Management',
    # NOTE(review): 'immigration', 'tourism' and 'skip' also land in the
    # resource-management category — verify this grouping.
    'immigration': 'Climate Action & Resource Management',
    'tourism': 'Climate Action & Resource Management',
    'skip': 'Climate Action & Resource Management',

    # Identity entry: the label 'M' is kept as-is.
    'M': 'M',
}

def merge_classes(df, class_mapping):
    """Return a copy of ``df`` whose 'class' labels are merged via ``class_mapping``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'class' column.
    class_mapping : dict
        Maps existing labels to their replacement label. Labels that are not
        keys of the mapping are left unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``df`` is not modified.
    """
    # Work on a copy: assigning a column on a frame that was produced by
    # filtering another frame triggers SettingWithCopyWarning and would also
    # silently change the caller's data.
    merged = df.copy()
    merged['class'] = merged['class'].replace(class_mapping)
    return merged

In [41]:
# Fold the fine-grained class labels into the merged categories.
dataset = merge_classes(df=dataset, class_mapping=class_mapping)

def keep_first_n(df, n, column='class'):
    """Keep at most ``n`` rows for every distinct value of ``column``.

    Rows are taken in the order in which they appear in ``df`` (nothing is
    shuffled or sampled); the original index and row order are preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to truncate.
    n : int
        Maximum number of rows kept per group.
    column : str, default 'class'
        Column whose values define the groups.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``df``.
    """
    return df.groupby(column).head(n)
# Cap every class at its first 6000 rows.
dataset = keep_first_n(df=dataset, n=6000)


In [42]:
# Bar chart of rows per class for the current state of `dataset`.
merged_counts = dataset["class"].value_counts()

fig = px.bar(
    x=merged_counts.index,
    y=merged_counts,
    color=merged_counts.index,
    text=merged_counts,
)
fig.update_traces(hovertemplate="Category:'%{x}' Counted: %{y}")
fig.update_layout(
    title=dict(text="Category Counts", x=0.5, font=dict(size=35)),
    xaxis=dict(title="Category", showgrid=False),
    yaxis=dict(title="Value", showgrid=False),
    plot_bgcolor="white",
    width=800,
    height=500,
    showlegend=False,
)
iplot(fig)

In [None]:
# Accumulates one record per topic for the sunburst chart built below.
data_for_sunburst = []

# Row counts per fine-grained topic label, hard-coded rather than computed
# from `dataset`.
# NOTE(review): several values differ from the value_counts() output shown
# earlier in this notebook (e.g. education 22958 here vs 22662 there, culture
# 7818 vs 7754), and 'political' (3952 in that output) has no entry here —
# confirm which data snapshot these numbers were taken from.
original_topic_counts = {
    'labor rights': 29841,
    'education': 22958,
    'culture': 7818,
    'environment': 6881,
    'palestine-israel conflict': 6147,
    'ukraine-russia war': 5690,
    'infrastructure': 5427,
    'climate': 4699,
    'health care': 4007,
    'women rights': 3408,
    'farmers': 3266,
    'anti right wing': 2466,
    'democracy': 2350,
    'unjust law enforcement': 2235,
    'animal welfare': 1353,
    'public services': 1070,
    'discrimination': 1004,
    'eviction': 832,
    'immigration': 688,
    'tourism': 444,
    'social welfare': 331,
    'political rights': 321,
    'political criticism': 120,
    'anti crime': 117,
    'skip': 91,
    'local policy': 35,
    'housing': 33,
    'cyprus conflict': 31,
    'sudan conflict': 26,
    'youth violence': 8,
    'economic justice': 8,
    'military': 2
}


# Turn every hard-coded topic count into one sunburst record, tagged with the
# merged category that class_mapping assigns to that topic.
for original_topic, count in original_topic_counts.items():
    # class_mapping may spell this one key with a capital 'E'; use that
    # spelling whenever it is present.
    use_capitalised = original_topic == 'environment' and 'Environment' in class_mapping
    lookup_key = 'Environment' if use_capitalised else original_topic

    # Topics without a mapping entry are reported and left out of the chart.
    if lookup_key not in class_mapping:
        print(f"Warning: Original topic '{original_topic}' not found in class_mapping. It will not be plotted.")
        continue

    data_for_sunburst.append({
        'Category': class_mapping[lookup_key],
        'Subcategory': original_topic,
        'Count': count
    })

df_sunburst = pd.DataFrame(data_for_sunburst)

# Two-level sunburst: merged category on the inner ring, original topic on the
# outer ring, wedge size given by the 'Count' column.
sunburst_title = "Hierarchical Distribution of Topic Categories"

fig = px.sunburst(
    df_sunburst,
    path=['Category', 'Subcategory'],
    values='Count',
    hover_data={'Count': ':.0f'},
    title=sunburst_title,
    width=800,
    height=700,
)

# Centre the title and remove the side/bottom margins.
fig.update_layout(
    title=dict(text=sunburst_title, x=0.5, font=dict(size=25)),
    margin={"t": 50, "l": 0, "r": 0, "b": 0},
)

fig.show()

In [46]:
# Write the current `dataset` to CSV without the index column.
# NOTE(review): this is the same path the notebook reads its input from, so the
# source file is overwritten and a later Restart & Run All starts from the
# already-transformed data — consider writing to a separate output file.
output_csv = '../data/filtered_events_class.csv'
dataset.to_csv(output_csv, index=False)