In [1]:
import pandas as pd
import re

In [2]:
file_name = '../data/filtered_events_country_code.csv'

df = pd.read_csv(file_name, delimiter=',')

In [3]:
target_events = [
    "Protest with intervention",
    "Violent demonstration",
    "Mob violence"
]

violent_notes = df[
    df['notes'].str.contains('violent', case=False, na=False) &
    df['sub_event_type'].isin(target_events)
]

print("Notes that contain the word violent but were not classified as violent: \n")
for note in violent_notes['notes']:
    print(note)
    print('---')

Notes that contain the word violent but were not classified as violent: 

On 15 May 2025, in the evening, around 1,100 people, including The Left, Antifa and the DKP gathered for a pro-Palestinian demonstration in Berlin - Friedrichshain-Kreuzberg (Berlin) against Israeli action in Gaza and in remembrance of the Nakba Day marking the displacement of Palestinians in 1948, including calls for freedom for Palestine. The demonstration turned violent, including throwing bottles, cans, and red paint at police, resulting in a police officer being seriously injured after being dragged into the crowd, trampled, and hospitalized with a broken arm, while several other officers also sustained injuries. Between 30 to 56 demonstrators were arrested due to various offenses and the police dissolved the event. Around 15 people gathered for a counter-demonstration nearby, waving Israeli flags and holding banners against Hamas.
---
On 10 May 2025, in the afternoon, at the call of the 9 May Committee (Com

In [4]:
fatality_count = df['fatalities'].value_counts()
print(fatality_count)

fatalities
0    183075
2         3
1         1
3         1
Name: count, dtype: int64


In [5]:
disorder_counts = df['sub_event_type'].value_counts()

print(disorder_counts, "\n")

sub_event_type
Peaceful protest                      174013
Protest with intervention               4673
Violent demonstration                   4320
Excessive force against protesters        74
Name: count, dtype: int64 



In [6]:
target_events = [
    "Violent demonstration",
    "Mob violence"
    #"Protest with intervention"
]

df['country_code'] = df['event_id_cnty'].astype(str).str[:3]
country_code_counts = df['country_code'].value_counts()

total_events = country_code_counts.sum()
country_percentages = (country_code_counts / total_events) * 100

violent_df = df[df['sub_event_type'].isin(target_events)].copy()
violent_df['country_code'] = violent_df['event_id_cnty'].astype(str).str[:3]

violent_counts = violent_df['country_code'].value_counts()

summary = pd.DataFrame({
    'total_protests': country_code_counts,
    'percentage_of_all': country_percentages.round(2),
    'violent_protests': violent_counts
})

summary['violent_protests'] = summary['violent_protests'].fillna(0).astype(int)

summary['percent_violent'] = (
    summary['violent_protests'] / summary['total_protests'] * 100
).round(2)
summary = summary.sort_values(by='percent_violent', ascending=False)


print("Total protests per country not noted as peaceful protests:\n")
print(summary.to_string())


Total protests per country not noted as peaceful protests:

              total_protests  percentage_of_all  violent_protests  percent_violent
country_code                                                                      
GRC                     4670               2.55               582            12.46
ALB                     1251               0.68               134            10.71
FRA                    38774              21.18              1421             3.66
NLD                     4380               2.39               141             3.22
CYP                     1784               0.97                48             2.69
CHE                     1041               0.57                28             2.69
DEU                    23792              13.00               569             2.39
AUT                     2033               1.11                46             2.26
GBR                     8210               4.48               176             2.14
ESP                    2076

In [7]:
actor_counts = df['assoc_actor_1'].dropna().value_counts()
total_actor_mentions = actor_counts.sum()
actor_percentages = (actor_counts / total_actor_mentions) * 100

violent_actors = violent_df['assoc_actor_1'].dropna().value_counts()

actor_summary = pd.DataFrame({
    'total_protests': actor_counts,
    'percentage_of_all': actor_percentages.round(2),
    'violent_protests': violent_actors
})

actor_summary['violent_protests'] = actor_summary['violent_protests'].fillna(0).astype(int)
actor_summary['percent_violent'] = (
    actor_summary['violent_protests'] / actor_summary['total_protests'] * 100
).round(2)

actor_summary.index = actor_summary.index.str.slice(0, 50)

actor_summary = actor_summary.sort_values(by='total_protests', ascending=False)

print("Total protests per actor not noted as peaceful:\n")
print(actor_summary.to_string())

Total protests per actor not noted as peaceful:

                                                    total_protests  percentage_of_all  violent_protests  percent_violent
assoc_actor_1                                                                                                           
Labor Group (France)                                          3540               2.61                94             2.66
XR: Extinction Rebellion                                      3202               2.36                10             0.31
Labor Group (Spain)                                           2906               2.14                77             2.65
Labor Group (Italy)                                           2462               1.82                11             0.45
FFF: Fridays for Future; Students (Sweden)                    2383               1.76                 0             0.00
CGT: General Confederation of Labor (France); Labo            2024               1.49                35 

In [8]:
event_types = df['event_type'].value_counts()
print(event_types)

event_type
Protests    178760
Riots         4320
Name: count, dtype: int64


In [9]:
interaction_types = df['interaction'].value_counts()
print(interaction_types)

interaction
60    171518
16      4345
15      2798
66      2500
50      1068
56       332
55       212
57       188
68        74
58        31
36        13
35         1
Name: count, dtype: int64


In [10]:
df['country_code'] = df['event_id_cnty'].astype(str).str[:3]

country_counts = df['country_code'].value_counts()
total_events = country_counts.sum()
country_percentages = (country_counts / total_events) * 100

riots_df = df[df['event_type'] == 'Riots'].copy()
riots_df['country_code'] = riots_df['event_id_cnty'].astype(str).str[:3]

riot_counts = riots_df['country_code'].value_counts()

summary = pd.DataFrame({
    'total_protests': country_counts,
    'percentage_of_all': country_percentages.round(2),
    'riot_protests': riot_counts
})

summary['riot_protests'] = summary['riot_protests'].fillna(0).astype(int)

summary['percent_riots'] = (
    summary['riot_protests'] / summary['total_protests'] * 100
).round(2)

summary = summary.sort_values(by='percent_riots', ascending=False)

print("Riots per country:\n")
print(summary.to_string())


Riots per country:

              total_protests  percentage_of_all  riot_protests  percent_riots
country_code                                                                 
GRC                     4670               2.55            582          12.46
ALB                     1251               0.68            134          10.71
FRA                    38774              21.18           1421           3.66
NLD                     4380               2.39            141           3.22
CYP                     1784               0.97             48           2.69
CHE                     1041               0.57             28           2.69
DEU                    23792              13.00            569           2.39
AUT                     2033               1.11             46           2.26
GBR                     8210               4.48            176           2.14
ESP                    20764              11.34            396           1.91
ITA                    25436              13

In [11]:
actor_counts = df['assoc_actor_1'].dropna().value_counts()
total_actor_mentions = actor_counts.sum()
actor_percentages = (actor_counts / total_actor_mentions) * 100

riots_df = df[df['event_type'] == 'Riots']
riot_actor_counts = riots_df['assoc_actor_1'].dropna().value_counts()

actor_summary = pd.DataFrame({
    'total_protests': actor_counts,
    'percentage_of_all': actor_percentages.round(2),
    'riot_protests': riot_actor_counts
})

actor_summary['riot_protests'] = actor_summary['riot_protests'].fillna(0).astype(int)
actor_summary['percent_riots'] = (
    actor_summary['riot_protests'] / actor_summary['total_protests'] * 100
).round(2)

actor_summary.index = actor_summary.index.str.slice(0, 50)

actor_summary = actor_summary.sort_values(by='total_protests', ascending=False)

print("Riots per actor:\n")
print(actor_summary.to_string())


Riots per actor:

                                                    total_protests  percentage_of_all  riot_protests  percent_riots
assoc_actor_1                                                                                                      
Labor Group (France)                                          3540               2.61             94           2.66
XR: Extinction Rebellion                                      3202               2.36             10           0.31
Labor Group (Spain)                                           2906               2.14             77           2.65
Labor Group (Italy)                                           2462               1.82             11           0.45
FFF: Fridays for Future; Students (Sweden)                    2383               1.76              0           0.00
CGT: General Confederation of Labor (France); Labo            2024               1.49             35           1.73
CFDT: French Democratic Confederation of Labor; CF    

In [12]:
df = pd.read_csv('../data/filtered_events_country_code.csv')

def conditional_bracket_removal(s):
    s = str(s).strip()
    s_no_brackets = re.sub(r'\(.*?\)', '', s).strip()
    if len(s_no_brackets) <= 12:
        return s_no_brackets
    else:
        return s

if 'assoc_actor_1' in df.columns:
    df['assoc_actor_1'] = df['assoc_actor_1'].apply(conditional_bracket_removal)

df.to_csv('../data/filtered_events_country_code_brackets_removed.csv', index=False)


In [16]:
df = pd.read_csv('../data/filtered_events_country_code.csv')

def conditional_bracket_removal(s):
    s = str(s).strip()
    s_no_brackets = re.sub(r'\(.*?\)', '', s).strip()
    return s_no_brackets

if 'assoc_actor_1' in df.columns:
    df['assoc_actor_1'] = df['assoc_actor_1'].apply(conditional_bracket_removal)

def split_rows_on_actors(df, actor_col='assoc_actor_1', delimiter=';'):
    df_expanded = df.copy()
    df_expanded[actor_col] = df_expanded[actor_col].fillna('').astype(str)

    df_expanded[actor_col] = df_expanded[actor_col].apply(lambda x: [a.strip() for a in x.split(delimiter)] if x else [''])

    df_exploded = df_expanded.explode(actor_col).reset_index(drop=True)
    return df_exploded

df_split = split_rows_on_actors(df)

df_split.to_csv('../data/filtered_events_country_code_actor_split.csv', index=False)

In [14]:
import pandas as pd
import numpy as np
import re

# Load your dataset
# df = pd.read_csv('your_dataset.csv')  # Uncomment and replace with actual path

# Assign violent = 1 for violent event types, else 0
violent_keywords = ['Violent demonstration', 'Riots', 'Armed conflict', 'Mob violence']
df['violent'] = df['event_type'].isin(violent_keywords).astype(int)

df.to_csv('../data/violent_bool.csv', index=False)

# -------------------------------------------------------------------
# Country-level violence summary
df['country_code'] = df['event_id_cnty'].astype(str).str[:3]

violent_counts = df[df['violent'] == 1]['country_code'].value_counts()
non_violent_counts = df[df['violent'] == 0]['country_code'].value_counts()
total_counts = df['country_code'].value_counts()

country_counts = pd.DataFrame({
    'violent': violent_counts,
    'non_violent': non_violent_counts,
    'total': total_counts
}).fillna(0)

country_counts['violent_percentage'] = (
    country_counts['violent'] / (country_counts['violent'] + country_counts['non_violent']) * 100
)
country_counts.to_csv('../data/violent_per_country.csv')

# -------------------------------------------------------------------
# Actor-level violence analysis

# Drop rows with missing actors
df = df.dropna(subset=['assoc_actor_1'])

# Split multiple actors per row
df['assoc_actor_1'] = df['assoc_actor_1'].str.split(';')
df = df.explode('assoc_actor_1')

# Define function to conditionally remove brackets
def conditional_bracket_removal(s):
    s = str(s).strip()
    s_no_brackets = re.sub(r'\(.*?\)', '', s).strip()
    if len(s_no_brackets) <= 12:
        return s_no_brackets
    else:
        return s

# Clean actor names
df['assoc_actor_1'] = df['assoc_actor_1'].apply(conditional_bracket_removal)
df['assoc_actor_1'] = df['assoc_actor_1'].str.strip()
df = df[df['assoc_actor_1'] != '']  # Drop empty results

# Add 'country' column from first 3 characters of event_id_cnty
df['country'] = df['event_id_cnty'].astype(str).str[:3]

# Group by both actor and country
grouped = df.groupby(['assoc_actor_1', 'country'])

violent_actor_counts = grouped['violent'].sum()
total_actor_counts = grouped['violent'].count()
non_violent_actor_counts = total_actor_counts - violent_actor_counts

# Create final DataFrame
actor_counts = pd.DataFrame({
    'violent': violent_actor_counts,
    'non_violent': non_violent_actor_counts,
    'total': total_actor_counts
}).reset_index()

actor_counts['violent_percentage'] = (
    actor_counts['violent'] / (actor_counts['violent'] + actor_counts['non_violent']) * 100
)

# Sort by total
actor_counts = actor_counts.sort_values(by='total', ascending=False)

# Save and display
actor_counts.to_csv('../data/violent_per_actor.csv', index=False)

from IPython.display import display, HTML
display(HTML(actor_counts.to_html(max_rows=1000, max_cols=20, notebook=True)))


Unnamed: 0,assoc_actor_1,country,violent,non_violent,total,violent_percentage
2216,Labor Group,FRA,404,19213,19617,2.059438
555,CGT: General Confederation of Labor,FRA,236,11821,12057,1.957369
2223,Labor Group,ITA,71,10696,10767,0.659422
4308,,DEU,348,10072,10420,3.339731
2213,Labor Group,ESP,138,7529,7667,1.799922
3546,SUD: Democratic Unitary Solidarity,FRA,178,7251,7429,2.396016
4313,,FRA,720,6005,6725,10.706320
1147,FO: Workers' Force,FRA,161,6387,6548,2.458766
1159,FSU: United Federation of Trade Unions,FRA,164,6184,6348,2.583491
545,CFDT: French Democratic Confederation of Labor,FRA,153,5269,5422,2.821837


In [15]:
# Save and display
actor_counts.to_csv('../data/violent_per_actor.csv')

from IPython.display import display, HTML
display(HTML(actor_counts.to_html(max_rows=1000, max_cols=20, notebook=True)))

Unnamed: 0,assoc_actor_1,country,violent,non_violent,total,violent_percentage
2216,Labor Group,FRA,404,19213,19617,2.059438
555,CGT: General Confederation of Labor,FRA,236,11821,12057,1.957369
2223,Labor Group,ITA,71,10696,10767,0.659422
4308,,DEU,348,10072,10420,3.339731
2213,Labor Group,ESP,138,7529,7667,1.799922
3546,SUD: Democratic Unitary Solidarity,FRA,178,7251,7429,2.396016
4313,,FRA,720,6005,6725,10.706320
1147,FO: Workers' Force,FRA,161,6387,6548,2.458766
1159,FSU: United Federation of Trade Unions,FRA,164,6184,6348,2.583491
545,CFDT: French Democratic Confederation of Labor,FRA,153,5269,5422,2.821837
