# IMPORTING

In [93]:
import pandas as pd
import numpy as np
import re
from text_to_num import text2num
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## Loading Data

In [68]:
# Folder holding the classified-protest export, and the CSV inside it.
data_dir = "/Users/ofekzini/Documents/Data Engineering/Fall 2024/ויזואליזציה/Project/Data"
file_path = data_dir + "/Protest_Classification_Results.csv"

# Load the CSV into the working DataFrame that every later cell operates on.
data = pd.read_csv(filepath_or_buffer=file_path)

In [70]:
# Remove every occurrence of the literal text "crowd size=" from the tags,
# then trim leading/trailing whitespace so the values can be used as lookup keys.
tags_without_label = data['tags'].str.replace(r'crowd size=', '', regex=True)
data['tags'] = tags_without_label.str.strip()

In [80]:
# Lookup table from a cleaned crowd-size tag (the text in data['tags']) to a
# numeric estimate. Values are:
#   * an int      - a single estimate,
#   * None        - the tag carries no usable number,
#   * [low, high] - the tag describes a range.
#
# Every key appears exactly once. The original literal repeated several keys;
# Python keeps only the LAST value of a repeated key, so the earlier entries
# were dead. Where the repeated entries disagreed, the value that was actually
# in effect is kept and the discarded one is mentioned in a comment.
data_dict = {
    # --- tags containing digits, plus a few common vague phrases -----------
    'roughly 35': 35,
    'no report': None,
    'counter-demonstration; no report': None,
    'counter-demonstration; about 104': 104,
    'counter-demonstration; about two dozen to around 100': [24, 100],
    'over 500': 500,
    'about 30': 30,
    '2': 2,
    'hundreds': 200,
    'around 100': 100,
    'around 60': 60,
    'dozens': 36,  # Interpreted as 3 dozen
    'about 100': 100,
    'around 200': 200,
    'counter-demonstration; over 250': 250,
    'at least two dozen': 24,
    'around 40': 40,
    'more than 55': 56,
    'counter-demonstration; hundreds': 200,
    'around 10': 10,
    'large': None,  # Cannot quantify
    'a few dozen': 36,  # 3 dozen assumed
    'handful': 5,
    'more than 80': 81,
    'over 1,000': 1000,
    '1': 1,
    'around 20': 20,
    'counter-demonstration; at least 17': 17,
    'more than 70': 71,
    'about 40': 40,
    'about 50': 50,
    'nearly 50': 49,
    'about 20-30': [20, 30],
    'between 60 and 80': [60, 80],
    'about 15': 15,
    '25': 25,
    'more than 60': 61,
    'over 30': 31,
    'roughly 200': 200,
    'counter-demonstration; more than 50': 51,
    '10': 10,
    'around 25': 25,
    'about 112': 112,
    'counter-demonstration; around 70 to 180': [70, 180],
    'several hundred': 300,
    'counter-demonstration; about 110': 110,
    'over 150': 151,
    'more than 40': 41,
    'more than 100': 101,
    '40': 40,
    'roughly 25 to around 30': [25, 30],
    'about a dozen': 12,
    'roughly a dozen': 12,
    'group of 20 or so': 20,
    'around 30 to 40': [30, 40],
    'a few hundred': 300,
    'counter-demonstration; upwards of 300 to about 500': [300, 500],
    'over 50': 51,
    'five': 5,
    'tens of thousands': 10000,
    'roughly 300': 300,
    'counter-demonstration; between 50 and 100': [50, 100],
    'less than a dozen': 11,
    'counter-demonstration; several hundred': 300,
    '30': 30,
    'more than 150': 151,
    '20': 20,
    'counter-demonstration; about 200': 200,
    'counter-demonstration; more than 30': 31,
    '6': 6,
    'counter-demonstration; nearly 100': 99,
    'roughly 30': 30,
    # Was listed twice (10000, then 20000); 20000 was the value in effect.
    'counter-demonstration; tens of thousands': 20000,
    'counter-demonstration; over 1,000': 1000,
    # Was listed three times (48, 48, then 36); 36 was the value in effect.
    'several dozen': 36,
    'fewer than 10': 9,
    # Was listed twice (None, then 10); 10 was the value in effect.
    'small': 10,  # Assuming 'small' refers to a small number, setting to 10
    'nearly 100': 99,
    'roughly 500': 500,
    'roughly 30-40': [30, 40],
    'less than 50': 49,
    'counter-demonstration; at least 200': 200,
    '20 or so': 20,
    'fewer than 50': 49,
    'counter-demonstration; more than 70': 71,
    'counter-demonstration; about 40': 40,
    'counter-demonstration; more than 100': 101,
    'counter-demonstration; over 500': 500,
    'counter-demonstration; over 300': 300,
    'counter-demonstration; a few dozen': 36,  # 3 dozen assumed
    'counter-demonstration; fewer than 20': 19,
    'counter-demonstration; more than 150': 151,
    'roughly 100': 100,
    '10s': 10,
    'roughly 20': 20,
    'counter-demonstration; small': None,  # Cannot quantify
    'hundreds (counter-demonstration)': 200,
    'few': 3,
    'around 15': 15,
    'counter-demonstration; tens': 10,
    'around 50': 50,
    'counter-demonstration; about 50': 50,
    'at least 20': 20,
    'counter-demonstration; at least 20': 20,
    'large number': None,  # Cannot quantify
    'counter-demonstration; few': 3,
    '30-50': [30, 50],
    '200-300': [200, 300],

    # --- thousands ---------------------------------------------------------
    'counter-demonstration; thousands': 2000,
    # Was listed twice (2000, then 1000); 1000 was the value in effect.
    'thousands': 1000,
    'several thousand': 2000,
    'counter-demonstration; more than a thousand': 1500,
    'more than a thousand': 1000,
    'over a thousand': 1000,
    'an estimated thousands': 2000,
    'thousands to tens of thousands': 5000,
    'counter-demonstration; more than a hundred to thousands': 1000,
    'about a thousand': 1000,
    'a few thousand': 2000,
    'nearly a thousand': 1000,
    'counter-demonstration; nearly a thousand': 1000,
    'counter-demonstration; several hundred to thousands': 1000,

    # --- spelled-out numbers -----------------------------------------------
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12,
    'thirteen': 13,
    'fourteen': 14,
    'fifteen': 15,
    'sixteen': 16,
    'seventeen': 17,
    'eighteen': 18,
    'nineteen': 19,
    'twenty': 20,
    'thirty': 30,
    'forty': 40,
    'fifty': 50,
    'sixty': 60,
    'seventy-five': 75,
    'hundred': 100,
    'thousand': 1000,
    'tens': 10,

    # --- dozens and vague size words ---------------------------------------
    'dozen': 12,
    'half a dozen': 6,
    'couple dozen': 24,
    'few dozen': 36,
    'two dozen': 24,
    'three dozen': 36,
    'four dozen': 48,
    'five dozen': 60,
    'two to three dozen': 30,
    # Was listed twice (30, then 25); 25 was the value in effect.
    'more than two dozen': 25,
    'nearly two dozen': 24,
    'dozen or so': 12,
    'scores': 20,
    'small handful': 3,
    'a handful': 5,
    'small group': 5,
    'sparse': 10,
    'legion': 10000,
    'many': 100,
    'huge': 1000,
    'fairly sizable': 50,
    'sizable': 100,
    'small crowd': 20,

    # --- qualified spelled-out phrases -------------------------------------
    'roughly a couple dozen': 24,
    'about two dozen': 24,
    'over sixty': 61,
    'close to a dozen': 12,
    'couple hundred': 200,
    'a half-dozen': 6,
    'over two dozen': 25,
    'some three dozen': 36,
    'a few': 3,
    'about twenty': 20,
    'over a hundred': 101,
    'about sixty': 60,
    'more than thirty': 31,
    'a hundred or so': 100,
    'more than a hundred': 101,
    'at least seven': 7,
    'around fifty': 50,
    'more than a dozen': 13,
    'around seventy-five': 75,
    'five-dozen': 60,
    'over a dozen': 13,
    'nearly three dozen': 35,
    'a couple dozen': 24,
    'about thirty': 30,
    'a dozen or so': 12,
    'two to three hundred': 250,  # Assuming an average of 250
    'a dozen': 12,
    'more than a hundred to several hundred': 250,  # Assuming an average of 250
    'at least a hundred': 100,
    'a little under a hundred': 99,
    'around thirty': 30,
    'two hundred plus': 201,
    'around a hundred': 100,
    'roughly one hundred': 100,
    'around a dozen': 12,
    'more than fifty': 51,
    'just over a dozen': 13,
    'a hundred': 100,
    'over one hundred': 101,
    'around two dozen': 24,
    'about three dozen': 36,
    'around one hundred': 100,
    'nearly a dozen': 11,
    'about a dozen vehicles': 12,  # Assuming 'vehicles' doesn't change the count
    'several hundred to thousands': 500,  # Assuming an average of 500
    'more than three dozen': 37,
    'about one hundred': 100,
    'roughly two dozen': 24,
    'approximately two dozen': 24,
    'four hundred': 400,
    'about a hundred': 100,
    'about three hundred': 300,
    'about two hundred': 200,
    'approximately three dozen': 36,
    'over a hundred to thousands': 500,  # Assuming an average of 500
    'approximately a dozen': 12,
    'around three dozen': 36,
    'around several hundred': 500,
    'a thousand': 1000,
    'a little over a hundred': 101,
    'a couple hundred': 200,
    'sixty or more': 60,
    'several': 5,  # Assuming 'several' refers to 5
    'counter-demonstration; couple hundred': 200,
    'twelve to seventeen': 12,
    'counter-demonstration; some three dozen': 36,
    'counter-demonstration; a few': 5,
    'counter-demonstration; several dozen': 24,
    'counter-demonstration; at least seven': 7,
    'counter-demonstration; a couple hundred': 200,
    'counter-demonstration; a couple dozen': 24,
    'counter-demonstration; a dozen': 12,
    'counter-demonstration; a little under a hundred': 100,
    'counter-demonstration; more than two dozen': 24,
    'one hundred': 100,
    'counter-demonstration; about two hundred': 200,
    'counter-demonstration; thousand': 1000,
    'roughly a hundred': 100,
}

In [82]:
def _resolve_crowd_size(row):
    """Return the crowd size for one row, falling back to the tag lookup.

    When 'Crowd_size' is missing (NaN, "NR" or an empty string) the row's
    tag is looked up in data_dict; if the tag is not a key there, the
    original 'Crowd_size' value is returned unchanged. Rows that already
    have a crowd size keep it.
    """
    current = row['Crowd_size']
    if pd.isna(current) or current in ["NR", ""]:
        # NOTE(review): data_dict holds None and [low, high] lists for some
        # tags, so this column can end up with non-scalar cells - confirm
        # that downstream code expects that.
        return data_dict.get(row['tags'], current)
    return current


data['Crowd_size2'] = data.apply(_resolve_crowd_size, axis=1)

In [89]:
# data.tail(100)

In [86]:
# data.to_csv("/Users/ofekzini/Downloads/protests2.2.csv")

In [95]:
# Markers that all mean "crowd size unknown"; each is rewritten to the sentinel -1.
unknown_markers = ["", "NR", np.nan]
data['Crowd_size2'] = data['Crowd_size2'].replace(to_replace=unknown_markers, value=-1)

In [99]:
# Replace the original 'Crowd_size' column with the filled-in 'Crowd_size2'.
#
# The guard makes this cell safe to re-run: once 'Crowd_size2' has been
# renamed, running an unguarded drop again would delete the merged
# 'Crowd_size' column while the rename silently did nothing.
if 'Crowd_size2' in data.columns:
    data = (
        data
        .drop(columns=['Crowd_size'])                   # discard the raw column
        .rename(columns={'Crowd_size2': 'Crowd_size'})  # promote the filled-in one
    )

In [101]:

# Show the frame to check the merged Crowd_size column (last column of the output below).
data

Unnamed: 0,event_id_cnty,event_date,year,time_precision,disorder_type,event_type,sub_event_type,actor1,assoc_actor_1,inter1,...,source,source_scale,notes,fatalities,tags,timestamp,Pro Israel,Pro Palestine,Violent,Crowd_size
0,USA74355,08/11/2024,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Teachers (United States),Protesters,...,Harvard Crimson,Subnational,"On 8 November 2024, roughly 35 pro-Palestinian...",0,roughly 35,1731434806,0,1,0,35
1,USA74437,08/11/2024,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),,Protesters,...,Democracy Now!; Popular Resistance,Other-National,"On 8 November 2024, pro-Palestinian protesters...",0,no report,1731434806,0,1,0,-1
2,USA74341,07/11/2024,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),AMP: American Muslims for Palestine; Muslim Gr...,Protesters,...,Anti-Defamation League; Bergen Record,Subnational-National,"On 7 November 2024, pro-Palestinian protesters...",0,counter-demonstration; no report,1731434806,1,1,0,-1
3,USA74480,07/11/2024,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Students (United States); Women (United States),Protesters,...,Twitter,New media,"On 7 November 2024, students rallied at Virgin...",0,no report,1731434807,0,0,0,-1
4,USA74522,07/11/2024,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Teachers (United States); Students (United Sta...,Protesters,...,Arizona Public Media,Subnational,"On 7 November 2024, about 100 pro-Palestinian ...",0,counter-demonstration; about 104,1731434807,1,1,0,104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5686,USA60877,08/10/2023,2023,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),ANSWER: Act Now to Stop War and End Racism; CA...,Protesters,...,DC News Now; It's Going Down; Liberation News;...,Other-Subnational,"On 8 October 2023, a pro-Palestinian group of ...",0,hundreds,1729632871,0,1,0,150
5687,USA60878,08/10/2023,2023,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),,Protesters,...,Liberation News,Other,"On 8 October 2023, a pro-Palestinian group of ...",0,no report,1729632871,0,1,0,-1
5688,CAN5081,07/10/2023,2023,1,Demonstrations,Protests,Peaceful protest,Protesters (Canada),,Protesters,...,Toronto Sun,Subnational,"On 7 October 2023, dozens of people held a pro...",0,dozens,1697584399,0,1,0,50
5689,USA60762,07/10/2023,2023,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),,Protesters,...,Protest_NYC,Subnational,"On 7 October 2023, a handful of protesters hel...",0,handful,1697584399,0,1,0,5


In [103]:
# Position of the associated-actor column; the group indicator columns are
# placed immediately after it.
col_idx = data.columns.get_loc('assoc_actor_1')

# Indicator columns, in the order they should appear after 'assoc_actor_1'.
group_columns = [
    'palestenian_group', 'jewish_group', 'students_group',
    'teachers_group', 'women_group', 'political_group',
    'lgbt_group', 'other_group',
]

# Insert empty placeholder columns. Names that already exist are skipped so
# the cell can be re-run: DataFrame.insert raises ValueError when asked to
# insert a column that is already present.
for offset, column in enumerate(group_columns, start=1):
    if column not in data.columns:
        data.insert(col_idx + offset, column, np.nan)

# 'assoc_actor_1' followed by the indicator columns, for quick inspection.
new_columns = ['assoc_actor_1'] + group_columns

def map_groups(row):
    """Set the 0/1 group indicator fields of one row from 'assoc_actor_1'.

    'assoc_actor_1' is expected to be a '; '-separated string of actor
    names. It is lower-cased and split, and each indicator is set to 1 when
    any actor name contains one of that indicator's keywords (substring
    match). 'other_group' is 1 only when no other indicator fired, which
    includes rows whose 'assoc_actor_1' is not a string (e.g. NaN).

    Args:
        row: a mapping-like row (pandas Series or dict) with an
            'assoc_actor_1' entry; it is modified in place.

    Returns:
        The same row object with the eight indicator fields set.
    """
    # Indicator column -> keywords searched for inside each actor name.
    keywords_by_column = {
        'palestenian_group': ('palesti', 'muslim'),
        'jewish_group': ('jew',),
        'students_group': ('student',),
        'teachers_group': ('teacher',),
        'women_group': ('women',),
        # Substring match like every other indicator. The previous exact
        # comparison only fired when an actor name was literally
        # 'democratic' or 'republican'.
        'political_group': ('democratic', 'republican'),
        'lgbt_group': ('lgbt',),
    }

    actors = row['assoc_actor_1']
    # Missing / non-string actors yield no groups, so every keyword flag is 0.
    groups = actors.lower().split('; ') if isinstance(actors, str) else []

    for column, keywords in keywords_by_column.items():
        row[column] = int(
            any(keyword in group for group in groups for keyword in keywords)
        )

    # Catch-all: set when none of the specific indicators matched.
    row['other_group'] = int(not any(row[column] for column in keywords_by_column))

    return row

# Run map_groups on every row so the group indicator columns are populated.
data = data.apply(func=map_groups, axis='columns')

# # Display the selected columns
# df[new_columns].head(14)

In [113]:
# Parse the day/month/year strings in 'event_date' into datetime values.
parsed_event_dates = pd.to_datetime(data['event_date'], format='%d/%m/%Y')
data['event_date'] = parsed_event_dates

# Preview the first 30 rows to check the parsed dates and the new columns.
data.head(n=30)

Unnamed: 0,event_id_cnty,event_date,year,time_precision,disorder_type,event_type,sub_event_type,actor1,assoc_actor_1,palestenian_group,...,source,source_scale,notes,fatalities,tags,timestamp,Pro Israel,Pro Palestine,Violent,Crowd_size
0,USA74355,2024-11-08,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Teachers (United States),0,...,Harvard Crimson,Subnational,"On 8 November 2024, roughly 35 pro-Palestinian...",0,roughly 35,1731434806,0,1,0,35
1,USA74437,2024-11-08,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),,0,...,Democracy Now!; Popular Resistance,Other-National,"On 8 November 2024, pro-Palestinian protesters...",0,no report,1731434806,0,1,0,-1
2,USA74341,2024-11-07,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),AMP: American Muslims for Palestine; Muslim Gr...,1,...,Anti-Defamation League; Bergen Record,Subnational-National,"On 7 November 2024, pro-Palestinian protesters...",0,counter-demonstration; no report,1731434806,1,1,0,-1
3,USA74480,2024-11-07,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Students (United States); Women (United States),0,...,Twitter,New media,"On 7 November 2024, students rallied at Virgin...",0,no report,1731434807,0,0,0,-1
4,USA74522,2024-11-07,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Teachers (United States); Students (United Sta...,1,...,Arizona Public Media,Subnational,"On 7 November 2024, about 100 pro-Palestinian ...",0,counter-demonstration; about 104,1731434807,1,1,0,104
5,USA74343,2024-11-06,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Students (United States),0,...,KQED; KTVU Fox2,Subnational,"On 6 November 2024, about two dozen to around ...",0,counter-demonstration; about two dozen to arou...,1731434806,0,1,0,100
6,USA74362,2024-11-06,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),USPCN: US Palestinian Community Network; Arab ...,1,...,Block Club Chicago; Chicago Tribune; Post Mill...,Subnational-Regional,"On 6 November 2024, over 500 pro-Palestinian p...",0,over 500,1731434806,0,1,0,500
7,USA74363,2024-11-06,2024,1,Demonstrations,Protests,Protest with intervention,Protesters (United States),USPCN: US Palestinian Community Network; Arab ...,1,...,ABC7 Chicago,Subnational,"On 6 November 2024, about 30 pro-Palestinian p...",0,about 30,1731434806,0,1,1,30
8,USA74365,2024-11-06,2024,1,Political violence,Riots,Mob violence,Rioters (United States),,0,...,ABC7 Chicago,Subnational,"On 6 November 2024, two unidentified men in sk...",0,2,1731434806,0,0,1,2
9,USA74399,2024-11-06,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Students (United States),0,...,Cornell Daily Sun,Subnational,"On 6 November 2024, pro-Palestinian protesters...",0,no report,1731434806,0,1,1,-1


In [117]:
# Export the cleaned table. index=False keeps the row index out of the file;
# writing it would add one more unnamed leading column on the next reload,
# like the 'Unnamed: 0' column the input already carries.
data.to_csv("/Users/ofekzini/Downloads/protests us final.csv", index=False)