In [8]:
import pandas as pd
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [10]:
# Load the CSV file
file_path = '/Users/ofekzini/Documents/Data Engineering/Fall 2024/ויזואליזציה/Project/Data/North America protests 2023 2024.csv'
data = pd.read_csv(file_path)

# Create a copy of the DataFrame
df_copy = data.copy()


In [17]:
# Filter rows where 'notes' column contains 'ISRAEL', 'PALESTINE', or 'WAR' (case-insensitive)

# keywords = ['ISRAEL', 'PALESTINE', 'WAR']
# df_copy = df_copy[df_copy['notes'].str.contains('|'.join(keywords), case=False, na=False)]

keywords = ['ISRAEL', 'PALESTINE', 'WAR']
# Create a regex pattern to match whole words (case-insensitive)
pattern = r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + r')\b'

# Filter rows where 'notes' contains the exact words
df_copy = df_copy[df_copy['notes'].str.contains(pattern, case=False, na=False)]

In [18]:
# Extract numeric values from the 'tags' column
def extract_numeric(tag):
    numbers = re.findall(r'\d+', str(tag))
    return list(map(int, numbers))

In [19]:
# Replace tags based on conditions and create 'Crowd_size' column
def update_tags(tag):
    tag = str(tag).lower()
    if 'dozens' in tag and 'hundreds' in tag:  # If both "dozens" and "hundreds" are present
        return 100
    elif 'hundreds' in tag:  # If "hundreds" is present
        return median_three_digit if median_three_digit is not None else tag
    elif 'dozens' in tag:  # If "dozens" is present
        return median_two_digit if median_two_digit is not None else tag
    else:
        numeric_values = extract_numeric(tag)
        return numeric_values[0] if numeric_values else None

In [20]:
# Calculate median for tags with 3-digit numbers and 2-digit numbers
three_digit_values = [num for tag in df_copy['tags'] for num in extract_numeric(tag) if 100 <= num < 1000]
two_digit_values = [num for tag in df_copy['tags'] for num in extract_numeric(tag) if 10 <= num < 100]

median_three_digit = int(pd.Series(three_digit_values).median()) if three_digit_values else None
median_two_digit = int(pd.Series(two_digit_values).median()) if two_digit_values else None

In [24]:
df_copy['Crowd_size'] = df_copy['tags'].apply(update_tags)

# Save the modified DataFrame to a new CSV file
# output_path = 'Data/North_America_protests_filtered.csv'
# df_copy.to_csv(output_path, index=False)

# Print confirmation of the new file and median values
print(f"Modified dataset saved to {output_path}")
print(f"Median for hundreds: {median_three_digit}")
print(f"Median for dozens: {median_two_digit}")

Modified dataset saved to Data/North_America_protests_filtered.csv
Median for hundreds: 150
Median for dozens: 50


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy['Crowd_size'] = df_copy['tags'].apply(update_tags)


In [22]:
df_copy

Unnamed: 0,event_id_cnty,event_date,year,time_precision,disorder_type,event_type,sub_event_type,actor1,assoc_actor_1,inter1,...,latitude,longitude,geo_precision,source,source_scale,notes,fatalities,tags,timestamp,Crowd_size
17,USA74355,08 November 2024,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Teachers (United States),Protesters,...,42.3736,-71.1097,1,Harvard Crimson,Subnational,"On 8 November 2024, roughly 35 pro-Palestinian...",0,crowd size=roughly 35,1731434806,35.0
25,USA74437,08 November 2024,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),,Protesters,...,40.7834,-73.9663,1,Democracy Now!; Popular Resistance,Other-National,"On 8 November 2024, pro-Palestinian protesters...",0,crowd size=no report,1731434806,
49,USA74341,07 November 2024,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),AMP: American Muslims for Palestine; Muslim Gr...,Protesters,...,40.9272,-73.9975,1,Anti-Defamation League; Bergen Record,Subnational-National,"On 7 November 2024, pro-Palestinian protesters...",0,counter-demonstration; crowd size=no report,1731434806,
61,USA74480,07 November 2024,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Students (United States); Women (United States),Protesters,...,37.5388,-77.4336,1,Twitter,New media,"On 7 November 2024, students rallied at Virgin...",0,crowd size=no report,1731434807,
68,USA74522,07 November 2024,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Teachers (United States); Students (United Sta...,Protesters,...,32.2217,-110.9264,1,Arizona Public Media,Subnational,"On 7 November 2024, about 100 pro-Palestinian ...",0,counter-demonstration; crowd size=about 104,1731434807,104.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21895,USA60877,08 October 2023,2023,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),ANSWER: Act Now to Stop War and End Racism; CA...,Protesters,...,38.8875,-77.0364,1,DC News Now; It's Going Down; Liberation News;...,Other-Subnational,"On 8 October 2023, a pro-Palestinian group of ...",0,crowd size=hundreds,1729632871,150.0
21896,USA60878,08 October 2023,2023,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),,Protesters,...,39.9525,-75.1636,1,Liberation News,Other,"On 8 October 2023, a pro-Palestinian group of ...",0,crowd size=no report,1729632871,
21913,CAN5081,07 October 2023,2023,1,Demonstrations,Protests,Peaceful protest,Protesters (Canada),,Protesters,...,43.5830,-79.6448,1,Toronto Sun,Subnational,"On 7 October 2023, dozens of people held a pro...",0,crowd size=dozens,1697584399,50.0
21921,USA60762,07 October 2023,2023,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),,Protesters,...,40.7834,-73.9663,1,Protest_NYC,Subnational,"On 7 October 2023, a handful of protesters hel...",0,crowd size=handful,1697584399,


# Data encoding

In [72]:
na_f_data = pd.read_csv("/Users/ofekzini/Documents/Data Engineering/Fall 2024/ויזואליזציה/Project/Data/NA_protests_filtered.csv")
na_f_data
print(na_f_data.columns)

Index(['event_id_cnty', 'event_date', 'year', 'time_precision',
       'disorder_type', 'event_type', 'sub_event_type', 'actor1',
       'assoc_actor_1', 'inter1', 'actor2', 'assoc_actor_2', 'inter2',
       'interaction', 'civilian_targeting', 'iso', 'region', 'country',
       'admin1', 'admin2', 'admin3', 'location', 'latitude', 'longitude',
       'geo_precision', 'source', 'source_scale', 'notes', 'fatalities',
       'tags', 'timestamp', 'Crowd_size', 'Pro_Israel', 'Pro_Palestine',
       'Violent'],
      dtype='object')


In [73]:
word_bank = ["students", "Pro-palestine", "Pro-Israel", "university", "pro-Palestinian", 
             "Condemn Israel", "Israel", "Palestine", "SJP", "College", 
             "Injuries", "Support palestine", "Hostages", "Arrested"]

# Add Boolean features for each word in the word bank
for word in word_bank:
    # Normalize column names to lowercase and replace spaces with underscores
    col_name = word.lower().replace(" ", "_")
    
    # Check if the column name already exists to avoid overwriting
    if col_name not in na_f_data.columns:
        na_f_data[col_name] = na_f_data['notes'].str.contains(word, case=False, na=False).astype(int)
    else:
        print(f"Column '{col_name}' already exists, skipping creation.")


In [74]:
print(na_f_data.columns)

Index(['event_id_cnty', 'event_date', 'year', 'time_precision',
       'disorder_type', 'event_type', 'sub_event_type', 'actor1',
       'assoc_actor_1', 'inter1', 'actor2', 'assoc_actor_2', 'inter2',
       'interaction', 'civilian_targeting', 'iso', 'region', 'country',
       'admin1', 'admin2', 'admin3', 'location', 'latitude', 'longitude',
       'geo_precision', 'source', 'source_scale', 'notes', 'fatalities',
       'tags', 'timestamp', 'Crowd_size', 'Pro_Israel', 'Pro_Palestine',
       'Violent', 'students', 'pro-palestine', 'pro-israel', 'university',
       'pro-palestinian', 'condemn_israel', 'israel', 'palestine', 'sjp',
       'college', 'injuries', 'support_palestine', 'hostages', 'arrested'],
      dtype='object')


In [75]:
# Fill NaN in 'Crowd_size' with a specific value (e.g., -1)
na_f_data['Crowd_size'] = na_f_data['Crowd_size'].fillna(-1)

# List of columns to exclude from dropping (retain these columns)
exclude_columns = ['Violent']  # Add other important columns to this list if needed

# Separate columns that need to be cleaned from the columns you want to exclude
cols_to_clean = na_f_data.columns.difference(exclude_columns)

# Drop NaN values only in the columns that need to be cleaned
na_f_data_cleaned = na_f_data[cols_to_clean].dropna(axis=1, how='any')

# Reassign the cleaned columns back to the original DataFrame
na_f_data[na_f_data_cleaned.columns] = na_f_data_cleaned

# Display the result
na_f_data


Unnamed: 0,event_id_cnty,event_date,year,time_precision,disorder_type,event_type,sub_event_type,actor1,assoc_actor_1,inter1,...,pro-palestinian,condemn_israel,israel,palestine,sjp,college,injuries,support_palestine,hostages,arrested
0,USA74355,08-Nov-24,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Teachers (United States),Protesters,...,1,0,1,1,0,0,0,1,0,0
1,USA74437,08-Nov-24,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),,Protesters,...,1,1,1,0,0,0,0,0,0,0
2,USA74341,07-Nov-24,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),AMP: American Muslims for Palestine; Muslim Gr...,Protesters,...,1,0,1,1,0,0,0,1,0,0
3,USA74480,07-Nov-24,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Students (United States); Women (United States),Protesters,...,0,0,1,1,0,0,0,0,0,0
4,USA74522,07-Nov-24,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Teachers (United States); Students (United Sta...,Protesters,...,1,0,1,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5686,USA60877,08-Oct-23,2023,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),ANSWER: Act Now to Stop War and End Racism; CA...,Protesters,...,1,0,1,1,0,0,0,0,0,0
5687,USA60878,08-Oct-23,2023,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),,Protesters,...,1,1,1,1,0,0,0,1,0,0
5688,CAN5081,07-Oct-23,2023,1,Demonstrations,Protests,Peaceful protest,Protesters (Canada),,Protesters,...,0,0,1,1,0,0,0,1,0,0
5689,USA60762,07-Oct-23,2023,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),,Protesters,...,1,0,1,1,0,0,0,0,0,0


In [76]:
print(na_f_data.columns)

Index(['event_id_cnty', 'event_date', 'year', 'time_precision',
       'disorder_type', 'event_type', 'sub_event_type', 'actor1',
       'assoc_actor_1', 'inter1', 'actor2', 'assoc_actor_2', 'inter2',
       'interaction', 'civilian_targeting', 'iso', 'region', 'country',
       'admin1', 'admin2', 'admin3', 'location', 'latitude', 'longitude',
       'geo_precision', 'source', 'source_scale', 'notes', 'fatalities',
       'tags', 'timestamp', 'Crowd_size', 'Pro_Israel', 'Pro_Palestine',
       'Violent', 'students', 'pro-palestine', 'pro-israel', 'university',
       'pro-palestinian', 'condemn_israel', 'israel', 'palestine', 'sjp',
       'college', 'injuries', 'support_palestine', 'hostages', 'arrested'],
      dtype='object')


In [77]:
# Ensure there are no trailing spaces in column names
na_f_data.columns = na_f_data.columns.str.strip()

# Check if the target columns exist
target_columns = ['pro-israel', 'pro-palestine', 'Violent']
for target_col in target_columns:
    if target_col not in na_f_data.columns:
        print(f"Warning: Column '{target_col}' not found.")

# Filter rows with labels for training (non-TBD)
train_data = na_f_data[~na_f_data[target_columns].eq('TBD').any(axis=1)]
# Filter rows with 'TBD' values for testing
test_data = na_f_data[na_f_data[target_columns].eq('TBD').any(axis=1)]

# Define feature columns and target columns
feature_cols = na_f_data.columns.difference(target_columns)

# Iterate over each target column
for target_col in target_columns:
    print(f"Classifying {target_col}")

    # Prepare training data for the specific column
    X_train = train_data[feature_cols]
    y_train = train_data[target_col]

    # Handle non-numeric values in X_train
    X_train = X_train.replace('NR', float('nan'))  # Replace 'NR' or similar non-numeric values
    X_train = X_train.apply(pd.to_numeric, errors='coerce')  # Convert to numeric
    X_train = X_train.fillna(0)  # Fill NaN values with 0 or another imputation strategy

    # Handle non-numeric values in y_train if necessary
    if y_train.dtypes == object:
        unique_labels = y_train.dropna().unique()
        label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
        y_train_encoded = y_train.map(label_mapping)
    else:
        y_train_encoded = y_train

    # Prepare testing data for rows with TBD in this column
    X_test = test_data[feature_cols]

    # Handle non-numeric values in X_test
    X_test = X_test.replace('NR', float('nan'))
    X_test = X_test.apply(pd.to_numeric, errors='coerce')
    X_test = X_test.fillna(0)

    # Train the classifier
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train_encoded)

    # Predict for TBD rows
    predictions = model.predict(X_test)

    # Print out the lengths to debug
    print(f"Number of 'TBD' rows in target column '{target_col}':", len(test_data[test_data[target_col] == 'TBD']))
    print(f"Number of predictions for '{target_col}':", len(predictions))

    # Ensure that predictions have the same length as the rows with 'TBD'
    if len(predictions) == len(test_data[test_data[target_col] == 'TBD']):
        # Map predictions back to original labels if necessary
        if y_train.dtypes == object:
            reverse_label_mapping = {idx: label for label, idx in label_mapping.items()}
            predictions = pd.Series(predictions).map(reverse_label_mapping)

        # Update the DataFrame with predictions
        na_f_data.loc[na_f_data[target_col] == 'TBD', target_col] = predictions
    else:
        print(f"Error: The number of predictions does not match the number of 'TBD' rows for column '{target_col}'.")

# Display the updated DataFrame
na_f_data

Classifying pro-israel
Number of 'TBD' rows in target column 'pro-israel': 0
Number of predictions for 'pro-israel': 5072
Error: The number of predictions does not match the number of 'TBD' rows for column 'pro-israel'.
Classifying pro-palestine
Number of 'TBD' rows in target column 'pro-palestine': 0
Number of predictions for 'pro-palestine': 5072
Error: The number of predictions does not match the number of 'TBD' rows for column 'pro-palestine'.
Classifying Violent
Number of 'TBD' rows in target column 'Violent': 5072
Number of predictions for 'Violent': 5072


Unnamed: 0,event_id_cnty,event_date,year,time_precision,disorder_type,event_type,sub_event_type,actor1,assoc_actor_1,inter1,...,pro-palestinian,condemn_israel,israel,palestine,sjp,college,injuries,support_palestine,hostages,arrested
0,USA74355,08-Nov-24,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Teachers (United States),Protesters,...,1,0,1,1,0,0,0,1,0,0
1,USA74437,08-Nov-24,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),,Protesters,...,1,1,1,0,0,0,0,0,0,0
2,USA74341,07-Nov-24,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),AMP: American Muslims for Palestine; Muslim Gr...,Protesters,...,1,0,1,1,0,0,0,1,0,0
3,USA74480,07-Nov-24,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Students (United States); Women (United States),Protesters,...,0,0,1,1,0,0,0,0,0,0
4,USA74522,07-Nov-24,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Teachers (United States); Students (United Sta...,Protesters,...,1,0,1,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5686,USA60877,08-Oct-23,2023,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),ANSWER: Act Now to Stop War and End Racism; CA...,Protesters,...,1,0,1,1,0,0,0,0,0,0
5687,USA60878,08-Oct-23,2023,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),,Protesters,...,1,1,1,1,0,0,0,1,0,0
5688,CAN5081,07-Oct-23,2023,1,Demonstrations,Protests,Peaceful protest,Protesters (Canada),,Protesters,...,0,0,1,1,0,0,0,1,0,0
5689,USA60762,07-Oct-23,2023,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),,Protesters,...,1,0,1,1,0,0,0,0,0,0


In [78]:
output_path = 'Data/NA_new2.csv'
na_f_data.to_csv(output_path, index=False)