In [1]:
# Connect google drive account
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Imports
import os
import pandas as pd
import numpy as np
import spacy
from spacy.matcher import PhraseMatcher

In [3]:
# Import files
file1 = "/content/drive/My Drive/gdelt_protests_2018_2021/protests_1.csv"
file2 = "/content/drive/My Drive/gdelt_protests_2018_2021/protests_1.csv"

df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)

df = pd.concat([df1, df2], ignore_index = True)

In [4]:
# Convert Data Format
df['SQLDATE'] = pd.to_datetime(df['SQLDATE'], format='%Y%m%d', errors='coerce')


In [5]:
# Drop rows with missing data
df = df.dropna(subset=['SQLDATE', 'ActionGeo_Lat', 'ActionGeo_Long', 'AvgTone']).copy()


# Round data to make charts easier to read
df['AvgTone'] = df['AvgTone'].round(2)
df['GoldsteinScale'] = df['GoldsteinScale'].round(2)


df.head()

Unnamed: 0,SQLDATE,Actor1Name,Actor2Name,EventRootCode,EventCode,GoldsteinScale,AvgTone,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long
0,2018-06-12,,BOLIVIA,14,140,-6.5,-0.47,BL,-17.0,-65.0
1,2018-10-18,,GUINEA BISSAU,14,141,-6.5,0.21,GV,11.0,-10.0
2,2018-12-21,,EMPLOYEE,14,140,-6.5,-6.28,HU,47.0,20.0
3,2018-10-10,,SOUTH KOREA,14,141,-6.5,-3.59,KS,37.0,127.5
4,2018-07-16,,PARLIAMENT,14,141,-6.5,-0.24,SN,1.366667,103.8


In [6]:
# Replace missing actors with unknown, and lowercase all actor names
df['Actor1Name'] = df['Actor1Name'].fillna('Unknown Actor 1')
df['Actor2Name'] = df['Actor2Name'].fillna('Unknown Actor 2')

df.head()

Unnamed: 0,SQLDATE,Actor1Name,Actor2Name,EventRootCode,EventCode,GoldsteinScale,AvgTone,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long
0,2018-06-12,Unknown Actor 1,BOLIVIA,14,140,-6.5,-0.47,BL,-17.0,-65.0
1,2018-10-18,Unknown Actor 1,GUINEA BISSAU,14,141,-6.5,0.21,GV,11.0,-10.0
2,2018-12-21,Unknown Actor 1,EMPLOYEE,14,140,-6.5,-6.28,HU,47.0,20.0
3,2018-10-10,Unknown Actor 1,SOUTH KOREA,14,141,-6.5,-3.59,KS,37.0,127.5
4,2018-07-16,Unknown Actor 1,PARLIAMENT,14,141,-6.5,-0.24,SN,1.366667,103.8


In [7]:
# Create a new column to see if the protest was pre-COVID (3/1/2020)

df['COVID_Era'] = np.where(df['SQLDATE'] < '2020-03-01', 'Pre-COVID', 'COVID-Era')

In [8]:
# Track motivations of the protest using the Event Code

# Convert EventCode to string if it's numeric
df['EventCode'] = df['EventCode'].astype(str)

# Define conditions and corresponding motivations
conditions = [
    df['EventCode'] == '141',
    df['EventCode'] == '142',
    df['EventCode'] == '143',
    df['EventCode'] == '144',
    df['EventCode'] == '145'
]

motivations = [
    'Policy Change',
    'Anti-Government',
    'Anti-Business',
    'Group Rights',
    'Anti-Discrimination'
]

# Default fallback if no match
df['ProtestMotivation'] = np.select(conditions, motivations, default='General Protest')


In [9]:
# Check the results
df.head(10)

Unnamed: 0,SQLDATE,Actor1Name,Actor2Name,EventRootCode,EventCode,GoldsteinScale,AvgTone,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,COVID_Era,ProtestMotivation
0,2018-06-12,Unknown Actor 1,BOLIVIA,14,140,-6.5,-0.47,BL,-17.0,-65.0,Pre-COVID,General Protest
1,2018-10-18,Unknown Actor 1,GUINEA BISSAU,14,141,-6.5,0.21,GV,11.0,-10.0,Pre-COVID,Policy Change
2,2018-12-21,Unknown Actor 1,EMPLOYEE,14,140,-6.5,-6.28,HU,47.0,20.0,Pre-COVID,General Protest
3,2018-10-10,Unknown Actor 1,SOUTH KOREA,14,141,-6.5,-3.59,KS,37.0,127.5,Pre-COVID,Policy Change
4,2018-07-16,Unknown Actor 1,PARLIAMENT,14,141,-6.5,-0.24,SN,1.366667,103.8,Pre-COVID,Policy Change
5,2019-07-15,Unknown Actor 1,MINIST OF SECURITY,14,141,-6.5,-3.76,TO,8.0,1.166667,Pre-COVID,Policy Change
6,2019-02-16,Unknown Actor 1,PRESIDENT,14,140,-6.5,-6.82,HA,18.5392,-72.335,Pre-COVID,General Protest
7,2019-05-01,Unknown Actor 1,MACEDONIA,14,141,-6.5,-5.26,MK,42.0,21.4333,Pre-COVID,Policy Change
8,2018-12-25,Unknown Actor 1,POLICE,14,141,-6.5,-5.81,BK,43.9061,18.3208,Pre-COVID,Policy Change
9,2018-11-25,Unknown Actor 1,PARLIAMENT,14,141,-6.5,-9.22,AL,41.2314,20.1561,Pre-COVID,Policy Change


Some actor2Names are countries, which makes python string matching inefficient. We may need to use NER to improve accuracy. We can get a list of unique actor names and see what are the patterns and outliers.

In [10]:
# Get unique actor names
actor2_unique = df['Actor2Name'].dropna().str.upper().value_counts().head(100)
actor2_unique.to_frame().reset_index().rename(columns={'index': 'Actor2Name', 'Actor2Name': 'Count'})


Unnamed: 0,Count,count
0,UNKNOWN ACTOR 2,3211058
1,UNITED STATES,337842
2,POLICE,298858
3,GOVERNMENT,278388
4,PRESIDENT,158670
...,...,...
95,CIVILIAN,12810
96,IRELAND,12732
97,MIGRANT,12554
98,GERMAN,12552


In [23]:
# 1. Load spaCy and build a PhraseMatcher
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

!pip install pycountry
import pycountry

patterns = {
    "Civilians":           ["protester","demonstrator","student","worker","citizen","residents","village","employee","ukrainian"],
    "Government":          ["police", "republic", "kingdom", "state","governor","regime","parliament","army","military","government","security","state","president","authorities","authority","prime minister","chancellor","congress","legislature","court","judiciary","the white house","russia","venezuela","ukraine"],
    "Political Party":     ["party","minister","candidate","politician","congressman","congresswoman"],
    "NGO / Advocacy":      ["ngo","nonprofit","human rights","activist","charity","organization"],
    "Corporate / Business":["company","corporation","bank","industry","firm","business","companies"],
    "Agriculture":         ["farm","farmer","agriculture","landowner"],
    "Healthcare":          ["hospital","medical","healthcare","nurse","doctor"],
    "Prison Reform":       ["prison","incarceration","inmate","detention"],
    "Media Reform":        ["media","press","journalist","news agency"],
    "Religious":           ["christian","muslim","hindu","jewish","buddhism","jain"]
}

for label, terms in patterns.items():
    matcher.add(label, [nlp.make_doc(t) for t in terms])

# 2) Seed every ISO country name into Government
country_docs = [nlp.make_doc(c.name) for c in pycountry.countries]
matcher.add("Government", country_docs)

# 3) Deduplicate and build classification map
all_actors = pd.concat([df['Actor1Name'], df['Actor2Name']]).dropna().unique()
name_to_cat = {}

for name in all_actors:
    # Title-case for NER but keep matcher case-insensitive
    doc = nlp(name if name and name[0].isupper() else name.title())

    # 3a) PhraseMatcher first
    matches = matcher(doc)
    if matches:
        name_to_cat[name] = nlp.vocab.strings[matches[0][0]]
        continue

    # 3b) NER fallback on properly-cased doc
    if doc.ents:
        ent = doc.ents[0].label_
        if ent in ("GPE","LOC"):
            name_to_cat[name] = "Government"
        elif ent == "ORG":
            name_to_cat[name] = "NGO / Advocacy"
        elif ent == "NORP":
            lower = name.lower()
            if any(r in lower for r in patterns["Religious"]):
                name_to_cat[name] = "Religious"
            else:
                name_to_cat[name] = "Civilians"
        else:
            name_to_cat[name] = "Unknown"
    else:
        name_to_cat[name] = "Unknown"


# Vectorized map back to data frame
df['PrimaryActorType']   = np.vectorize(name_to_cat.get)(df['Actor1Name'].values)
df['SecondaryActorType'] = np.vectorize(name_to_cat.get)(df['Actor2Name'].values)

# Override ProtestMotivation for nuance targets
nuance = ["Agriculture","Healthcare","Prison Reform","Media Reform","Religious"]
mask = df['SecondaryActorType'].isin(nuance)
df.loc[mask, 'ProtestMotivation'] = df.loc[mask, 'SecondaryActorType']

# Check data
(df['PrimaryActorType'].value_counts(normalize=True) * 100).round(2)  # coverage
df[['Actor2Name','SecondaryActorType','ProtestMotivation']].sample(10)



Unnamed: 0,Actor2Name,SecondaryActorType,ProtestMotivation
8410061,KIEV,NGO / Advocacy,Policy Change
4989523,Unknown Actor 2,Unknown,Policy Change
454452,Unknown Actor 2,Unknown,Policy Change
5562182,Unknown Actor 2,Unknown,Policy Change
6511377,DHAKA,NGO / Advocacy,Policy Change
7962048,GOVERNMENT,Government,Policy Change
3948342,UNITED STATES,Government,Anti-Government
5816669,Unknown Actor 2,Unknown,Policy Change
7602384,WASHINGTON,Government,Policy Change
8926217,UNITED STATES,Government,Policy Change


According to GDELT's CAMEO naming convention, Actor1 protests 'against' Actor2. But, in the above nationalities in actor2 are misclassified as civillian, when they are more likely to be the government which is being protested against. We need to override the secondary actor type here, and ensure that if a country's denonoym appears as Actor2, the category is 'government' and not civillian.

In [24]:
# Only override secondary category where the protest is anti-government
mask_anti_gov = df['EventRootCode'] == '142'
df.loc[mask_anti_gov, 'SecondaryActorType'] = 'Government'


In [25]:
df[['Actor2Name','SecondaryActorType','ProtestMotivation']].sample(10)

Unnamed: 0,Actor2Name,SecondaryActorType,ProtestMotivation
409117,STUDENT,Civilians,Anti-Business
2900320,IRANIAN,Civilians,General Protest
2451188,CYPRUS,Government,Policy Change
6001745,Unknown Actor 2,Unknown,Policy Change
4535025,BASHAR AL ASSAD,Unknown,Policy Change
2201291,RUSSIA,Government,Anti-Business
8411689,CATHOLIC,NGO / Advocacy,Policy Change
9709424,BRITON,NGO / Advocacy,Policy Change
6067760,NICARAGUA,Government,Policy Change
4392842,CHINA,Government,Policy Change


In [26]:
# Check the count of unknown actors in comparison to correctly categorized
uncategorized_actor_1 = df[df['PrimaryActorType'] == 'Unknown']['Actor1Name'].count()
uncategorized_actor_2 = df[df['SecondaryActorType'] == 'Unknown']['Actor2Name'].count()

print(f"Unknown actors in Actor1: {uncategorized_actor_1}")
print(f"Unknown actors in Actor2: {uncategorized_actor_2}")

total_actors = len(df)
print(f"Total actors: {total_actors}")

percent_uknown_actors_1 = (uncategorized_actor_1) / total_actors * 100
percent_uknown_actors_2 = (uncategorized_actor_2) / total_actors * 100

print(f"Percentage of unknown primary actors: {percent_uknown_actors_1:.2f}%")
print(f"Percentage of unknown secondary actors: {percent_uknown_actors_2:.2f}%")

Unknown actors in Actor1: 2098624
Unknown actors in Actor2: 4016714
Total actors: 9405730
Percentage of unknown primary actors: 22.31%
Percentage of unknown secondary actors: 42.70%


The improvements to the matcher have reduced the unknown actors marginally. However, because secondary actors, AKA targets of the protests, can be individually people, as well as geographic areas smaller than countries it is harder to significantly increase coverage.