In [1]:
# Connect google drive account
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Imports
import os
import pandas as pd
import numpy as np
import spacy
from spacy.matcher import PhraseMatcher

In [3]:
# Import files
file1 = "/content/drive/My Drive/gdelt_protests_2018_2021/protests_1.csv"
file2 = "/content/drive/My Drive/gdelt_protests_2018_2021/protests_1.csv"

df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)

df = pd.concat([df1, df2], ignore_index = True)

In [4]:
# Convert Data Format
df['SQLDATE'] = pd.to_datetime(df['SQLDATE'], format='%Y%m%d', errors='coerce')


In [5]:
# Drop rows with missing data
df = df.dropna(subset=['SQLDATE', 'ActionGeo_Lat', 'ActionGeo_Long', 'AvgTone']).copy()


# Round data to make charts easier to read
df['AvgTone'] = df['AvgTone'].round(2)
df['GoldsteinScale'] = df['GoldsteinScale'].round(2)


df.head()

Unnamed: 0,SQLDATE,Actor1Name,Actor2Name,EventRootCode,EventCode,GoldsteinScale,AvgTone,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long
0,2018-06-12,,BOLIVIA,14,140,-6.5,-0.47,BL,-17.0,-65.0
1,2018-10-18,,GUINEA BISSAU,14,141,-6.5,0.21,GV,11.0,-10.0
2,2018-12-21,,EMPLOYEE,14,140,-6.5,-6.28,HU,47.0,20.0
3,2018-10-10,,SOUTH KOREA,14,141,-6.5,-3.59,KS,37.0,127.5
4,2018-07-16,,PARLIAMENT,14,141,-6.5,-0.24,SN,1.366667,103.8


In [6]:
# Replace missing actors with unknown, and lowercase all actor names
df['Actor1Name'] = df['Actor1Name'].fillna('Unknown Actor 1')
df['Actor2Name'] = df['Actor2Name'].fillna('Unknown Actor 2')

df.head()

Unnamed: 0,SQLDATE,Actor1Name,Actor2Name,EventRootCode,EventCode,GoldsteinScale,AvgTone,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long
0,2018-06-12,Unknown Actor 1,BOLIVIA,14,140,-6.5,-0.47,BL,-17.0,-65.0
1,2018-10-18,Unknown Actor 1,GUINEA BISSAU,14,141,-6.5,0.21,GV,11.0,-10.0
2,2018-12-21,Unknown Actor 1,EMPLOYEE,14,140,-6.5,-6.28,HU,47.0,20.0
3,2018-10-10,Unknown Actor 1,SOUTH KOREA,14,141,-6.5,-3.59,KS,37.0,127.5
4,2018-07-16,Unknown Actor 1,PARLIAMENT,14,141,-6.5,-0.24,SN,1.366667,103.8


In [7]:
# Create a new column to see if the protest was pre-COVID (3/1/2020)

df['COVID_Era'] = np.where(df['SQLDATE'] < '2020-03-01', 'Pre-COVID', 'COVID-Era')

In [8]:
# Track motivations of the protest using the Event Code

# Convert EventCode to string if it's numeric
df['EventCode'] = df['EventCode'].astype(str)

# Define conditions and corresponding motivations
conditions = [
    df['EventCode'] == '141',
    df['EventCode'] == '142',
    df['EventCode'] == '143',
    df['EventCode'] == '144',
    df['EventCode'] == '145'
]

motivations = [
    'Policy Change',
    'Anti-Government',
    'Anti-Business',
    'Group Rights',
    'Anti-Discrimination'
]

# Default fallback if no match
df['ProtestMotivation'] = np.select(conditions, motivations, default='General Protest')


In [9]:
# Check the results
df.head(10)

Unnamed: 0,SQLDATE,Actor1Name,Actor2Name,EventRootCode,EventCode,GoldsteinScale,AvgTone,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,COVID_Era,ProtestMotivation
0,2018-06-12,Unknown Actor 1,BOLIVIA,14,140,-6.5,-0.47,BL,-17.0,-65.0,Pre-COVID,General Protest
1,2018-10-18,Unknown Actor 1,GUINEA BISSAU,14,141,-6.5,0.21,GV,11.0,-10.0,Pre-COVID,Policy Change
2,2018-12-21,Unknown Actor 1,EMPLOYEE,14,140,-6.5,-6.28,HU,47.0,20.0,Pre-COVID,General Protest
3,2018-10-10,Unknown Actor 1,SOUTH KOREA,14,141,-6.5,-3.59,KS,37.0,127.5,Pre-COVID,Policy Change
4,2018-07-16,Unknown Actor 1,PARLIAMENT,14,141,-6.5,-0.24,SN,1.366667,103.8,Pre-COVID,Policy Change
5,2019-07-15,Unknown Actor 1,MINIST OF SECURITY,14,141,-6.5,-3.76,TO,8.0,1.166667,Pre-COVID,Policy Change
6,2019-02-16,Unknown Actor 1,PRESIDENT,14,140,-6.5,-6.82,HA,18.5392,-72.335,Pre-COVID,General Protest
7,2019-05-01,Unknown Actor 1,MACEDONIA,14,141,-6.5,-5.26,MK,42.0,21.4333,Pre-COVID,Policy Change
8,2018-12-25,Unknown Actor 1,POLICE,14,141,-6.5,-5.81,BK,43.9061,18.3208,Pre-COVID,Policy Change
9,2018-11-25,Unknown Actor 1,PARLIAMENT,14,141,-6.5,-9.22,AL,41.2314,20.1561,Pre-COVID,Policy Change


Some actor2Names are countries, which makes python string matching inefficient. We may need to use NER to improve accuracy. We can get a list of unique actor names and see what are the patterns and outliers.

In [10]:
# Get unique actor names
actor2_unique = df['Actor2Name'].dropna().str.upper().value_counts().head(100)
actor2_unique.to_frame().reset_index().rename(columns={'index': 'Actor2Name', 'Actor2Name': 'Count'})


Unnamed: 0,Count,count
0,UNKNOWN ACTOR 2,3211058
1,UNITED STATES,337842
2,POLICE,298858
3,GOVERNMENT,278388
4,PRESIDENT,158670
...,...,...
95,CIVILIAN,12810
96,IRELAND,12732
97,MIGRANT,12554
98,GERMAN,12552


In [11]:
# 1. Load spaCy and build a PhraseMatcher
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

!pip install pycountry
import pycountry

patterns = {
    "Civilians":          ["protester", "demonstrator", "student", "worker", "citizen"],
    "Government":         ["police", "regime", "parliament", "army", "military", "government", "security", "state", "president", "authorities", "authority", "prime minister", "chancellor", "congress", "legislature", "court", "judiciary", "the white house"],
    "Political Party":    ["party", "minister", "candidate", "politician", "congressman", "congresswoman"],
    "NGO / Advocacy":     ["ngo", "nonprofit", "human rights", "activist", "charity", "organization"],
    "Corporate / Business":["company", "corporation", "bank", "industry", "firm", "business"],
    "Agriculture":        ["farm", "farmer", "agriculture", "landowner"],
    "Healthcare":         ["hospital", "medical", "healthcare", "nurse", "doctor"],
    "Prison Reform":      ["prison", "incarceration", "inmate", "detention"],
    "Media Reform":       ["media", "press", "journalist", "news agency"],
    "Religious":          ["christian", "muslim", "hindu", "jewish", "buddhism", "jain"]
}

for label, terms in patterns.items():
    matcher.add(label, [nlp.make_doc(t) for t in terms])

# 2) Seed every ISO country name into Government
country_docs = [nlp.make_doc(c.name) for c in pycountry.countries]
matcher.add("Government", country_docs)

# 3) Deduplicate and build classification map
all_actors = pd.concat([df['Actor1Name'], df['Actor2Name']]).dropna().unique()
name_to_cat = {}

for name in all_actors:
    # Title-case for NER but keep matcher case-insensitive
    doc = nlp(name if name and name[0].isupper() else name.title())

    # 3a) PhraseMatcher first
    matches = matcher(doc)
    if matches:
        name_to_cat[name] = nlp.vocab.strings[matches[0][0]]
        continue

    # 3b) NER fallback on properly-cased doc
    if doc.ents:
        ent = doc.ents[0].label_
        if ent in ("GPE","LOC"):
            name_to_cat[name] = "Government"
        elif ent == "ORG":
            name_to_cat[name] = "NGO / Advocacy"
        elif ent == "NORP":
            name_to_cat[name] = "Civilians"
        else:
            name_to_cat[name] = "Unknown"
    else:
        name_to_cat[name] = "Unknown"

# Vectorized map back to data frame
df['PrimaryActorType']   = np.vectorize(name_to_cat.get)(df['Actor1Name'].values)
df['SecondaryActorType'] = np.vectorize(name_to_cat.get)(df['Actor2Name'].values)

# Override ProtestMotivation for nuance targets
nuance = ["Agriculture","Healthcare","Prison Reform","Media Reform"]
mask = df['SecondaryActorType'].isin(nuance)
df.loc[mask, 'ProtestMotivation'] = df.loc[mask, 'SecondaryActorType']

# Check data
(df['PrimaryActorType'].value_counts(normalize=True) * 100).round(2)  # coverage
df[['Actor2Name','SecondaryActorType','ProtestMotivation']].sample(10)

Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/6.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/6.3 MB[0m [31m4.6 MB/s[0m eta [36m0:00:02[0m[2K   [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/6.3 MB[0m [31m18.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m6.3/6.3 MB[0m [31m60.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycountry
Successfully installed pycountry-24.6.1


Unnamed: 0,Actor2Name,SecondaryActorType,ProtestMotivation
4739935,BARCELONA,Unknown,Policy Change
8857163,Unknown Actor 2,Unknown,Anti-Business
2929368,Unknown Actor 2,Unknown,Policy Change
9683616,WORKER,Civilians,Policy Change
7588861,SECURITY FORCE,Government,Anti-Discrimination
8486578,FRANKFORT,Unknown,Policy Change
9157062,UNITED STATES,Government,Policy Change
1911549,Unknown Actor 2,Unknown,Policy Change
6576149,PARLIAMENT,Government,General Protest
3045373,DEMONSTRATOR,Civilians,Policy Change


In [12]:
# Check results
df.head(15)

Unnamed: 0,SQLDATE,Actor1Name,Actor2Name,EventRootCode,EventCode,GoldsteinScale,AvgTone,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,COVID_Era,ProtestMotivation,PrimaryActorType,SecondaryActorType
0,2018-06-12,Unknown Actor 1,BOLIVIA,14,140,-6.5,-0.47,BL,-17.0,-65.0,Pre-COVID,General Protest,Unknown,Unknown
1,2018-10-18,Unknown Actor 1,GUINEA BISSAU,14,141,-6.5,0.21,GV,11.0,-10.0,Pre-COVID,Policy Change,Unknown,Government
2,2018-12-21,Unknown Actor 1,EMPLOYEE,14,140,-6.5,-6.28,HU,47.0,20.0,Pre-COVID,General Protest,Unknown,Unknown
3,2018-10-10,Unknown Actor 1,SOUTH KOREA,14,141,-6.5,-3.59,KS,37.0,127.5,Pre-COVID,Policy Change,Unknown,Government
4,2018-07-16,Unknown Actor 1,PARLIAMENT,14,141,-6.5,-0.24,SN,1.366667,103.8,Pre-COVID,Policy Change,Unknown,Government
5,2019-07-15,Unknown Actor 1,MINIST OF SECURITY,14,141,-6.5,-3.76,TO,8.0,1.166667,Pre-COVID,Policy Change,Unknown,Government
6,2019-02-16,Unknown Actor 1,PRESIDENT,14,140,-6.5,-6.82,HA,18.5392,-72.335,Pre-COVID,General Protest,Unknown,Government
7,2019-05-01,Unknown Actor 1,MACEDONIA,14,141,-6.5,-5.26,MK,42.0,21.4333,Pre-COVID,Policy Change,Unknown,Unknown
8,2018-12-25,Unknown Actor 1,POLICE,14,141,-6.5,-5.81,BK,43.9061,18.3208,Pre-COVID,Policy Change,Unknown,Government
9,2018-11-25,Unknown Actor 1,PARLIAMENT,14,141,-6.5,-9.22,AL,41.2314,20.1561,Pre-COVID,Policy Change,Unknown,Government


In [13]:
# Check the count of unknown actors in comparison to correctly categorized
uncategorized_actor_1 = df[df['PrimaryActorType'] == 'Unknown']['Actor1Name'].count()
uncategorized_actor_2 = df[df['SecondaryActorType'] == 'Unknown']['Actor2Name'].count()

print(f"Unknown actors in Actor1: {uncategorized_actor_1}")
print(f"Unknown actors in Actor2: {uncategorized_actor_2}")

total_actors = len(df)
print(f"Total actors: {total_actors}")

percent_uknown_actors_1 = (uncategorized_actor_1) / total_actors * 100
percent_uknown_actors_2 = (uncategorized_actor_2) / total_actors * 100

print(f"Percentage of unknown primary actors: {percent_uknown_actors_1:.2f}%")
print(f"Percentage of unknown secondary actors: {percent_uknown_actors_2:.2f}%")

Unknown actors in Actor1: 2426032
Unknown actors in Actor2: 4226430
Total actors: 9405730
Percentage of unknown primary actors: 25.79%
Percentage of unknown secondary actors: 44.93%


NER and phrase matching were a good first pass at categorizing the primary and secondary actor types, however, 48.95% unknown secondary actors is a bit high.

We can explore the top remaining actor labels and try to improve the matching. We can also use the CAMEO event codes as hints for what category the secondary actor belongs to. A '142' event code corresponds to anti-government protests, so it is very plausible that the secondary actor is the government.

In [14]:
# Check remaining top unknown actor 1 names
unknown1 = df[df['PrimaryActorType']=='Unknown']['Actor1Name'].value_counts().head(50)

unknown1.head(10)

Unnamed: 0_level_0,count
Actor1Name,Unnamed: 1_level_1
Unknown Actor 1,914488
RUSSIA,87512
RESIDENTS,66840
EMPLOYEE,41022
UKRAINIAN,33000
VENEZUELA,30646
VILLAGE,23894
DELHI,20648
COMPANIES,18546
GOVERNOR,18508


In [15]:
# Check remaining top unknown actor 2 names
unknown2 = df[df['SecondaryActorType']=='Unknown']['Actor2Name'].value_counts().head(50)

unknown2.head(10)

Unnamed: 0_level_0,count
Actor2Name,Unnamed: 1_level_1
Unknown Actor 2,3211058
RUSSIA,80330
VENEZUELA,26244
RESIDENTS,19834
TURKISH,19800
DELHI,19018
GOVERNOR,18492
EMPLOYEE,16734
UKRAINIAN,16390
VILLAGE,16218


Adding the additional categories has improved the completeness of the category matching for the primary actor by 8%, and the secondary by nearly 4%. I think it is unlikely to make significant strides with actor 2, considering a significant proportion of those left uncategorized are unknown. The country list has helped, but Russia, Venezuela, and Ukraine continue to be under matched. They likely will have to be manually inputted.