In [39]:
import pandas as pd

In [40]:
# load the cached GDELT data from its parquet file into a DataFrame
gdelt = pd.read_parquet('../cache/gdelt.parquet')

In [41]:
# peek at the first rows of the freshly loaded frame
gdelt.head()

Unnamed: 0,Date,Actor1Country,Actor1GeoCountry,Actor1Type,Actor2Country,Actor2GeoCountry,Actor2Type,ActionCountry,EventRootCode,QuadClass,GoldsteinScale,NumSources,NumArticles,AvgTone,Source
0,2019-01-01,,,CVL,,,,,5,1,3.4,4,50,2.354384,https://telegrafi.com/ne-shenj-proteste-labino...
1,2019-01-01,MDV,US,,USA,US,,US,19,4,-10.0,2,12,-4.195804,http://www.wgow.com/news/manhunt-on-in-texas-f...
2,2019-01-01,,US,GOV,,US,GOV,US,2,1,3.2,10,20,1.734061,https://hanfordsentinel.com/news/national/govt...
3,2019-01-01,RUS,RS,,,RS,GOV,RS,6,2,6.0,3,30,-0.427182,http://www.gp.se/nyheter/v%C3%A4rlden/sju-d%C3...
4,2019-01-01,USA,US,,,,,US,2,1,3.0,8,23,-1.376241,http://www.790wpic.com/news/a-familys-whirlpoo...


In [42]:
# list the column labels of the loaded frame
gdelt.columns

Index(['Date', 'Actor1Country', 'Actor1GeoCountry', 'Actor1Type',
       'Actor2Country', 'Actor2GeoCountry', 'Actor2Type', 'ActionCountry',
       'EventRootCode', 'QuadClass', 'GoldsteinScale', 'NumSources',
       'NumArticles', 'AvgTone', 'Source'],
      dtype='object')

In [43]:
# cast every categorical column to pandas' category dtype
categorical_columns = [
    'Actor1Country', 'Actor1GeoCountry', 'Actor1Type',
    'Actor2Country', 'Actor2GeoCountry', 'Actor2Type',
    'ActionCountry', 'EventRootCode', 'QuadClass',
]
for column in categorical_columns:
    gdelt[column] = gdelt[column].astype('category')

In [44]:
# datetime to unix timestamp (whole seconds since 1970-01-01)
# Subtracting the epoch and floor-dividing by a one-second Timedelta yields
# seconds whatever datetime64 resolution (s/ms/us/ns) the column has; casting
# to int64 and dividing by 10**9 is only correct for nanosecond data.
# Assumes Date is timezone-naive, as the datetime64[ns] dtype shown below suggests.
gdelt['Timestamp'] = (gdelt['Date'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)

In [45]:
# confirm the column dtypes after the category casts and the new Timestamp column
gdelt.dtypes

Date                datetime64[ns]
Actor1Country             category
Actor1GeoCountry          category
Actor1Type                category
Actor2Country             category
Actor2GeoCountry          category
Actor2Type                category
ActionCountry             category
EventRootCode             category
QuadClass                 category
GoldsteinScale             float64
NumSources                   int64
NumArticles                  int64
AvgTone                    float64
Source                      object
Timestamp                    int64
dtype: object

In [46]:
# swap the numeric QuadClass codes for readable labels
# labels are listed in code order, so enumerate(start=1) pairs code 1..4 with them
quad_class_labels = dict(enumerate(
    ['VerbalCoop', 'MaterialCoop', 'VerbalConf', 'MaterialConf'],
    start=1,
))
gdelt['QuadClass'] = gdelt['QuadClass'].map(quad_class_labels)

In [47]:
# turn the event root codes into readable labels and rename the column to EventType

# labels in code order: the first entry belongs to code 1, the last to code 20
event_type_names = [
    'Statement', 'Appeal', 'Intent to Coop', 'Consult', 'Diplom Coop',
    'Material Coop', 'Aid', 'Yield', 'Investigate', 'Demand',
    'Disapprove', 'Reject', 'Threaten', 'Protest', 'Demonstrate Force',
    'Reduce Relations', 'Coerce', 'Assault', 'Fight', 'Mass Violence',
]
event_type_by_code = dict(enumerate(event_type_names, start=1))

# the codes must be numeric to match the integer keys; values that cannot be
# parsed become NaN (errors='coerce') and therefore stay unlabeled
numeric_codes = pd.to_numeric(gdelt['EventRootCode'], errors='coerce')
gdelt['EventRootCode'] = numeric_codes.map(event_type_by_code)
gdelt.rename(columns={'EventRootCode': 'EventType'}, inplace=True)

# store the labels as a categorical again
gdelt['EventType'] = gdelt['EventType'].astype('category')

In [48]:
# count every EventType label, NaN included, listed from rarest to most common
# there are 3 NaN values in EventType, so we can drop them
gdelt['EventType'].value_counts(dropna=False)[::-1]

EventType
NaN                        3
Mass Violence           1609
Demonstrate Force      20461
Protest                59635
Reduce Relations       60881
Demand                 69028
Threaten               80823
Assault                81196
Material Coop         106830
Investigate           136693
Reject                146954
Aid                   174983
Yield                 202861
Coerce                255777
Disapprove            305588
Fight                 345688
Intent to Coop        368722
Appeal                418795
Diplom Coop           419667
Statement             648289
Consult              1267084
Name: count, dtype: int64

In [49]:
# remove the rows whose EventType is missing; inplace=True modifies gdelt itself
gdelt.dropna(subset=['EventType'], inplace=True)

In [50]:
# generalize actor types
actor_types = {
    'GOV': ['GOV', 'COP', 'MIL', 'JUD', 'SPY', 'ELI', 'LEG'],
    'Opposition': ['OPP', 'INS', 'REB', 'SEP', 'CRM'],
    'Business': ['BUS', 'AGR', 'DEV', 'LAB'],
    'Benevolent': ['EDU', 'MED', 'HRI', 'HLH', 'ENV'],
}

# flat reverse index (specific code -> general group), built once from actor_types
actor_type_to_group = {
    code: group
    for group, codes in actor_types.items()
    for code in codes
}


def generalize_actor_type(actor_type):
    """Map a specific actor type code to its general group.

    Parameters
    ----------
    actor_type : str or missing value
        Actor type code such as 'MIL' or 'BUS'.

    Returns
    -------
    The group name from ``actor_types`` that lists the code, 'Other' for a
    code that no group lists, or the input itself when it is missing
    (None / NaN).
    """
    # A missing actor type is not an "Other" actor. Passing it through keeps
    # the result independent of whether pandas skips NaN when applying the
    # function (it does for categorical columns, not for object columns).
    if pd.isna(actor_type):
        return actor_type
    return actor_type_to_group.get(actor_type, 'Other')

In [51]:
# derive the generalized type column for each of the two actors
for actor in ('Actor1', 'Actor2'):
    gdelt[f'{actor}TypeGeneral'] = gdelt[f'{actor}Type'].apply(generalize_actor_type)

In [52]:
# distribution of the generalized Actor1 types (value_counts omits NaN by default)
gdelt['Actor1TypeGeneral'].value_counts()

Actor1TypeGeneral
GOV           1582931
Other          364790
Benevolent     338947
Business       291860
Opposition     125044
Name: count, dtype: int64

In [53]:
# distribution of the generalized Actor2 types (value_counts omits NaN by default)
gdelt['Actor2TypeGeneral'].value_counts()

Actor2TypeGeneral
GOV           972947
Other         293756
Benevolent    258098
Business      205176
Opposition     99977
Name: count, dtype: int64

In [54]:
# count the rows with a missing GoldsteinScale: there are 23 of them,
# a very small number, so we can drop them
len(gdelt[gdelt['GoldsteinScale'].isna()])

23

In [55]:
# remove the rows whose GoldsteinScale is missing; inplace=True modifies gdelt itself
gdelt.dropna(subset=['GoldsteinScale'], inplace=True)

In [56]:
# # replace countries not in the top 20 with 'Other'
# top_countries = gdelt['Actor1Country'].value_counts().head(20).index
# gdelt['Actor1Country'] = gdelt['Actor1Country'].apply(lambda x: x if x in top_countries else 'Other')
# gdelt['Actor2Country'] = gdelt['Actor2Country'].apply(lambda x: x if x in top_countries else 'Other')

In [57]:
# # replace geo countries not in the top 20 with 'Other'
# top_geo_countries = gdelt['Actor1GeoCountry'].value_counts().head(20).index
# gdelt['Actor1GeoCountry'] = gdelt['Actor1GeoCountry'].apply(lambda x: x if x in top_geo_countries else 'Other')
# gdelt['Actor2GeoCountry'] = gdelt['Actor2GeoCountry'].apply(lambda x: x if x in top_geo_countries else 'Other')

In [58]:
# # replace action countries not in the top 20 with 'Other'
# top_action_countries = gdelt['ActionCountry'].value_counts().head(20).index
# gdelt['ActionCountry'] = gdelt['ActionCountry'].apply(lambda x: x if x in top_action_countries else 'Other')

In [59]:
# how many events are covered by more than 500 articles?
len(gdelt[gdelt['NumArticles'] > 500])

99

There are only 99 events with more than 500 articles, so we cap the number of articles at 500 to limit the influence of these outliers.

In [60]:
# limit NumArticles to the chosen ceiling; smaller values are left untouched
article_cap = 500
gdelt['NumArticles'] = gdelt['NumArticles'].clip(upper=article_cap)

In [61]:
# reorder columns group by group: date, first actor, second actor, event, reaction
date_cols = ['Date', 'Timestamp']
actor1_cols = ['Actor1Country', 'Actor1GeoCountry', 'Actor1Type', 'Actor1TypeGeneral']
actor2_cols = ['Actor2Country', 'Actor2GeoCountry', 'Actor2Type', 'Actor2TypeGeneral']
event_cols = ['ActionCountry', 'EventType', 'QuadClass', 'GoldsteinScale']
reaction_cols = ['NumSources', 'NumArticles', 'AvgTone', 'Source']

gdelt = gdelt[date_cols + actor1_cols + actor2_cols + event_cols + reaction_cols]

In [62]:
from funcs.advanced_describe import advanced_describe

# per-column summary produced by the project's advanced_describe helper;
# Date, Timestamp and Source are passed as drop_cols (presumably the columns
# to leave out of the summary - confirm against the helper)
advanced_describe(gdelt, drop_cols=['Date', 'Timestamp', 'Source'])

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std,present,share,dtype,sample
Actor1Country,2321230.0,218.0,USA,497531.0,,,,,,,,44%,9.62%,category,SSD
Actor1GeoCountry,4303791.0,247.0,US,863494.0,,,,,,,,83%,16.70%,category,SU
Actor1Type,2703561.0,32.0,GOV,851770.0,,,,,,,,52%,16.47%,category,
Actor1TypeGeneral,2703561.0,5.0,GOV,1582925.0,,,,,,,,52%,30.61%,object,
Actor2Country,1693312.0,218.0,USA,321260.0,,,,,,,,32%,6.21%,category,UGA
Actor2GeoCountry,3040951.0,247.0,US,593587.0,,,,,,,,58%,11.48%,category,UG
Actor2Type,1829951.0,32.0,GOV,525056.0,,,,,,,,35%,10.15%,category,
Actor2TypeGeneral,1829951.0,5.0,GOV,972944.0,,,,,,,,35%,18.81%,object,
ActionCountry,4861692.0,249.0,US,971085.0,,,,,,,,94%,18.78%,category,UG
EventType,5171541.0,20.0,Consult,1267084.0,,,,,,,,100%,24.50%,category,Material Coop


In [63]:
# preview the first rows with the new column order and the derived columns
gdelt.head()

Unnamed: 0,Date,Timestamp,Actor1Country,Actor1GeoCountry,Actor1Type,Actor1TypeGeneral,Actor2Country,Actor2GeoCountry,Actor2Type,Actor2TypeGeneral,ActionCountry,EventType,QuadClass,GoldsteinScale,NumSources,NumArticles,AvgTone,Source
0,2019-01-01,1546300800,,,CVL,Other,,,,,,Diplom Coop,VerbalCoop,3.4,4,50,2.354384,https://telegrafi.com/ne-shenj-proteste-labino...
1,2019-01-01,1546300800,MDV,US,,,USA,US,,,US,Fight,MaterialConf,-10.0,2,12,-4.195804,http://www.wgow.com/news/manhunt-on-in-texas-f...
2,2019-01-01,1546300800,,US,GOV,GOV,,US,GOV,GOV,US,Appeal,VerbalCoop,3.2,10,20,1.734061,https://hanfordsentinel.com/news/national/govt...
3,2019-01-01,1546300800,RUS,RS,,,,RS,GOV,GOV,RS,Material Coop,MaterialCoop,6.0,3,30,-0.427182,http://www.gp.se/nyheter/v%C3%A4rlden/sju-d%C3...
4,2019-01-01,1546300800,USA,US,,,,,,,US,Appeal,VerbalCoop,3.0,8,23,-1.376241,http://www.790wpic.com/news/a-familys-whirlpoo...


After analyzing the impact of the different features on the BTC price, we can drop some columns and remove some of the less frequent values.

In [64]:
# list the columns currently in the frame, before any are dropped
gdelt.columns

Index(['Date', 'Timestamp', 'Actor1Country', 'Actor1GeoCountry', 'Actor1Type',
       'Actor1TypeGeneral', 'Actor2Country', 'Actor2GeoCountry', 'Actor2Type',
       'Actor2TypeGeneral', 'ActionCountry', 'EventType', 'QuadClass',
       'GoldsteinScale', 'NumSources', 'NumArticles', 'AvgTone', 'Source'],
      dtype='object')

In [65]:
# columns that are left out of the cleaned dataset
dropped_columns = ['Source', 'Actor1TypeGeneral', 'Actor2TypeGeneral', 'QuadClass']
gdelt = gdelt.drop(columns=dropped_columns)

In [66]:
# preview the first rows of the frame that is about to be saved
gdelt.head()

Unnamed: 0,Date,Timestamp,Actor1Country,Actor1GeoCountry,Actor1Type,Actor2Country,Actor2GeoCountry,Actor2Type,ActionCountry,EventType,GoldsteinScale,NumSources,NumArticles,AvgTone
0,2019-01-01,1546300800,,,CVL,,,,,Diplom Coop,3.4,4,50,2.354384
1,2019-01-01,1546300800,MDV,US,,USA,US,,US,Fight,-10.0,2,12,-4.195804
2,2019-01-01,1546300800,,US,GOV,,US,GOV,US,Appeal,3.2,10,20,1.734061
3,2019-01-01,1546300800,RUS,RS,,,RS,GOV,RS,Material Coop,6.0,3,30,-0.427182
4,2019-01-01,1546300800,USA,US,,,,,US,Appeal,3.0,8,23,-1.376241


In [67]:
# save cleaned gdelt as parquet in the same cache directory the raw file was read from
gdelt.to_parquet('../cache/gdelt_cleaned.parquet')