In [1]:
import pandas as pd

# Load the merged dataset from the local parquet cache (path is relative to this notebook).
merged = pd.read_parquet('../cache/merged.parquet')

In [2]:
# Show the dtype of every column in the loaded frame.
merged.dtypes

Date                 datetime64[ns]
Timestamp                     int64
Actor1Country                object
Actor1GeoCountry             object
Actor1Type                 category
Actor1TypeGeneral            object
Actor2Country                object
Actor2GeoCountry             object
Actor2Type                 category
Actor2TypeGeneral            object
ActionCountry                object
EventType                  category
QuadClass                  category
GoldsteinScale              float64
NumSources                    int64
NumArticles                   int64
AvgTone                     float64
Source                       object
pct_change_15min            float64
pct_change_30min            float64
pct_change_24h              float64
dtype: object

In [3]:
# List the column names of the loaded frame.
merged.columns

Index(['Date', 'Timestamp', 'Actor1Country', 'Actor1GeoCountry', 'Actor1Type',
       'Actor1TypeGeneral', 'Actor2Country', 'Actor2GeoCountry', 'Actor2Type',
       'Actor2TypeGeneral', 'ActionCountry', 'EventType', 'QuadClass',
       'GoldsteinScale', 'NumSources', 'NumArticles', 'AvgTone', 'Source',
       'pct_change_15min', 'pct_change_30min', 'pct_change_24h'],
      dtype='object')

In [4]:
# Cardinality check for the numeric features: collect the int64/float64
# column names, then print "<column> <number of distinct values>" for each.
numerical_columns = list(merged.select_dtypes(include=['int64', 'float64']).columns)
for col, distinct_count in merged[numerical_columns].nunique().items():
    print(col, distinct_count)

Timestamp 171801
GoldsteinScale 42
NumSources 50
NumArticles 415
AvgTone 2354698
pct_change_15min 169341
pct_change_30min 169919
pct_change_24h 171440


In [5]:
# Cardinality check for the non-numeric features: collect the object/category
# column names, then print "<column> <number of distinct values>" for each.
categorical_columns = list(merged.select_dtypes(include=['object', 'category']).columns)
for col, distinct_count in merged[categorical_columns].nunique().items():
    print(col, distinct_count)

Actor1Country 21
Actor1GeoCountry 21
Actor1Type 32
Actor1TypeGeneral 5
Actor2Country 21
Actor2GeoCountry 21
Actor2Type 32
Actor2TypeGeneral 5
ActionCountry 21
EventType 20
QuadClass 4
Source 3225243


In [6]:
# One-hot encode QuadClass only: the column is replaced by one
# QuadClass_<value> indicator column per class; all other columns pass through.
merged_encoded = pd.get_dummies(data=merged, columns=['QuadClass'])

In [7]:
# Cast every boolean column of the encoded frame to integer 0/1,
# one column at a time.
bool_columns = merged_encoded.select_dtypes(include='bool').columns
for flag_col in bool_columns:
    merged_encoded[flag_col] = merged_encoded[flag_col].astype(int)

In [8]:
# Label-encode the remaining categorical columns: each listed column is
# replaced in place by integer codes.
from sklearn.preprocessing import LabelEncoder

columns_to_encode = [
    'Actor1Country', 'Actor1GeoCountry', 'Actor1Type', 'Actor1TypeGeneral',
    'Actor2Country', 'Actor2GeoCountry', 'Actor2Type', 'Actor2TypeGeneral',
    'ActionCountry', 'EventType',
]

# Fit one encoder per column and keep every fitted encoder. Reusing a single
# LabelEncoder across columns overwrites its fitted classes on each
# iteration, so only the last column's mapping would survive and the codes
# written to disk could not be decoded. With the dict, any column can be
# mapped back via label_encoders[col].inverse_transform(codes) or inspected
# via label_encoders[col].classes_.
# NOTE(review): every column is fitted independently, so the same label may
# receive different codes in different columns (e.g. Actor1Country vs
# Actor2Country) — confirm this is acceptable downstream.
label_encoders = {}
for col in columns_to_encode:
    le = LabelEncoder()
    merged_encoded[col] = le.fit_transform(merged_encoded[col])
    label_encoders[col] = le

In [9]:
# Preview the first rows of the encoded frame.
merged_encoded.head()

Unnamed: 0,Date,Timestamp,Actor1Country,Actor1GeoCountry,Actor1Type,Actor1TypeGeneral,Actor2Country,Actor2GeoCountry,Actor2Type,Actor2TypeGeneral,...,NumArticles,AvgTone,Source,pct_change_15min,pct_change_30min,pct_change_24h,QuadClass_VerbalCoop,QuadClass_MaterialCoop,QuadClass_VerbalConf,QuadClass_MaterialConf
0,2019-01-01,1546300800,21,21,4,4,21,21,32,5,...,50,2.354384,https://telegrafi.com/ne-shenj-proteste-labino...,-0.033061,-0.226363,-2.433464,1,0,0,0
1,2019-01-01,1546300800,16,1,32,5,21,21,32,5,...,30,-0.339542,http://www.jornaldeluzilandia.com.br/txt.php?i...,-0.033061,-0.226363,-2.433464,1,0,0,0
2,2019-01-01,1546300800,21,20,9,2,21,20,9,2,...,40,1.734061,https://hanfordsentinel.com/news/national/govt...,-0.033061,-0.226363,-2.433464,1,0,0,0
3,2019-01-01,1546300800,20,20,32,5,21,21,32,5,...,21,-4.170072,http://midutahradio.com/news/national-news/lou...,-0.033061,-0.226363,-2.433464,0,0,1,0
4,2019-01-01,1546300800,20,20,6,0,21,21,32,5,...,18,1.415701,http://www.q106dot5.com/news/blind-baker-launc...,-0.033061,-0.226363,-2.433464,1,0,0,0


In [10]:
# Show the dtype of every column in the encoded frame.
merged_encoded.dtypes

Date                      datetime64[ns]
Timestamp                          int64
Actor1Country                      int64
Actor1GeoCountry                   int64
Actor1Type                         int64
Actor1TypeGeneral                  int64
Actor2Country                      int64
Actor2GeoCountry                   int64
Actor2Type                         int64
Actor2TypeGeneral                  int64
ActionCountry                      int64
EventType                          int64
GoldsteinScale                   float64
NumSources                         int64
NumArticles                        int64
AvgTone                          float64
Source                            object
pct_change_15min                 float64
pct_change_30min                 float64
pct_change_24h                   float64
QuadClass_VerbalCoop               int64
QuadClass_MaterialCoop             int64
QuadClass_VerbalConf               int64
QuadClass_MaterialConf             int64
dtype: object

In [11]:
# Write the encoded frame to the local parquet cache (path is relative to this notebook).
merged_encoded.to_parquet("../cache/encoded.parquet")