In [14]:
import pandas as pd

# Load the `merged` frame from the parquet cache file (path is relative to
# the notebook's working directory).
merged = pd.read_parquet('../cache/merged.parquet')

In [15]:
# Inspect the dtype of every column in `merged`.
merged.dtypes

Date                datetime64[ns]
Timestamp                    int64
Actor1Country               object
Actor1GeoCountry            object
Actor1Type                  object
Actor2Country               object
Actor2GeoCountry            object
Actor2Type                  object
ActionCountry               object
EventType                   object
GoldsteinScale             float64
NumSources                   int64
NumArticles                  int64
AvgTone                    float64
Magnitude                  float64
Impact                     float64
Impact_bin                  object
pct_change_15min           float64
pct_change_30min           float64
pct_change_24h             float64
dtype: object

In [16]:
# List the column labels of `merged`.
merged.columns

Index(['Date', 'Timestamp', 'Actor1Country', 'Actor1GeoCountry', 'Actor1Type',
       'Actor2Country', 'Actor2GeoCountry', 'Actor2Type', 'ActionCountry',
       'EventType', 'GoldsteinScale', 'NumSources', 'NumArticles', 'AvgTone',
       'Magnitude', 'Impact', 'Impact_bin', 'pct_change_15min',
       'pct_change_30min', 'pct_change_24h'],
      dtype='object')

In [17]:
# Gather the names of the int64/float64 columns (this list is reused later
# when choosing which columns to scale) and report how many distinct values
# each of them holds.
numerical_columns = list(merged.select_dtypes(include=['int64', 'float64']).columns)
for col in numerical_columns:
    print(f"{col} {merged[col].nunique()}")

Timestamp 171801
GoldsteinScale 42
NumSources 50
NumArticles 354
AvgTone 2354698
Magnitude 2951
Impact 4441
pct_change_15min 169341
pct_change_30min 169919
pct_change_24h 171440


In [18]:
# Gather the names of the object/category columns and report how many
# distinct values each of them holds.
categorical_columns = list(merged.select_dtypes(include=['object', 'category']).columns)
for col in categorical_columns:
    print(f"{col} {merged[col].nunique()}")

Actor1Country 218
Actor1GeoCountry 247
Actor1Type 32
Actor2Country 218
Actor2GeoCountry 247
Actor2Type 32
ActionCountry 249
EventType 20
Impact_bin 7


In [19]:
# Label-encode the categorical feature columns into integer codes.
from sklearn.preprocessing import LabelEncoder

# Columns to encode. 'Impact_bin' is not in this list, so it stays as strings.
columns_to_encode = [
    'Actor1Country', 'Actor1GeoCountry', 'Actor1Type',
    'Actor2Country', 'Actor2GeoCountry', 'Actor2Type',
    'ActionCountry', 'EventType',
]

# Work on a copy so the raw `merged` frame stays available for inspection.
merged_encoded = merged.copy()

# Keep one fitted encoder per column. Re-fitting a single shared encoder
# would leave only the last column's classes available, making it impossible
# to map the codes of the other columns back to their labels
# (`label_encoders[col].inverse_transform(...)`) or to encode new data with
# the same mapping.
# NOTE(review): the previews suggest missing values receive their own
# (highest) code, e.g. code 218 in a column reporting 218 distinct values —
# confirm this is the intended treatment of missing actors/countries.
label_encoders = {}
for col in columns_to_encode:
    le = LabelEncoder()
    merged_encoded[col] = le.fit_transform(merged[col])
    label_encoders[col] = le

In [20]:
# Preview the first rows of the label-encoded frame.
merged_encoded.head()

Unnamed: 0,Date,Timestamp,Actor1Country,Actor1GeoCountry,Actor1Type,Actor2Country,Actor2GeoCountry,Actor2Type,ActionCountry,EventType,GoldsteinScale,NumSources,NumArticles,AvgTone,Magnitude,Impact,Impact_bin,pct_change_15min,pct_change_30min,pct_change_24h
0,2019-01-01,1546300800,218,247,4,218,247,32,249,7,3.4,4,50,2.354384,6.85,2.33,Positive,-0.033061,-0.226363,-2.433464
1,2019-01-01,1546300800,27,30,32,218,247,32,30,10,4.0,3,30,-0.339542,3.34,1.34,Slightly Positive,-0.033061,-0.226363,-2.433464
2,2019-01-01,1546300800,218,228,9,218,227,9,229,1,3.2,10,40,1.734061,8.73,2.79,Positive,-0.033061,-0.226363,-2.433464
3,2019-01-01,1546300800,204,228,32,218,247,32,229,8,-2.0,3,21,-4.170072,6.72,-1.34,Slightly Negative,-0.033061,-0.226363,-2.433464
4,2019-01-01,1546300800,204,228,6,218,247,32,229,4,1.9,6,18,1.415701,5.32,1.01,Slightly Positive,-0.033061,-0.226363,-2.433464


In [21]:
# Use the 'Date' column as the index.
# The reassignment is guarded and non-inplace so that re-running this cell is
# a no-op; an unconditional `set_index('Date', inplace=True)` raises KeyError
# on a second run because 'Date' is no longer a column.
if 'Date' in merged_encoded.columns:
    merged_encoded = merged_encoded.set_index('Date')
merged_encoded.head()

Unnamed: 0_level_0,Timestamp,Actor1Country,Actor1GeoCountry,Actor1Type,Actor2Country,Actor2GeoCountry,Actor2Type,ActionCountry,EventType,GoldsteinScale,NumSources,NumArticles,AvgTone,Magnitude,Impact,Impact_bin,pct_change_15min,pct_change_30min,pct_change_24h
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2019-01-01,1546300800,218,247,4,218,247,32,249,7,3.4,4,50,2.354384,6.85,2.33,Positive,-0.033061,-0.226363,-2.433464
2019-01-01,1546300800,27,30,32,218,247,32,30,10,4.0,3,30,-0.339542,3.34,1.34,Slightly Positive,-0.033061,-0.226363,-2.433464
2019-01-01,1546300800,218,228,9,218,227,9,229,1,3.2,10,40,1.734061,8.73,2.79,Positive,-0.033061,-0.226363,-2.433464
2019-01-01,1546300800,204,228,32,218,247,32,229,8,-2.0,3,21,-4.170072,6.72,-1.34,Slightly Negative,-0.033061,-0.226363,-2.433464
2019-01-01,1546300800,204,228,6,218,247,32,229,4,1.9,6,18,1.415701,5.32,1.01,Slightly Positive,-0.033061,-0.226363,-2.433464


In [22]:
# Check the column dtypes of the encoded, date-indexed frame.
merged_encoded.dtypes

Timestamp             int64
Actor1Country         int32
Actor1GeoCountry      int32
Actor1Type            int32
Actor2Country         int32
Actor2GeoCountry      int32
Actor2Type            int32
ActionCountry         int32
EventType             int32
GoldsteinScale      float64
NumSources            int64
NumArticles           int64
AvgTone             float64
Magnitude           float64
Impact              float64
Impact_bin           object
pct_change_15min    float64
pct_change_30min    float64
pct_change_24h      float64
dtype: object

In [23]:
# convert bool columns to int
# NOTE(review): the dtypes listing printed just before this cell shows no
# bool columns, so on this data the selection is presumably empty and the
# assignment changes nothing — confirm whether this cell is still needed.
bool_columns = merged_encoded.select_dtypes(include='bool').columns
merged_encoded[bool_columns] = merged_encoded[bool_columns].astype(int)

In [24]:
# Persist the encoded frame (before any scaling is applied) to the parquet cache.
merged_encoded.to_parquet("../cache/encoded.parquet")

In [25]:
# Standardise the numeric feature columns of `merged_encoded` in place.
from sklearn.preprocessing import StandardScaler

# The pct_change_* columns are kept on their original scale
# (presumably they are the prediction targets — confirm).
columns_to_exclude = ['pct_change_15min', 'pct_change_30min', 'pct_change_24h']
numerical_columns_to_scale = list(
    filter(lambda name: name not in columns_to_exclude, numerical_columns)
)

# NOTE(review): the scaler is fitted on the whole frame; if a train/test
# split happens later, the fitted statistics include the test rows.
scaler = StandardScaler()
scaler.fit(merged_encoded[numerical_columns_to_scale])
merged_encoded[numerical_columns_to_scale] = scaler.transform(
    merged_encoded[numerical_columns_to_scale]
)

In [26]:
# Preview the first rows after standard scaling.
merged_encoded.head()

Unnamed: 0_level_0,Timestamp,Actor1Country,Actor1GeoCountry,Actor1Type,Actor2Country,Actor2GeoCountry,Actor2Type,ActionCountry,EventType,GoldsteinScale,NumSources,NumArticles,AvgTone,Magnitude,Impact,Impact_bin,pct_change_15min,pct_change_30min,pct_change_24h
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2019-01-01,-1.530752,218,247,4,218,247,32,249,7,0.605751,0.889228,2.004333,1.164239,0.375252,0.714246,Positive,-0.033061,-0.226363,-2.433464
2019-01-01,-1.530752,27,30,32,218,247,32,30,10,0.731167,0.36707,0.597142,0.468568,-0.71682,0.425568,Slightly Positive,-0.033061,-0.226363,-2.433464
2019-01-01,-1.530752,218,228,9,218,227,9,229,1,0.563945,4.022178,1.300737,1.004049,0.96018,0.848379,Positive,-0.033061,-0.226363,-2.433464
2019-01-01,-1.530752,204,228,32,218,247,32,229,8,-0.523,0.36707,-0.036094,-0.520616,0.334805,-0.355905,Slightly Negative,-0.033061,-0.226363,-2.433464
2019-01-01,-1.530752,204,228,6,218,247,32,229,4,0.292209,1.933545,-0.247173,0.921836,-0.100779,0.329342,Slightly Positive,-0.033061,-0.226363,-2.433464


In [27]:
# Persist the encoded and scaled frame to the parquet cache.
merged_encoded.to_parquet("../cache/encoded_scaled.parquet")