In [1]:
import pandas as pd

# Load the four cached "merged" frames (one parquet file per quantile tag in
# the file name: 0q, 50q, 90q, 99q) in a single unpacking assignment.
merged_0q, merged_50q, merged_90q, merged_99q = map(
    pd.read_parquet,
    (
        '../cache/merged_0q.parquet',
        '../cache/merged_50q.parquet',
        '../cache/merged_90q.parquet',
        '../cache/merged_99q.parquet',
    ),
)

In [2]:
# Display the dtype of every column in merged_0q before any encoding is applied.
merged_0q.dtypes

Date                datetime64[ns]
Timestamp                    int64
Actor1Country               object
Actor1GeoCountry            object
Actor1Type                  object
Actor2Country               object
Actor2GeoCountry            object
Actor2Type                  object
ActionCountry               object
EventType                 category
GoldsteinScale             float64
NumSources                   int64
NumArticles                  int64
AvgTone                    float64
Magnitude                  float64
Impact                     float64
Impact_bin                  object
pct_change_15min           float64
pct_change_30min           float64
pct_change_24h             float64
AbsChange                  float64
dtype: object

In [3]:
# Display the column labels of merged_0q.
merged_0q.columns

Index(['Date', 'Timestamp', 'Actor1Country', 'Actor1GeoCountry', 'Actor1Type',
       'Actor2Country', 'Actor2GeoCountry', 'Actor2Type', 'ActionCountry',
       'EventType', 'GoldsteinScale', 'NumSources', 'NumArticles', 'AvgTone',
       'Magnitude', 'Impact', 'Impact_bin', 'pct_change_15min',
       'pct_change_30min', 'pct_change_24h', 'AbsChange'],
      dtype='object')

In [4]:
# Collect the int64/float64 columns of merged_0q and print how many distinct
# values each one holds. `numerical_columns` is reused later when scaling.
numerical_columns = list(merged_0q.select_dtypes(include=['int64', 'float64']).columns)
for column_name, distinct_count in merged_0q[numerical_columns].nunique().items():
    print(column_name, distinct_count)

Timestamp 171801
GoldsteinScale 42
NumSources 50
NumArticles 354
AvgTone 2354698
Magnitude 2951
Impact 4441
pct_change_15min 169341
pct_change_30min 169919
pct_change_24h 171440
AbsChange 168911


In [5]:
# Collect the object/category columns of merged_0q and print the number of
# distinct values found in each of them.
categorical_frame = merged_0q.select_dtypes(include=['object', 'category'])
categorical_columns = list(categorical_frame.columns)
for column_name in categorical_columns:
    distinct_count = categorical_frame[column_name].nunique()
    print(column_name, distinct_count)

Actor1Country 9
Actor1GeoCountry 9
Actor1Type 9
Actor2Country 9
Actor2GeoCountry 9
Actor2Type 9
ActionCountry 9
EventType 9
Impact_bin 7


In [6]:
# Label-encode the categorical feature columns so that all four frames share
# the same label -> integer mapping. 'Impact_bin' is not in the list below and
# therefore keeps its string labels.
from sklearn.preprocessing import LabelEncoder

columns_to_encode = [
    'Actor1Country', 'Actor1GeoCountry', 'Actor1Type',
    'Actor2Country', 'Actor2GeoCountry', 'Actor2Type',
    'ActionCountry', 'EventType',
]

# Encode into copies so the raw merged_* frames stay untouched.
merged_0q_encoded = merged_0q.copy()
merged_50q_encoded = merged_50q.copy()
merged_90q_encoded = merged_90q.copy()
merged_99q_encoded = merged_99q.copy()

# Combine all data for fitting the LabelEncoder: a label that occurs in any of
# the four frames must be known to the encoder before transform() is called.
combined_data = pd.concat([merged_0q, merged_50q, merged_90q, merged_99q])

# Keep one fitted encoder per column. Previously a single encoder was refit for
# every column, so only the last column's mapping survived and the integer
# codes of the other columns could not be mapped back to their labels.
# Use label_encoders[col].inverse_transform(codes) to recover the labels.
label_encoders = {}

for col in columns_to_encode:
    # `le` is still bound to the most recently fitted encoder after the loop,
    # exactly as before.
    le = LabelEncoder().fit(combined_data[col])
    label_encoders[col] = le
    for raw_frame, encoded_frame in (
        (merged_0q, merged_0q_encoded),
        (merged_50q, merged_50q_encoded),
        (merged_90q, merged_90q_encoded),
        (merged_99q, merged_99q_encoded),
    ):
        encoded_frame[col] = le.transform(raw_frame[col])
    

In [7]:
# Preview the first rows of merged_0q_encoded to check the label-encoded columns.
merged_0q_encoded.head()

Unnamed: 0,Date,Timestamp,Actor1Country,Actor1GeoCountry,Actor1Type,Actor2Country,Actor2GeoCountry,Actor2Type,ActionCountry,EventType,...,NumSources,NumArticles,AvgTone,Magnitude,Impact,Impact_bin,pct_change_15min,pct_change_30min,pct_change_24h,AbsChange
0,2019-01-01,1546300800,9,13,9,13,11,9,11,10,...,4,50,2.354384,6.85,2.33,Positive,-0.033061,-0.226363,-2.433464,0.226363
1,2019-01-01,1546300800,9,13,9,13,11,9,11,10,...,3,30,-0.339542,3.34,1.34,Slightly Positive,-0.033061,-0.226363,-2.433464,0.226363
2,2019-01-01,1546300800,9,20,9,13,11,9,20,1,...,10,40,1.734061,8.73,2.79,Positive,-0.033061,-0.226363,-2.433464,0.226363
3,2019-01-01,1546300800,9,20,9,13,11,9,20,10,...,3,21,-4.170072,6.72,-1.34,Slightly Negative,-0.033061,-0.226363,-2.433464,0.226363
4,2019-01-01,1546300800,9,20,9,13,11,9,20,10,...,6,18,1.415701,5.32,1.01,Slightly Positive,-0.033061,-0.226363,-2.433464,0.226363


In [8]:
# Change the index of every encoded frame to the 'Date' column.
def _index_by_date(frame):
    """Return `frame` indexed by its 'Date' column.

    A frame whose index is already named 'Date' is returned unchanged, so this
    cell can be re-run safely: after the first run 'Date' is no longer a
    column and a second set_index('Date') would raise KeyError.
    A frame that has neither a 'Date' index nor a 'Date' column still raises
    KeyError, so a genuinely missing column is not hidden.
    """
    if frame.index.name == 'Date':
        return frame
    return frame.set_index('Date')


# Re-bind the names instead of mutating with inplace=True.
merged_0q_encoded = _index_by_date(merged_0q_encoded)
merged_50q_encoded = _index_by_date(merged_50q_encoded)
merged_90q_encoded = _index_by_date(merged_90q_encoded)
merged_99q_encoded = _index_by_date(merged_99q_encoded)
merged_0q_encoded.head()

Unnamed: 0_level_0,Timestamp,Actor1Country,Actor1GeoCountry,Actor1Type,Actor2Country,Actor2GeoCountry,Actor2Type,ActionCountry,EventType,GoldsteinScale,NumSources,NumArticles,AvgTone,Magnitude,Impact,Impact_bin,pct_change_15min,pct_change_30min,pct_change_24h,AbsChange
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2019-01-01,1546300800,9,13,9,13,11,9,11,10,3.4,4,50,2.354384,6.85,2.33,Positive,-0.033061,-0.226363,-2.433464,0.226363
2019-01-01,1546300800,9,13,9,13,11,9,11,10,4.0,3,30,-0.339542,3.34,1.34,Slightly Positive,-0.033061,-0.226363,-2.433464,0.226363
2019-01-01,1546300800,9,20,9,13,11,9,20,1,3.2,10,40,1.734061,8.73,2.79,Positive,-0.033061,-0.226363,-2.433464,0.226363
2019-01-01,1546300800,9,20,9,13,11,9,20,10,-2.0,3,21,-4.170072,6.72,-1.34,Slightly Negative,-0.033061,-0.226363,-2.433464,0.226363
2019-01-01,1546300800,9,20,9,13,11,9,20,10,1.9,6,18,1.415701,5.32,1.01,Slightly Positive,-0.033061,-0.226363,-2.433464,0.226363


In [9]:
# Display the dtypes of merged_0q_encoded after label encoding and re-indexing.
merged_0q_encoded.dtypes

Timestamp             int64
Actor1Country         int64
Actor1GeoCountry      int64
Actor1Type            int64
Actor2Country         int64
Actor2GeoCountry      int64
Actor2Type            int64
ActionCountry         int64
EventType             int64
GoldsteinScale      float64
NumSources            int64
NumArticles           int64
AvgTone             float64
Magnitude           float64
Impact              float64
Impact_bin           object
pct_change_15min    float64
pct_change_30min    float64
pct_change_24h      float64
AbsChange           float64
dtype: object

In [10]:
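# Convert bool columns to int. Currently not needed: the dtypes shown above for
# merged_0q_encoded contain no bool columns.
# NOTE: the snippet below refers to `merged_encoded`, a name that is not defined
# in the cells above; adapt it to the merged_*_encoded frames before enabling it.
# bool_columns = merged_encoded.select_dtypes(include='bool').columns
# merged_encoded[bool_columns] = merged_encoded[bool_columns].astype(int)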

In [11]:
# Write each encoded frame to its own parquet file in the cache directory.
for encoded_frame, parquet_path in (
    (merged_0q_encoded, "../cache/encoded_0q.parquet"),
    (merged_50q_encoded, "../cache/encoded_50q.parquet"),
    (merged_90q_encoded, "../cache/encoded_90q.parquet"),
    (merged_99q_encoded, "../cache/encoded_99q.parquet"),
):
    encoded_frame.to_parquet(parquet_path)

In [12]:
# Standard-scale the numerical columns of the encoded frames.
from sklearn.preprocessing import StandardScaler

# Exclude the pct_change columns; they are left in their original units.
columns_to_exclude = ['pct_change_15min', 'pct_change_30min', 'pct_change_24h']
numerical_columns_to_scale = [col for col in numerical_columns if col not in columns_to_exclude]
# NOTE(review): 'AbsChange' and 'Timestamp' are still scaled. In the preview
# rows 'AbsChange' equals |pct_change_30min| - confirm it should be scaled
# while the pct_change_* columns are not.

# Fit the scaler on the combined data so all four frames share one mean/std.
scaler = StandardScaler()
scaler.fit(combined_data[numerical_columns_to_scale])

# Always transform the *raw* merged_* values and write the result into the
# encoded frames. The encoded frames are copies of the raw ones with the same
# row order (only the categorical columns and the index were changed), so the
# result is identical on the first run. Reading the input from the frame being
# overwritten would scale already-scaled values whenever this cell is re-run.
for raw_frame, encoded_frame in (
    (merged_0q, merged_0q_encoded),
    (merged_50q, merged_50q_encoded),
    (merged_90q, merged_90q_encoded),
    (merged_99q, merged_99q_encoded),
):
    # transform() returns a plain array that is assigned positionally, so the
    # two frames must have the same number of rows.
    assert len(raw_frame) == len(encoded_frame), "raw and encoded frames are out of sync"
    encoded_frame[numerical_columns_to_scale] = scaler.transform(raw_frame[numerical_columns_to_scale])

In [14]:
# Write the frames to the "*_scaled" parquet files. The merged_*_encoded
# variables are saved as they are at this point in the notebook, i.e. after the
# scaling cell has overwritten their numerical columns.
for scaled_frame, parquet_path in (
    (merged_0q_encoded, "../cache/encoded_0q_scaled.parquet"),
    (merged_50q_encoded, "../cache/encoded_50q_scaled.parquet"),
    (merged_90q_encoded, "../cache/encoded_90q_scaled.parquet"),
    (merged_99q_encoded, "../cache/encoded_99q_scaled.parquet"),
):
    scaled_frame.to_parquet(parquet_path)