In [47]:
import pandas as pd

# Load the four cached merged frames; map() is consumed in order by the
# tuple unpacking, so each name is bound to the file with the matching suffix.
merged_0q, merged_50q, merged_90q, merged_99q = map(
    pd.read_parquet,
    (
        '../cache/merged_0q.parquet',
        '../cache/merged_50q.parquet',
        '../cache/merged_90q.parquet',
        '../cache/merged_99q.parquet',
    ),
)

In [48]:
# Show the dtype of every column in merged_0q.
merged_0q.dtypes

Date                datetime64[ns]
Timestamp                    int64
Actor1Country               object
Actor1GeoCountry            object
Actor1Type                  object
Actor2Country               object
Actor2GeoCountry            object
Actor2Type                  object
ActionCountry               object
EventType                 category
GoldsteinScale             float64
NumSources                   int64
NumArticles                  int64
AvgTone                    float64
Magnitude                  float64
Impact                     float64
Impact_bin                  object
pct_change_15min           float64
pct_change_30min           float64
pct_change_24h             float64
AbsChange                  float64
dtype: object

In [49]:
# List the column labels of merged_0q.
merged_0q.columns

Index(['Date', 'Timestamp', 'Actor1Country', 'Actor1GeoCountry', 'Actor1Type',
       'Actor2Country', 'Actor2GeoCountry', 'Actor2Type', 'ActionCountry',
       'EventType', 'GoldsteinScale', 'NumSources', 'NumArticles', 'AvgTone',
       'Magnitude', 'Impact', 'Impact_bin', 'pct_change_15min',
       'pct_change_30min', 'pct_change_24h', 'AbsChange'],
      dtype='object')

In [50]:
# Collect the int64/float64 columns of merged_0q and print how many distinct
# values each one holds.
numeric_view = merged_0q.select_dtypes(include=['int64', 'float64'])
numerical_columns = list(numeric_view.columns)
for col in numerical_columns:
    distinct_count = numeric_view[col].nunique()
    print(col, distinct_count)

Timestamp 171801
GoldsteinScale 42
NumSources 30
NumArticles 203
AvgTone 136291
Magnitude 2265
Impact 3408
pct_change_15min 169341
pct_change_30min 169919
pct_change_24h 171440
AbsChange 168911


In [51]:
# Collect the object/category columns of merged_0q and print the number of
# distinct values in each.
categorical_view = merged_0q.select_dtypes(include=['object', 'category'])
categorical_columns = list(categorical_view.columns)
for col in categorical_columns:
    distinct_count = categorical_view[col].nunique()
    print(col, distinct_count)

Actor1Country 9
Actor1GeoCountry 9
Actor1Type 8
Actor2Country 9
Actor2GeoCountry 9
Actor2Type 8
ActionCountry 9
EventType 9
Impact_bin 7


In [52]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
def prep_data(df,
              catg_cols=('Actor1Country', 'Actor1GeoCountry', 'Actor1Type', 'Actor2Country', 'Actor2GeoCountry', 'Actor2Type', 'ActionCountry', 'EventType'),
              num_cols=('AvgTone', 'GoldsteinScale', 'NumSources', 'NumArticles')):
    """Label-encode categorical columns and standardize the feature columns.

    The work is done on a copy of ``df``: the frame passed in by the caller is
    never modified, so the function can be called repeatedly on the same input
    and always yields the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain every column named in ``catg_cols`` and
        ``num_cols``.
    catg_cols : sequence of str
        Columns to label-encode. For each one a new ``<name>_enc`` column is
        added; the original column is kept as is.
    num_cols : sequence of str
        Numeric columns to standardize (their values are replaced in the
        returned copy).

    Returns
    -------
    df : pandas.DataFrame
        Copy of the input with the ``*_enc`` columns added and all feature
        columns standardized.
    encoders : dict
        Maps each categorical column name to its fitted ``LabelEncoder``.
    scaler : StandardScaler
        Scaler fitted on ``num_cols`` followed by the ``*_enc`` columns, in
        that order.
    num_cols : list of str
        Names of the scaled feature columns (``num_cols`` + ``*_enc``), in the
        order the scaler saw them.
    """
    # Copy first: adding/overwriting columns below must not leak into the
    # caller's frame (which would make notebook cells order-dependent).
    df = df.copy()

    encoders = {}
    for catg_col in catg_cols:
        encoders[catg_col] = LabelEncoder()
        df[f'{catg_col}_enc'] = encoders[catg_col].fit_transform(df[catg_col])

    # The integer label codes are scaled together with the numeric columns.
    # NOTE(review): this treats the label codes as ordinary numeric values;
    # confirm that is intended for the downstream model.
    num_cols = list(num_cols) + [f'{feature}_enc' for feature in catg_cols]

    # NOTE(review): the scaler is fitted on the whole frame; if a train/test
    # split happens later, confirm that fitting before the split is acceptable.
    scaler = StandardScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])

    return df, encoders, scaler, num_cols

In [53]:
# Run prep_data on each of the four merged frames, then unpack every
# (frame, encoders, scaler, feature-column list) result into its own names.
prepped = [
    prep_data(frame)
    for frame in (merged_0q, merged_50q, merged_90q, merged_99q)
]
(
    (df_0q_encoded, encoders_0q, scaler_0q, num_cols_0q),
    (df_50q_encoded, encoders_50q, scaler_50q, num_cols_50q),
    (df_90q_encoded, encoders_90q, scaler_90q, num_cols_90q),
    (df_99q_encoded, encoders_99q, scaler_99q, num_cols_99q),
) = prepped

In [54]:
# Preview the first rows of the frame returned by prep_data for merged_0q.
df_0q_encoded.head()

Unnamed: 0,Date,Timestamp,Actor1Country,Actor1GeoCountry,Actor1Type,Actor2Country,Actor2GeoCountry,Actor2Type,ActionCountry,EventType,...,pct_change_24h,AbsChange,Actor1Country_enc,Actor1GeoCountry_enc,Actor1Type_enc,Actor2Country_enc,Actor2GeoCountry_enc,Actor2Type_enc,ActionCountry_enc,EventType_enc
0,2019-01-01 00:00:00,1546300800,Other,AS,MIL,Other,AS,Other,AS,Other,...,-2.433464,0.226363,0.127864,-1.4341,-1.578934,0.164392,-1.784813,0.190843,-1.409427,0.795094
1,2019-01-01 00:15:00,1546301700,Other,US,Other,Other,Other,Other,US,Fight,...,-1.891779,2.57856,0.127864,0.969019,0.218327,0.164392,0.209139,0.190843,0.956949,-0.027327
2,2019-01-01 00:30:00,1546302600,Other,US,Other,Other,Other,Other,US,Fight,...,-1.775994,1.509301,0.127864,0.969019,0.218327,0.164392,0.209139,0.190843,0.956949,-0.027327
3,2019-01-01 00:45:00,1546303500,Other,AJ,Other,Other,Other,Other,AJ,Other,...,0.895776,0.781614,0.127864,-1.83462,0.218327,0.164392,0.209139,0.190843,-1.803824,0.795094
4,2019-01-01 01:00:00,1546304400,Other,PE,UAF,Other,PE,Other,PE,Fight,...,0.356518,0.291943,0.127864,-0.23254,2.914218,0.164392,0.87379,0.190843,-0.226239,-0.027327


In [55]:
# Index each encoded frame by Date and keep only the scaled feature columns
# plus pct_change_30min.
# set_index is called WITHOUT inplace=True: the *_encoded frames stay
# untouched, so this cell can be re-run (an in-place set_index would raise
# KeyError on 'Date' the second time) and earlier previews stay accurate.
encoded_frames = [df_0q_encoded, df_50q_encoded, df_90q_encoded, df_99q_encoded]
num_cols = [num_cols_0q, num_cols_50q, num_cols_90q, num_cols_99q]
dfs = [
    frame.set_index('Date')[cols + ['pct_change_30min']]
    for frame, cols in zip(encoded_frames, num_cols)
]
df_0q, df_50q, df_90q, df_99q = dfs

In [56]:
# Preview the Date-indexed frame that holds only the feature columns and
# pct_change_30min.
df_0q.head()

Unnamed: 0_level_0,AvgTone,GoldsteinScale,NumSources,NumArticles,Actor1Country_enc,Actor1GeoCountry_enc,Actor1Type_enc,Actor2Country_enc,Actor2GeoCountry_enc,Actor2Type_enc,ActionCountry_enc,EventType_enc,pct_change_30min
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2019-01-01 00:00:00,-1.152286,-0.771788,-0.184615,-0.113108,0.127864,-1.4341,-1.578934,0.164392,-1.784813,0.190843,-1.409427,0.795094,-0.226363
2019-01-01 00:15:00,0.716626,-1.277835,3.176668,3.491856,0.127864,0.969019,0.218327,0.164392,0.209139,0.190843,0.956949,-0.027327,-2.57856
2019-01-01 00:30:00,-0.275271,-1.277835,4.297096,2.290202,0.127864,0.969019,0.218327,0.164392,0.209139,0.190843,0.956949,-0.027327,-1.509301
2019-01-01 00:45:00,3.460546,0.982507,-0.744829,-0.353439,0.127864,-1.83462,0.218327,0.164392,0.209139,0.190843,-1.803824,0.795094,0.781614
2019-01-01 01:00:00,0.350195,-1.277835,-0.184615,-0.113108,0.127864,-0.23254,2.914218,0.164392,0.87379,0.190843,-0.226239,-0.027327,-0.291943


In [57]:
# convert bool columns to int (currently not needed)
# bool_columns = merged_encoded.select_dtypes(include='bool').columns
# merged_encoded[bool_columns] = merged_encoded[bool_columns].astype(int)

In [58]:
# Persist each prepared frame to its parquet file in the cache directory.
prepared_outputs = {
    '../cache/prepd_0q.parquet': df_0q,
    '../cache/prepd_50q.parquet': df_50q,
    '../cache/prepd_90q.parquet': df_90q,
    '../cache/prepd_99q.parquet': df_99q,
}
for out_path, frame in prepared_outputs.items():
    frame.to_parquet(out_path)