In [12]:
import pandas as pd
from pyexpat import features

data = pd.read_parquet('../cache/prepd_99q.parquet')
data = data.resample('15min').ffill()
data.head()

Unnamed: 0_level_0,AvgTone,GoldsteinScale,NumSources,NumArticles,Actor1Country_enc,Actor1GeoCountry_enc,Actor1Type_enc,Actor2Country_enc,Actor2GeoCountry_enc,Actor2Type_enc,ActionCountry_enc,EventType_enc,pct_change_30min
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2019-01-01 00:00:00,-0.175964,1.817205,0.521607,-0.24779,0.409021,1.086753,-0.032773,2.829239,-1.613906,0.050958,-0.726077,1.55327,-0.226363
2019-01-01 00:15:00,0.349194,-0.278203,-0.074356,-0.121287,0.409021,0.440374,-0.032773,0.106576,0.048803,0.050958,0.401003,0.144686,-2.57856
2019-01-01 00:30:00,-0.290151,0.245649,4.097383,4.306286,-2.322466,-0.529194,-0.032773,0.106576,0.60304,0.050958,1.15239,0.144686,-1.509301
2019-01-01 00:45:00,3.491408,0.66473,-0.670319,-0.374292,0.409021,1.086753,-0.032773,0.106576,0.60304,0.050958,1.15239,0.144686,0.781614
2019-01-01 01:00:00,-0.094624,-0.627438,-0.074356,-0.121287,-1.639595,-0.852384,-0.032773,0.106576,-1.613906,0.050958,-0.726077,0.614214,-0.291943


In [13]:
data.dtypes

AvgTone                 float64
GoldsteinScale          float64
NumSources              float64
NumArticles             float64
Actor1Country_enc       float64
Actor1GeoCountry_enc    float64
Actor1Type_enc          float64
Actor2Country_enc       float64
Actor2GeoCountry_enc    float64
Actor2Type_enc          float64
ActionCountry_enc       float64
EventType_enc           float64
pct_change_30min        float64
dtype: object

In [14]:
# Split the data into train and test sets
train_data = data[data.index.year < 2023]
test_data = data[data.index.year == 2023]

print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)
smol_sample = train_data.tail(1000)
sample = train_data.tail(10000)
big_sample = train_data.tail(100000)
sample.head()

Train data shape: (140256, 13)
Test data shape: (35040, 13)


Unnamed: 0_level_0,AvgTone,GoldsteinScale,NumSources,NumArticles,Actor1Country_enc,Actor1GeoCountry_enc,Actor1Type_enc,Actor2Country_enc,Actor2GeoCountry_enc,Actor2Type_enc,ActionCountry_enc,EventType_enc,pct_change_30min
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2022-09-18 20:00:00,-1.012547,0.402804,-0.074356,-0.121287,0.409021,1.086753,-0.032773,3.736793,0.60304,0.050958,1.15239,0.144686,-0.469389
2022-09-18 20:15:00,-0.766602,0.402804,-0.074356,-0.374292,0.409021,0.440374,-0.032773,0.106576,0.048803,0.050958,0.401003,0.144686,-0.035495
2022-09-18 20:30:00,-1.290785,-0.697285,-0.074356,-0.500794,0.409021,1.086753,-0.032773,0.106576,0.60304,0.050958,1.15239,1.083742,0.116633
2022-09-18 20:45:00,0.746363,-1.587833,-0.074356,-0.24779,0.409021,-1.498763,-0.032773,0.106576,0.60304,0.050958,-1.477464,0.144686,0.076088
2022-09-18 21:00:00,-0.460403,0.071031,-0.670319,3.35752,1.774765,1.086753,-0.032773,0.106576,0.60304,0.050958,1.15239,0.144686,-0.248189


In [15]:
# cache the train and test data for later use
train_data.to_parquet('../cache/train_data.parquet')
test_data.to_parquet('../cache/test_data.parquet')

In [16]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Quantifies multicollinearity
vif_data = pd.DataFrame()
vif_data["Variable"] = sample.columns
vif_data["VIF"] = [variance_inflation_factor(sample.values, i) for i in range(sample.shape[1])]
print(vif_data)

                Variable       VIF
0                AvgTone  1.164019
1         GoldsteinScale  1.144132
2             NumSources  1.273828
3            NumArticles  1.265210
4      Actor1Country_enc  1.091859
5   Actor1GeoCountry_enc  4.401708
6         Actor1Type_enc  1.000840
7      Actor2Country_enc  1.111933
8   Actor2GeoCountry_enc  1.253484
9         Actor2Type_enc  1.004233
10     ActionCountry_enc  4.880076
11         EventType_enc  1.011846
12      pct_change_30min  1.002047


### Vector Autoregression (VAR)

In [17]:
import statsmodels.api as sm
from statsmodels.tsa.api import VAR

model = VAR(big_sample)
var_results = model.fit(maxlags=15, ic='aic') # Fit with automatic lag order selection based on AIC
lag_order = var_results.k_ar
predictions = var_results.forecast(sample.values[-lag_order:], steps=5) # Forecast 5 steps ahead
print(predictions)

[[ 7.41276774e-02 -1.28082025e-02 -1.82844947e-02  2.18280824e-02
   1.55628355e-01  3.56357373e-01 -1.37551310e-01  3.68132543e-03
   3.14583855e-01 -4.76756085e-02  3.56971269e-01 -5.07486629e-02
   1.75447365e-02]
 [ 1.11402406e-01 -1.26989482e-02 -4.10293887e-02 -6.47403090e-02
   1.59366360e-01  3.20230539e-01 -7.09453096e-02  6.95373366e-02
   2.72969199e-01 -3.63998640e-02  2.79002035e-01 -7.01857964e-02
   7.24395133e-03]
 [ 1.02714844e-01  5.18051277e-02 -1.10914021e-02  2.77763788e-02
   8.13815984e-02  3.22290257e-01 -1.30063555e-01  4.07325564e-02
   2.60140545e-01  3.98114605e-02  3.11820307e-01  4.69552385e-02
  -2.92660711e-02]
 [ 8.32531396e-02  6.90739548e-02  2.48977013e-02  2.85212722e-02
   1.06687009e-01  2.87702993e-01 -5.87724141e-02  8.75276336e-02
   3.00819126e-01  2.23587379e-03  2.81302166e-01  2.03562420e-02
  -3.91126960e-03]
 [ 6.97706102e-02  7.25501045e-03 -2.47467244e-05  7.03147679e-04
   1.82068482e-01  2.91954456e-01 -1.48702277e-01 -2.68663774e-02


### dynamic factor model (DFM)

In [18]:
from statsmodels.tsa.statespace.dynamic_factor import DynamicFactor

# Fit a dynamic factor model
model = DynamicFactor(sample, k_factors=1, factor_order=1)
dfm_results = model.fit()
print(dfm_results.summary())

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           27     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.08964D+01    |proj g|=  4.53590D-01


 This problem is unconstrained.



At iterate    5    f=  2.03349D+01    |proj g|=  9.33671D-02

At iterate   10    f=  2.03301D+01    |proj g|=  9.80478D-03

At iterate   15    f=  2.03293D+01    |proj g|=  1.54732D-02

At iterate   20    f=  2.03207D+01    |proj g|=  2.72641D-01

At iterate   25    f=  2.02725D+01    |proj g|=  2.14405D-01

At iterate   30    f=  2.00167D+01    |proj g|=  1.84424D+00

At iterate   35    f=  1.96636D+01    |proj g|=  1.23606D-01

At iterate   40    f=  1.96444D+01    |proj g|=  3.35091D-02

At iterate   45    f=  1.96349D+01    |proj g|=  8.99590D-02





At iterate   50    f=  1.96269D+01    |proj g|=  6.41706D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
   27     50     62      1     0     0   6.417D-02   1.963D+01
  F =   19.626856861096783     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
                                                                                                                                           Statespace Model Results                                                                                                                                           
Dep. Variable:     ['AvgTone', 'GoldsteinScale', 'NumSourc

In [19]:
forecast_steps = 5
forecast = dfm_results.get_forecast(steps=forecast_steps)
# Extract the predicted values from the forecast result
predicted_values = forecast.predicted_mean
# Or for the full prediction including uncertainty (confidence intervals)
prediction_conf_int = forecast.conf_int()

# Display the predicted values and the confidence intervals
print("Predicted Values:")
print(predicted_values)

print("\nConfidence Intervals for Predictions:")
print(prediction_conf_int)

Predicted Values:
                          AvgTone  GoldsteinScale    NumSources   NumArticles  \
2023-01-01 00:00:00  2.481199e-03    7.726475e-03 -1.292273e-02 -1.151570e-02   
2023-01-01 00:15:00  1.484251e-04    4.621969e-04 -7.730362e-04 -6.888681e-04   
2023-01-01 00:30:00  8.878773e-06    2.764857e-05 -4.624295e-05 -4.120802e-05   
2023-01-01 00:45:00  5.311273e-07    1.653935e-06 -2.766248e-06 -2.465059e-06   
2023-01-01 01:00:00  3.177198e-08    9.893820e-08 -1.654767e-07 -1.474596e-07   

                     Actor1Country_enc  Actor1GeoCountry_enc  Actor1Type_enc  \
2023-01-01 00:00:00      -8.280243e-03          6.867778e-02   -2.164264e-04   
2023-01-01 00:15:00      -4.953233e-04          4.108298e-03   -1.294660e-05   
2023-01-01 00:30:00      -2.963019e-05          2.457579e-04   -7.744645e-07   
2023-01-01 00:45:00      -1.772475e-06          1.470121e-05   -4.632838e-08   
2023-01-01 01:00:00      -1.060293e-07          8.794251e-07   -2.771359e-09   

              

### Vector Autoregression Moving-Average (VARMA)

In [None]:
from statsmodels.tsa.statespace.varmax import VARMAX

model = VARMAX(smol_sample, order=(1, 1))  # (p, q) - autoregressive and moving average orders
varmax_results = model.fit(disp=False)
predictions = varmax_results.forecast(steps=5)
print(predictions)

  warn('Estimation of VARMA(p,q) models is not generically robust,'


In [10]:
# cache the models for later use
import joblib
joblib.dump(var_results, '../cache/var_model.joblib')
joblib.dump(dfm_results, '../cache/dfm_model.joblib')
joblib.dump(varmax_results, '../cache/varmax_model.joblib')

['../cache/varmax_model.joblib']

In [11]:
# purely for testing
# Load the models from cache
var_results_loaded = joblib.load('../cache/var_model.joblib')
print("done")
var_results_loaded.forecast(sample.values[var_results_loaded.k_ar:], steps=5)

done


array([[ 7.41276774e-02, -1.28082025e-02, -1.82844947e-02,
         2.18280824e-02,  1.55628355e-01,  3.56357373e-01,
        -1.37551310e-01,  3.68132543e-03,  3.14583855e-01,
        -4.76756085e-02,  3.56971269e-01, -5.07486629e-02,
         1.75447365e-02],
       [ 1.11402406e-01, -1.26989482e-02, -4.10293887e-02,
        -6.47403090e-02,  1.59366360e-01,  3.20230539e-01,
        -7.09453096e-02,  6.95373366e-02,  2.72969199e-01,
        -3.63998640e-02,  2.79002035e-01, -7.01857964e-02,
         7.24395133e-03],
       [ 1.02714844e-01,  5.18051277e-02, -1.10914021e-02,
         2.77763788e-02,  8.13815984e-02,  3.22290257e-01,
        -1.30063555e-01,  4.07325564e-02,  2.60140545e-01,
         3.98114605e-02,  3.11820307e-01,  4.69552385e-02,
        -2.92660711e-02],
       [ 8.32531396e-02,  6.90739548e-02,  2.48977013e-02,
         2.85212722e-02,  1.06687009e-01,  2.87702993e-01,
        -5.87724141e-02,  8.75276336e-02,  3.00819126e-01,
         2.23587379e-03,  2.81302166e