In [None]:
We create tabular data based on some feature extracted from the observations
and fit a simple classifier to it (Random Forest). Using 1000 paths it seems do distinguish the following different
pairs of volatility coeff:
    sigma1 = 0.1, sigma2 = 0.8 with accuracy 0.96
    sigma1 = 0.2, sigma2 = 0.7 with accuray 0.89
    sigma1 = 0.3, sigma2 = 0.6 with accuracy 0.78
    sigma1 = 0.4, sigma2 = 0.5 with accuracy 0.45
    sigma1 = 0.3, sigma2 = 0.5 with accuracy 0.68
Has to be runned in the NJODE foder.

In [168]:
import torch
import tqdm
import numpy as np
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import os, sys
import pandas as pd
import json
import time
import socket
import matplotlib
import matplotlib.colors
from torch.backends import cudnn
import gc
sys.path.append("../")
try:
    from . import models as models
    from . import data_utils as data_utils
    from ..GRU_ODE_Bayes import models_gru_ode_bayes as models_gru_ode_bayes
except Exception:
    import NJODE.models as models
    import NJODE.data_utils as data_utils
    import GRU_ODE_Bayes.models_gru_ode_bayes as models_gru_ode_bayes
import matplotlib.pyplot as plt
import stock_model as stock_model
from tqdm import tqdm

In [169]:
## Create mixed Dataset
#change the values of volatility for tests
stock_model_names = ('BlackScholes', 'BlackScholes')
hyperparam_vol_1 = {
    'drift': 2., 'volatility': 0.3, 'mean': 4,
    'speed': 2., 'correlation': 0.5, 'nb_paths': 1000, 'nb_steps': 100,
    'S0': 1, 'maturity': 1., 'dimension': 1, 
    'obs_perc': 0.1,
    'scheme': 'euler', 'return_vol': False, 'v0': 1,
}
hyperparam_vol_2 = {
    'drift': 2., 'volatility': 0.5, 'mean': 4,
    'speed': 2., 'correlation': 0.5, 'nb_paths': 1000, 'nb_steps': 100,
    'S0': 1, 'maturity': 1., 'dimension': 1, 
    'obs_perc': 0.1,
    'scheme': 'euler', 'return_vol': False, 'v0': 1,
}
path1, time_id1 = data_utils.create_dataset('BlackScholes',hyperparam_vol_1)
path2, time_id2 = data_utils.create_dataset('BlackScholes',hyperparam_vol_2)

In [170]:
dataset1 = 'BlackScholes'
dataset2 = 'BlackScholes'

data1 = data_utils.IrregularDataset(model_name=dataset1, time_id=time_id1)
data2 = data_utils.IrregularDataset(model_name=dataset2, time_id=time_id2)

dl1 = DataLoader(dataset = data1, collate_fn = data_utils.custom_collate_fn,shuffle  = False, batch_size = 1, num_workers = 1)
dl2 = DataLoader(dataset = data2, collate_fn = data_utils.custom_collate_fn,shuffle  = False, batch_size = 1, num_workers = 1)


In [171]:
#Iteration with dataloader is not needed, it would probably be faster to work with the whole dataset

data_tab = np.zeros((2*hyperparam_vol_1['nb_paths'],8))
for j,batch in enumerate(dl1):
        
        #plot one trajectory
        #plt.figure(0)
        #plt.plot(np.linspace(0,1,101),batch['true_paths'][0,0,:])
        
        
        #values of observed data
        X = batch['X']
        
        #1 if we observed the data at this point in time 0 o.w.
        observed_dates = batch['observed_dates'][0]
        
        #grid
        path_t_true_X = np.linspace(0,1,101)

        ## Add tabular features
        #index of the time point were we observe data
        ind = np.nonzero(observed_dates)
        
        #avoid cases with only 1 or 0 observation
        if not(X.size()[0]<=1):
            #jumps between the observation
            jumps = np.diff(np.squeeze(X))
            
            #index for the largest jump
            max_ind = np.where(jumps==max(jumps))[0]
            
            #difference in time for the largest jump
            max_time_diff  = path_t_true_X[ind][max_ind]-path_t_true_X[ind][max_ind-1]

            #same as above but for the smallest jump
            min_ind = np.where(jumps == min(jumps))[0]
            min_time_diff = path_t_true_X[ind][min_ind]-path_t_true_X[ind][min_ind-1]
            
            
            mean_jump = np.mean(jumps)
            
            mean_time_diff = np.mean(path_t_true_X[ind])
            
            approx_qv = np.sum(np.square(jumps))
            
            #add the feature generated above + rescale max and min with time diff (Large jump in short time may be more important than large jump in large time)
            data_tab[j,:] = [max(jumps),max(jumps)/max_time_diff,min(jumps),min(jumps)/min_time_diff,mean_jump,mean_time_diff,approx_qv,0]
        else:
            data_tab[j,:] = [0,0,0,0,0,0,0,0]
        

#as above but for dataset with volatility 2
for j,batch in enumerate(dl2):
        #plt.figure(1)
        #plt.plot(np.linspace(0,1,101),batch['true_paths'][0,0,:])
        X = batch['X']
        observed_dates = batch['observed_dates'][0]
        path_t_true_X = np.linspace(0,1,101)

        ## Add tabular features
        ind = np.nonzero(observed_dates)
        if not(X.size()[0]<=1):

            jumps = np.diff(np.squeeze(X))

            max_ind = np.where(jumps==max(jumps))[0]
            max_time_diff  = path_t_true_X[ind][max_ind]-path_t_true_X[ind][max_ind-1]


            min_ind = np.where(jumps == min(jumps))[0]
            min_time_diff = path_t_true_X[ind][min_ind]-path_t_true_X[ind][min_ind-1]

            mean_jump = np.mean(jumps)
            mean_time_diff = np.mean(path_t_true_X[ind])
            approx_qv = np.sum(np.square(jumps))

            data_tab[1000+j,:] = [max(jumps),max(jumps)/max_time_diff,min(jumps),min(jumps)/min_time_diff,mean_jump,mean_time_diff,approx_qv,1]
        else:
            data_tab[1000+j,:] = [0,0,0,0,0,0,0,0]
            
data_tab_df =  pd.DataFrame(data_tab)
data_tab_df.to_csv('./data_tab.csv')

In [172]:
from sklearn.ensemble import RandomForestClassifier
## Fit a simple classifier to the above data
X_train, X_test, y_train, y_test = train_test_split(data_tab[:,0:7],data_tab[:,7],test_size = 0.3,shuffle = True)
clf = RandomForestClassifier()

clf.fit(X_train,y_train)
accuracy = np.mean(clf.predict(X_test)==y_test)

print(accuracy)


0.68
