In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import statsmodels.api as sm

from statsmodels.regression.linear_model import OLS

In [17]:
def df_OLS_anal(df: pd.DataFrame, mode = 0):
    if(mode == 0):
        deltaA = np.array(df["AskAfter"] - df["AskBefore"])[1:-1]
    else:
        deltaA = np.array(df["AskAfterMany"] - df["AskBeforeMany"])[1:-1]

    deltat = np.array(np.diff(df["Time"]))[1:]
    x = np.array(df["Volume"][1:-1])

    dA = np.array(deltaA / deltat)
    dx = np.array(x / deltat)
    ddx = np.array(np.diff(dx))
    ddA = np.array(np.diff(dA))

    OLSdf = pd.DataFrame({
        "SUM": ddA,
        "-rho": deltaA[:-1],
        "-rho kappa": x[1:],
        "rho (lambda + kappa)": x[:-1],
        "kappa + lambda": ddx
    })

    Regressand = OLSdf["SUM"]

    Regressors = OLSdf[["-rho", "-rho kappa", "rho (lambda + kappa)", "kappa + lambda"]]


    # Regressors = sm.add_constant(Regressors, prepend=False)

    model = OLS(Regressand, Regressors)

    res = model.fit()

    print(res.summary()) 

    rho = - res.params.iloc[0]
    kappa = - res.params.iloc[1] / rho
    lamb = res.params.iloc[2] / rho - kappa
    # lamb2 = res.params.iloc[3] - kappa
    

    print("rho = ", rho, "\nlambda = ", lamb, "\nkappa = ", kappa, "\n \n", end="")

    return rho, kappa, lamb


In [4]:
def df_OLS_anal2(df: pd.DataFrame, mode = 0):
    if(mode == 0):
        deltaA = np.array(df["AskAfter"] - df["AskBefore"])[1:-1]
    else:
        deltaA = np.array(df["AskAfterMany"] - df["AskBeforeMany"])[1:-1]
        
    deltat = np.array(np.diff(df["Time"]))[1:]
    x = np.array(df["Volume"][1:-1])

    dA = np.array(deltaA / deltat)
    dx = np.array(x / deltat)
    ddx = np.array(np.diff(dx))
    ddA = np.array(np.diff(dA))

    OLSdf = pd.DataFrame({
        "SUM": ddA,
        "-rho": deltaA[:-1],
        "-rho kappa": np.diff(x),
        "rho lambda": x[:-1],
        "kappa + lambda": ddx
    })

    Regressand = OLSdf["SUM"]

    Regressors = OLSdf[["-rho", "-rho kappa", "rho lambda", "kappa + lambda"]]


    # Regressors = sm.add_constant(Regressors, prepend=False)

    model = OLS(Regressand, Regressors)

    res = model.fit()

    print(res.summary()) 

    rho = - res.params.iloc[0]
    kappa = - res.params.iloc[1] / rho
    lamb = res.params.iloc[2] / rho - kappa
    # lamb2 = res.params.iloc[3] - kappa
    

    print("rho = ", rho, "\nlambda = ", lamb, "\nkappa = ", kappa, "\n \n", end="")

    return rho, kappa, lamb

In [5]:
def df_OLS_anal3(df: pd.DataFrame, mode = 0):
    if(mode == 0):
        deltaA = np.array(df["AskAfter"] - df["AskBefore"])[1:-1]
    else:
        deltaA = np.array(df["AskAfterMany"] - df["AskBeforeMany"])[1:-1]
        
    deltat = np.array(np.diff(df["Time"]))[1:]
    x = np.array(df["Volume"][1:-1])

    dA = np.array(deltaA / deltat)
    dx = np.array(x / deltat)
    ddx = np.array(np.diff(dx))
    ddA = np.array(np.diff(dA))

    dummy_first = np.array((df["Time"] < 3600)[1:-2], dtype=float)
    dummy_last = np.array((df["Time"] > 27500)[1:-2], dtype=float)

    OLSdf = pd.DataFrame({
        "SUM": ddA,
        "-rho": deltaA[:-1],
        "-rho kappa": x[1:],
        "rho (lambda + kappa)": x[:-1],
        "kappa + lambda": ddx,
        "first hour": dummy_first,
        "last hour": dummy_last
    })

    Regressand = OLSdf["SUM"]

    Regressors = OLSdf[["-rho", "-rho kappa", "rho (lambda + kappa)", "kappa + lambda","first hour", "last hour" ]]


    # Regressors = sm.add_constant(Regressors, prepend=False)

    model = OLS(Regressand, Regressors)

    res = model.fit()

    print(res.summary()) 

    rho = - res.params.iloc[0]
    kappa = - res.params.iloc[1] / rho
    lamb = res.params.iloc[2] / rho - kappa
    # lamb2 = res.params.iloc[3] - kappa
    

    print("rho = ", rho, "\nlambda = ", lamb, "\nkappa = ", kappa, "\n \n", end="")

    return rho, kappa, lamb


In [20]:
directory = "data/CU/03.03.2021/"
 
files = os.listdir(directory)

listQuadras = []

for file in files:
    if(file.endswith(".csv")):
        print("data/"+file)
        df = pd.read_csv(directory + file)
        total = len(df)
        lil = len(df[df["AskAfter"] - df["AskBefore"] <=0 ])
        me = np.mean(df["Volume"])
        spikes = len(df[((df["AskAfter"] - df["AskBefore"] > 0) & (df["Volume"] < me))])

        rho, kappa, lambd =    df_OLS_anal2(df)
        rho1, kappa1, lambd1 = df_OLS_anal2(df, 1)
                  
        listQuadras.append(np.array(
            [
            file[:-6]
            ,total
            ,lil
            # ,spikes
            # ,me * 1e1
            , lil/total
            , spikes/(total - lil)
            , rho
            , rho1
            ]
            # len(df[df["AskAfter"] - df["AskBefore"] <= 0  & df["Volume"] > 1e5])
            ))

listQuadras.sort(key=lambda x: int(x[1]))
print(listQuadras)

data/USD000TODTOMWP.csv
                                 OLS Regression Results                                
Dep. Variable:                    SUM   R-squared (uncentered):                   0.023
Model:                            OLS   Adj. R-squared (uncentered):              0.016
Method:                 Least Squares   F-statistic:                              3.211
Date:                Sun, 28 Jan 2024   Prob (F-statistic):                      0.0128
Time:                        10:23:32   Log-Likelihood:                         -4129.2
No. Observations:                 543   AIC:                                      8266.
Df Residuals:                     539   BIC:                                      8284.
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                     coef    std err          t      P>|t|      [0.025      0.975]
-------------

In [34]:
# from math import round
for q in listQuadras[::-1]:
    l = np.array([q[0]
                #   ,round(float(q[1] ) * 100)
                #   ,round(float(q[-4]) * 100) 
                #   ,round(float(q[-2]))
                #   ,round(float(q[-1])) 
                  ,float(q[-2])
                  ,float(q[-1])
                     ])
    print(" & ".join(l) +" \\\\ \hline")
    # print(q)

USD000UTSTOM & 7294.457869519454 & 57129.406993912766 \\ \hline
USD000000TOD & 25813.624196538978 & 20010.291587544452 \\ \hline
EUR_RUB__TOM & 2334.3201320044973 & 26246.564284673696 \\ \hline
EUR_RUB__TOD & 1670.5648026890167 & 14550.565149894639 \\ \hline
USD000TODTOM & -1.1241605443859487e-17 & -5.998178049291941e-19 \\ \hline
EURUSD000TOM & 85.32008726723969 & 0.31332458869318036 \\ \hline
GBPRUB_TOM & 10.79246478230363 & 1005.7813182919028 \\ \hline
EUR000TODTOM & 1980.197366721326 & 9463.17366325021 \\ \hline


In [35]:
df = pd.read_csv("data/CU/2020/USD000000TODWP.csv")

In [45]:
df = df[df["AskAfter"] - df["AskBefore"] > 0]

In [46]:
w_len = 100

ms = sum(np.diff(df["Time"])[0:w_len])
j_best = 0

for i in range(len(df) - w_len):
    ns = sum(np.diff(df["Time"])[i:i+w_len])
    if ns < ms:
        j_best = i

print(j_best)

print(ms / w_len)
print(np.min(np.diff(df["Time"])[j_best:j_best+w_len]))

1506
0.5499998674100062
1.0310031939297915e-06


In [47]:
df_OLS_anal2(df[1506:1606])

                                 OLS Regression Results                                
Dep. Variable:                    SUM   R-squared (uncentered):                   0.059
Model:                            OLS   Adj. R-squared (uncentered):              0.018
Method:                 Least Squares   F-statistic:                              1.453
Date:                Sun, 28 Jan 2024   Prob (F-statistic):                       0.223
Time:                        11:34:15   Log-Likelihood:                         -952.89
No. Observations:                  97   AIC:                                      1914.
Df Residuals:                      93   BIC:                                      1924.
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                     coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------

(22133.660634675896, 4.2572417971055905e-08, -5.031723840963348e-08)

In [115]:
for q in listQuadras[::-1]:
    l = np.array((q[0], q[1], q[-1], q[-2]))
    print(" & ".join(l) +" \\\\ \hline")

USD000UTSTOM & 41963 & -1.3454038118541596e-13 & -3.9993134823310335e-13 \\ \hline
USD000000TOD & 13391 & 3.953096901572806e-13 & 878971.6190915223 \\ \hline
EUR_RUB__TOM & 13383 & -1.6103787588652537e-13 & 149030.37989297457 \\ \hline
EUR_RUB__TOD & 4134 & -2.8968832214492e-11 & 100550.84664241684 \\ \hline
USD000TODTOM & 1343 & -3.4403061472022886e-17 & 1.881472333580073e-16 \\ \hline
EURUSD000TOM & 915 & -1.8793893072248073e-13 & -1.1984356617419773e-14 \\ \hline
EUR000TODTOM & 265 & 3.346062887920089e-18 & 225006.73004700168 \\ \hline
GBPRUB_TOM & 234 & 0.318196286111428 & -0.7899878452418242 \\ \hline
CNYRUB_TOM & 167 & 5.040051784091274e-13 & 6.186202700324166e-14 \\ \hline


In [55]:
df = pd.read_csv("data/CU/USD000UTSTOMWP.csv")

In [58]:
dummy_first = np.array((df["Time"] < 3600)[1:-2], dtype=float)
print(dummy_first)

[1. 1. 1. ... 0. 0. 0.]


In [19]:
len(df[df["AskAfter"] - df["AskBefore"] > 0])

2250

In [20]:
len(df)

41963

In [None]:
def df_OLS_anal_AO(df: pd.DataFrame):
    deltaA = np.array(df["AskAfter"] - df["AskBefore"])[1:-1]
    deltat = np.array(np.diff(df["Time"]))[1:]
    x = np.array(df["Volume"][1:-1])

    dA = np.array(deltaA / deltat)
    dx = np.array(x / deltat)
    ddx = np.array(np.diff(dx))
    ddA = np.array(np.diff(dA))

    OLSdf = pd.DataFrame({
        "SUM": ddA,
        "-rho": deltaA[:-1],
        "-rho kappa": x[1:],
        "rho (lambda + kappa)": x[:-1],
        "kappa + lambda": ddx
    })

    Regressand = OLSdf["SUM"]

    Regressors = OLSdf[["-rho", "-rho kappa", "rho (lambda + kappa)", "kappa + lambda"]]

    # Regressors = sm.add_constant(Regressors, prepend=False)

    model = OLS(Regressand, Regressors)

    res = model.fit()

    print(res.summary()) 

    rho = - res.params.iloc[0]
    kappa = - res.params.iloc[1] / rho
    lamb = res.params.iloc[2] / rho - kappa
    # lamb2 = res.params.iloc[3] - kappa
    

    print("rho = ", rho, "\nlambda = ", lamb, "\nkappa = ", kappa, "\n \n", end="")

In [90]:
directory = "data"
 
files = os.listdir(directory)

listQuadras = []

for file in files:
    if(file.endswith(".csv")):
        print("data/"+file)
        df = pd.read_csv("data/"+file)
        df_OLS_anal(df, 0)
        df_OLS_anal(df, 1)


data/CNYRUB_TOMWP.csv
                                 OLS Regression Results                                
Dep. Variable:                    SUM   R-squared (uncentered):                   0.008
Model:                            OLS   Adj. R-squared (uncentered):             -0.017
Method:                 Least Squares   F-statistic:                             0.3130
Date:                Tue, 23 Jan 2024   Prob (F-statistic):                       0.869
Time:                        21:05:38   Log-Likelihood:                         -961.42
No. Observations:                 164   AIC:                                      1931.
Df Residuals:                     160   BIC:                                      1943.
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                           coef    std err          t      P>|t|      [0.025      0.975]
---------

In [17]:
df = pd.read_csv("EUR_RUB__TODWP.csv")
df_OLS_anal(df)

                                 OLS Regression Results                                
Dep. Variable:                    SUM   R-squared (uncentered):                   0.020
Model:                            OLS   Adj. R-squared (uncentered):              0.019
Method:                 Least Squares   F-statistic:                              21.45
Date:                Mon, 22 Jan 2024   Prob (F-statistic):                    1.55e-17
Time:                        16:57:17   Log-Likelihood:                         -36952.
No. Observations:                4131   AIC:                                  7.391e+04
Df Residuals:                    4127   BIC:                                  7.394e+04
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                           coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

In [6]:
df = pd.read_csv("EUR_RUB__TOMWP.csv")
df_OLS_anal(df)

                                 OLS Regression Results                                
Dep. Variable:                    SUM   R-squared (uncentered):                   0.004
Model:                            OLS   Adj. R-squared (uncentered):              0.003
Method:                 Least Squares   F-statistic:                              12.10
Date:                Mon, 22 Jan 2024   Prob (F-statistic):                    8.16e-10
Time:                        09:26:13   Log-Likelihood:                     -1.0975e+05
No. Observations:               13380   AIC:                                  2.195e+05
Df Residuals:                   13376   BIC:                                  2.195e+05
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                           coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

In [7]:
df = pd.read_csv("EUR000TODTOMWP.csv")
df_OLS_anal(df)

                                 OLS Regression Results                                
Dep. Variable:                    SUM   R-squared (uncentered):                   0.682
Model:                            OLS   Adj. R-squared (uncentered):              0.677
Method:                 Least Squares   F-statistic:                              138.6
Date:                Mon, 22 Jan 2024   Prob (F-statistic):                    4.95e-63
Time:                        09:26:32   Log-Likelihood:                         -1927.7
No. Observations:                 262   AIC:                                      3863.
Df Residuals:                     258   BIC:                                      3878.
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                           coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

In [8]:
df = pd.read_csv("GBPRUB_TOMWP.csv")
df_OLS_anal(df)

                                 OLS Regression Results                                
Dep. Variable:                    SUM   R-squared (uncentered):                   0.435
Model:                            OLS   Adj. R-squared (uncentered):              0.425
Method:                 Least Squares   F-statistic:                              43.74
Date:                Mon, 22 Jan 2024   Prob (F-statistic):                    3.43e-27
Time:                        09:27:15   Log-Likelihood:                         -1462.7
No. Observations:                 231   AIC:                                      2933.
Df Residuals:                     227   BIC:                                      2947.
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                           coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

In [9]:
df = pd.read_csv("USD000TODTOMWP.csv")
df_OLS_anal(df)

                                 OLS Regression Results                                
Dep. Variable:                    SUM   R-squared (uncentered):                   0.266
Model:                            OLS   Adj. R-squared (uncentered):              0.263
Method:                 Least Squares   F-statistic:                              120.8
Date:                Mon, 22 Jan 2024   Prob (F-statistic):                    4.68e-88
Time:                        09:28:11   Log-Likelihood:                         -10566.
No. Observations:                1340   AIC:                                  2.114e+04
Df Residuals:                    1336   BIC:                                  2.116e+04
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                           coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

In [10]:
df = pd.read_csv("USD000000TODWP.csv")
df_OLS_anal(df)

                                 OLS Regression Results                                
Dep. Variable:                    SUM   R-squared (uncentered):                   0.039
Model:                            OLS   Adj. R-squared (uncentered):              0.038
Method:                 Least Squares   F-statistic:                              134.1
Date:                Mon, 22 Jan 2024   Prob (F-statistic):                   1.65e-112
Time:                        09:28:40   Log-Likelihood:                     -1.1489e+05
No. Observations:               13388   AIC:                                  2.298e+05
Df Residuals:                   13384   BIC:                                  2.298e+05
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                           coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

In [11]:
df = pd.read_csv("USD000UTSTOMWP.csv")
df_OLS_anal(df) 

                            OLS Regression Results                            
Dep. Variable:                    SUM   R-squared:                       0.031
Model:                            OLS   Adj. R-squared:                  0.031
Method:                 Least Squares   F-statistic:                     448.6
Date:                Mon, 22 Jan 2024   Prob (F-statistic):          6.81e-287
Time:                        09:28:57   Log-Likelihood:            -3.6245e+05
No. Observations:               41960   AIC:                         7.249e+05
Df Residuals:                   41956   BIC:                         7.249e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
-rho                 -1.835e+04 

In [12]:
df = pd.read_csv("EURUSD000TOMWP.csv")
df_OLS_anal(df)

                                 OLS Regression Results                                
Dep. Variable:                    SUM   R-squared (uncentered):                   0.127
Model:                            OLS   Adj. R-squared (uncentered):              0.124
Method:                 Least Squares   F-statistic:                              33.13
Date:                Mon, 22 Jan 2024   Prob (F-statistic):                    8.13e-26
Time:                        09:38:10   Log-Likelihood:                         -4369.6
No. Observations:                 912   AIC:                                      8747.
Df Residuals:                     908   BIC:                                      8766.
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                           coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

In [46]:
df = pd.read_csv("DataWithoutPairs_USDRUB.csv")
df = df[df["Time"] > 3600]
df = df[df["Time"] < 30000]
df = df.reset_index()
df_OLS_anal(df)

                                 OLS Regression Results                                
Dep. Variable:                    SUM   R-squared (uncentered):                   0.030
Model:                            OLS   Adj. R-squared (uncentered):              0.029
Method:                 Least Squares   F-statistic:                              47.30
Date:                Sun, 21 Jan 2024   Prob (F-statistic):                    3.21e-39
Time:                        12:20:39   Log-Likelihood:                         -56672.
No. Observations:                6109   AIC:                                  1.134e+05
Df Residuals:                    6105   BIC:                                  1.134e+05
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                           coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

In [47]:
df = pd.read_csv("DataWithoutPairs_USDRUB.csv")
df = df[df["AskAfter"] - df["AskBefore"] > 0]
df = df[df["AskAfter"] - df["AskBefore"] < 0.03]
print(len(df["AskAfter"] - df["AskBefore"]))
df_OLS_anal(df)

3330
                                 OLS Regression Results                                
Dep. Variable:                    SUM   R-squared (uncentered):                   0.194
Model:                            OLS   Adj. R-squared (uncentered):              0.193
Method:                 Least Squares   F-statistic:                              200.4
Date:                Sun, 21 Jan 2024   Prob (F-statistic):                   3.70e-154
Time:                        12:20:50   Log-Likelihood:                         -27271.
No. Observations:                3327   AIC:                                  5.455e+04
Df Residuals:                    3323   BIC:                                  5.457e+04
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------