<a href="https://colab.research.google.com/github/SenorFoca/High-Frecuency-Trading/blob/main/HFT_CW2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install linearmodels



In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the dataset
data = pd.read_csv('https://raw.githubusercontent.com/SenorFoca/High-Frecuency-Trading/refs/heads/main/GSK_01022011.csv')

# Display rows
data

Unnamed: 0,#RIC,Date-Time,Type,Price,Volume,Bid Price,Bid Size,Ask Price,Ask Size,direction
0,GSK.L,2011-02-01T07:50:00.089774000Z,Quote,,,1127.0,135.0,1172.0,2000.0,0
1,GSK.L,2011-02-01T07:50:02.746387000Z,Quote,,,1127.0,135.0,1095.0,5726.0,0
2,GSK.L,2011-02-01T07:50:02.746387000Z,Auction,1125.000,5726.0,1127.0,,1095.0,,0
3,GSK.L,2011-02-01T07:50:02.746387000Z,Quote,,,1127.0,135.0,1095.0,8252.0,0
4,GSK.L,2011-02-01T07:50:02.746387000Z,Auction,1115.000,8252.0,1127.0,,1095.0,,0
...,...,...,...,...,...,...,...,...,...,...
55831,GSK.L,2011-02-01T16:44:05.285985000Z,Trade,1146.500,62906.0,1146.0,,1146.5,,1
55832,GSK.L,2011-02-01T16:48:34.299775000Z,Trade,1137.313,25869.0,1146.0,,1146.5,,-1
55833,GSK.L,2011-02-01T16:50:29.594990000Z,Correction,,,1146.0,,1146.5,,0
55834,GSK.L,2011-02-01T16:50:29.594990000Z,Trade,1145.680,9570.0,1146.0,,1146.5,,-1


In [None]:
# Convert the 'Date-Time' column from string to datetime object
data['Date-Time'] = pd.to_datetime(data['Date-Time'])

# Calculate time in seconds since midnight for each timestamp
data['time_S'] = data['Date-Time'].dt.hour * 3600 + data['Date-Time'].dt.minute * 60 + data['Date-Time'].dt.second + data['Date-Time'].dt.microsecond / 1e6

# Round the time to two decimal
data['time_S'] = data['time_S'].round(2)

# Preview the data
data.head()

Unnamed: 0,#RIC,Date-Time,Type,Price,Volume,Bid Price,Bid Size,Ask Price,Ask Size,direction,time_S
0,GSK.L,2011-02-01 07:50:00.089774+00:00,Quote,,,1127.0,135.0,1172.0,2000.0,0,28200.09
1,GSK.L,2011-02-01 07:50:02.746387+00:00,Quote,,,1127.0,135.0,1095.0,5726.0,0,28202.75
2,GSK.L,2011-02-01 07:50:02.746387+00:00,Auction,1125.0,5726.0,1127.0,,1095.0,,0,28202.75
3,GSK.L,2011-02-01 07:50:02.746387+00:00,Quote,,,1127.0,135.0,1095.0,8252.0,0,28202.75
4,GSK.L,2011-02-01 07:50:02.746387+00:00,Auction,1115.0,8252.0,1127.0,,1095.0,,0,28202.75


In [None]:
# Set 'Date-Time' column as the DataFrame index
data.set_index('Date-Time', inplace=True)

In [None]:
# Filter data based on time range
data = data[(data['time_S'] >= 28920) & (data['time_S'] <= 59400)]


## Q1

### Q1.1: Midquote series

In [None]:
# Fill missing values in 'Ask Price' and 'Bid Price' using forward fill
data = data.copy()
data['Ask Price'] = data['Ask Price'].ffill()
data['Bid Price'] = data['Bid Price'].ffill()

# Calculate spread and midquote
data['spread'] = data['Ask Price'] - data['Bid Price']
data['midquote'] = 0.5 * (data['Ask Price'] + data['Bid Price'])
data.head()

Unnamed: 0_level_0,#RIC,Type,Price,Volume,Bid Price,Bid Size,Ask Price,Ask Size,direction,time_S,spread,midquote
Date-Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2011-02-01 08:02:00.615896+00:00,GSK.L,Quote,,,1133.0,305.0,1137.0,5842.0,0,28920.62,4.0,1135.0
2011-02-01 08:02:00.615896+00:00,GSK.L,Quote,,,1133.0,305.0,1137.0,7025.0,0,28920.62,4.0,1135.0
2011-02-01 08:02:02.162469+00:00,GSK.L,Quote,,,1131.5,4275.0,1137.0,7025.0,0,28922.16,5.5,1134.25
2011-02-01 08:02:02.168743+00:00,GSK.L,Quote,,,1131.5,4700.0,1137.0,7025.0,0,28922.17,5.5,1134.25
2011-02-01 08:02:02.168743+00:00,GSK.L,Quote,,,1131.5,5186.0,1137.0,7025.0,0,28922.17,5.5,1134.25


### Q1.2: Time-weighted daily best bid-ask spread

In [None]:
# Compute time change between successive rows
timechange = data.time_S.shift(-1) - data.time_S
timechange.iloc[-1] = 59400 - data['time_S'].iloc[-1]

qspread = 10000 * (data['Ask Price'] - data['Bid Price']) / data['midquote']   # in percentage

TWqspread = np.nansum(qspread * timechange) / np.nansum(timechange)
print(TWqspread)

5.941482978896898


### Q1.3: Time-weighted daily market depth

In [None]:
# Compute time-weighted market depth
depth =data['Ask Size'] + data['Bid Size']
TWdepth = np.nansum(depth * timechange) / np.nansum(timechange)

print(TWdepth)

19314.925984386966


### Q1.4: Volume-weighted daily effective spread

In [None]:
# Add sequential index to each row
data['seq'] = range(len(data))

In [None]:
# Filter trades and calculate effective spread
dataTrades = data[(data['Type'] == 'Trade')]
espread = 10000 * 2 * (dataTrades['direction'] * (dataTrades['Price'] - dataTrades['midquote'])) / dataTrades['midquote']
VWdespread = np.nansum(espread * dataTrades['Volume']) / np.nansum(dataTrades['Volume'])

print(VWdespread)

8.194058522294055


### Q1.5: Volume-weighted daily 5-minute price impact

In [None]:
# Define trade horizon tau in seconds
tau = [300]

# Iterate over tau to calculate next midquote and price impact
for t in tau:
    nexMQ = []
    thiMQ = []

    for j in dataTrades.seq:
        thisMQ = data.iloc[j].midquote
        thisSec = data.iloc[j].time_S
        zz = data['time_S'] - (thisSec + 300)
        zz=pd.concat([zz,data.seq],axis=1)
        xx = zz[zz >= 0].seq

        # Get the first index where time is greater than or equal to thisSec + t
        if not xx.empty:
            xk = xx.iloc[0]
        else:
            xk = np.nan

        # Handle missing values
        if pd.isna(xk):
            nextMQ = np.nan
        else:
            nextMQ = data.iloc[xk].midquote

        thiMQ.append(thisMQ)
        nexMQ.append(nextMQ)

    # Add next midquote to dataTrades if lengths match
    if len(nexMQ) == len(dataTrades):
        dataTrades[f'nextMQ_tau_{t}'] = nexMQ
    else:
        print(f"Length mismatch for tau={t}, skipping this tau.")
        continue

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataTrades[f'nextMQ_tau_{t}'] = nexMQ


In [None]:
# Calculate price impact
PI = 100 * 2 * (dataTrades['direction'] * (dataTrades['nextMQ_tau_300'] - thiMQ)) / dataTrades['midquote']
VWpi = np.nansum(PI * dataTrades['Volume']) / np.nansum(dataTrades['Volume'])

print(VWpi)

-0.08191165149128508


### Q1.6: Volume-weighted daily 5-minute realised spread

In [None]:
# Calculate realized spread
rspread = 100 * 2 * (dataTrades['direction'] * (dataTrades['Price'] - dataTrades['nextMQ_tau_300'])) / dataTrades['midquote']
VWrspread = np.nansum(rspread * dataTrades['Volume']) / np.nansum(dataTrades['Volume'])

print(VWrspread)

0.1638522367142256


### Q1.7: Daily realised volatility

In [None]:
# Resample midquote to 5-minute intervals and compute log returns
resampled = data['midquote'].resample('5min').last()
log_returns = np.log(resampled / resampled.shift(1))

# Realized volatility
realised_volatility =  np.sqrt(np.nansum(log_returns))

print(realised_volatility)

0.08765305652552965


### Q1.8: Tick return series based on midquote

In [None]:
# Tick-by-tick log return series based on midquote
data['tick_return'] = np.log(data['midquote'] / data['midquote'].shift(1))

tick_returns = data['tick_return']
print(tick_returns.head())

Date-Time
2011-02-01 08:02:00.615896+00:00         NaN
2011-02-01 08:02:00.615896+00:00    0.000000
2011-02-01 08:02:02.162469+00:00   -0.000661
2011-02-01 08:02:02.168743+00:00    0.000000
2011-02-01 08:02:02.168743+00:00    0.000000
Name: tick_return, dtype: float64


### Q1.9: 5-minute return series based on midquote

In [None]:
# 5-minute return series based on midquote
mid_5min_returns = np.log(resampled / resampled.shift(1))

print(mid_5min_returns.dropna().head())

Date-Time
2011-02-01 08:05:00+00:00   -0.000441
2011-02-01 08:10:00+00:00   -0.001103
2011-02-01 08:15:00+00:00   -0.002209
2011-02-01 08:20:00+00:00   -0.001771
2011-02-01 08:25:00+00:00   -0.000665
Freq: 5min, Name: midquote, dtype: float64


### Q1.10 5-minute order imbalance series


In [None]:
##Order Imbalance
data['SignedVolume']=data['Volume']*data['direction']
OrderFlows = data['SignedVolume'].resample('5min').sum()
TotalVolume = data['Volume'].resample('5min').sum()
OrderImbalance=abs(OrderFlows)/TotalVolume

print(OrderImbalance.dropna().head())

Date-Time
2011-02-01 08:00:00+00:00    0.092534
2011-02-01 08:05:00+00:00    0.285091
2011-02-01 08:10:00+00:00    0.032638
2011-02-01 08:15:00+00:00    0.119250
2011-02-01 08:20:00+00:00    0.053576
Freq: 5min, dtype: float64


## Q2

In [None]:
from linearmodels.system import SUR
import statsmodels.api as sm
from scipy.stats import f

### Q2.a

In [None]:
## Prepare variables for regressions
panel=pd.DataFrame()
panel['d'] = dataTrades['direction']
panel['delta_d'] = dataTrades['direction'] - dataTrades['direction'].shift(1)
panel['q'] = dataTrades['direction'] * dataTrades['Volume']   #quantity can be rescaled by daily average trading volume
panel['delta_q'] = panel['q'] - panel['q'].shift(1)
panel['q_lag'] = panel['q'].shift(1)
panel['p'] = dataTrades['Price']
panel['delta_p'] = panel['p'] - panel['p'].shift(1)
panel=panel.dropna()



In [None]:
# OLS regression
model = sm.OLS(panel['delta_p'] ,panel.iloc[:,:2] )
results1 = model.fit()
coeffs1=results1.params

print(results1.summary())

                                 OLS Regression Results                                
Dep. Variable:                delta_p   R-squared (uncentered):                   0.145
Model:                            OLS   Adj. R-squared (uncentered):              0.144
Method:                 Least Squares   F-statistic:                              733.0
Date:                Wed, 02 Apr 2025   Prob (F-statistic):                   7.36e-295
Time:                        19:45:52   Log-Likelihood:                         -2267.6
No. Observations:                8681   AIC:                                      4539.
Df Residuals:                    8679   BIC:                                      4553.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [None]:
# Define equations
equations = {
    'eq1': 'q ~ q_lag',  # Equation 1: AR(1) for q_t
    'eq2': 'delta_p ~ q + q_lag + delta_d'  # Equation 2: Price impact regression
}

# Fit the SUR model
sur_model = SUR.from_formula(equations, panel)
results = sur_model.fit()

print(results.summary)

                           System GLS Estimation Summary                           
Estimator:                        GLS   Overall R-squared:                   0.0015
No. Equations.:                     2   McElroy's R-squared:                 0.0922
No. Observations:                8681   Judge's (OLS) R-squared:             0.0013
Date:                Wed, Apr 02 2025   Berndt's R-squared:                  0.1499
Time:                        19:45:52   Dhrymes's R-squared:                 0.0015
                                        Cov. Estimator:                      robust
                                        Num. Constraints:                      None
                     Equation: eq1, Dependent Variable: q                     
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
q_lag          0.0383     0.0155     2.4791     0.0132      0.0080      0.0686
            

In [None]:
# Use estimated coefficients to compute structural parameters
phi = 0.0383
lambda_phi = -8.62e-6
lambda_plus_beta = 8.97e-6
gamma = 0.1593

# Derive lambda and beta based on SUR estimates
lambda_ = abs(lambda_phi) / phi
beta = lambda_plus_beta - lambda_

# Print estimated structural parameters
print("Estimated Parameters:")
print(f"φ (phi): {phi}")
print(f"Estimated λ (lambda): {lambda_:.6e}")
print(f"Estimated β (beta): {beta:.6e}")
print(f"γ (gamma): {gamma}")

Estimated Parameters:
φ (phi): 0.0383
Estimated λ (lambda): 2.250653e-04
Estimated β (beta): -2.160953e-04
γ (gamma): 0.1593
