In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
FILES = {
    'df_0': r'C:\Users\User\OneDrive - ISEG\Desktop\Assessment Quant Researcher\0.parquet',
    'df_1': r'C:\Users\User\OneDrive - ISEG\Desktop\Assessment Quant Researcher\1.parquet', 
    'df_2': r'C:\Users\User\OneDrive - ISEG\Desktop\Assessment Quant Researcher\2.parquet'
}

In [3]:
loaded_files = {}
for name, file_path in FILES.items():
    df = pd.read_parquet(file_path)
    loaded_files[name] = df
    print(f"Loaded {name}")

Loaded df_0
Loaded df_1
Loaded df_2


#  Time Weighted Relative Quoted Spread

In [4]:
def calculate_daily_tws(df, name):
    """
    Calculate daily TWS from original high-frequency data
    """
    # Sort by timestamp and remove negative spreads
    df = df[df['ask'] > df['bid']].sort_values('timestamp').copy()
    
    # Extract date
    df['date'] = df['timestamp'].dt.date
    
    daily_tws = {}
    
    for date, day_data in df.groupby('date'):
        # Sort day data by timestamp
        day_data = day_data.sort_values('timestamp').copy()
        
        # Calculate relative spread
        day_data['relative_spread'] = (day_data['ask'] - day_data['bid']) / ((day_data['ask'] + day_data['bid']) / 2)
        
        # Calculate time intervals
        day_data['time_interval'] = day_data['timestamp'].diff().dt.total_seconds()
        
        # Remove first row (no time interval)
        day_data = day_data.iloc[1:]
        
        if len(day_data) > 0:
            # Calculate total time for this day
            total_time_T = day_data['time_interval'].sum()
            
            # Calculate daily TWS
            daily_tws_value = (1 / total_time_T) * (day_data['relative_spread'] * day_data['time_interval']).sum()
            daily_tws[date] = daily_tws_value
    
    return daily_tws



# Calculate daily TWS for all datasets

all_daily_tws = {}

for name, path in FILES.items():
    try:
        df = pd.read_parquet(path)
        daily_tws = calculate_daily_tws(df, name)
        all_daily_tws[name] = daily_tws
                
        # Calculate average of daily TWS
        avg_daily_tws = sum(daily_tws.values()) / len(daily_tws)
                
    except Exception as e:
        print(f"Failed to process {name}: {e}")

# Overall summary
print("OVERALL SUMMARY - AVERAGE OF DAILY TWS")
print(f"{'='*60}")

for name, daily_tws in all_daily_tws.items():
    avg_tws = sum(daily_tws.values()) / len(daily_tws)
    print(f"{name}: {avg_tws:.6f} ({avg_tws * 10000:.2f} bps) over {len(daily_tws)} days")

OVERALL SUMMARY - AVERAGE OF DAILY TWS
df_0: 0.000127 (1.27 bps) over 29 days
df_1: 0.009410 (94.10 bps) over 29 days
df_2: 0.000547 (5.47 bps) over 29 days


# CORWIN-SCHULTZ

In [5]:
def resample_daily_corwin(path, name):
    """Load, clean, and resample to daily for Corwin-Schultz"""
    df = pd.read_parquet(path)
    
    # Calculate spread and filter negative spreads
    df['spread'] = df['ask'] - df['bid']
    df = df[df['spread'] >= 0]
    
    # Resample to daily - using MAX ask and MIN bid for Corwin-Schultz
    df_daily = df.set_index('timestamp').resample('1D').agg({
        'bid': 'min',      
        'ask': 'max',       
        'spread': 'mean',  
    }).reset_index()
    

    return df_daily


resampled_daily = {}
for name, path in FILES.items():
    resampled_daily[name] = resample_daily_corwin(path, name)

In [6]:
def getBeta(series, sl):
    hl = series[['High', 'Low']].values  
    hl = np.log(hl[:, 0] / hl[:, 1]) ** 2
    hl = pd.Series(hl, index=series.index)
    beta = hl.rolling(window=2, min_periods=1).sum()
    beta = beta.rolling(window=sl, min_periods=1).mean()
    return beta.dropna()

def getGamma(series):
    h2 = series['High'].rolling(window=2, min_periods=1).max()
    l2 = series['Low'].rolling(window=2, min_periods=1).min()
    gamma = np.log(h2.values / l2.values) ** 2
    gamma = pd.Series(gamma, index=h2.index)
    return gamma.dropna()

def getAlpha(beta, gamma):
    den = 3 - 2 * 2 ** 0.5
    alpha = (2 ** 0.5 - 1) * (beta ** 0.5) / den
    alpha -= (gamma / den) ** 0.5
    alpha[alpha < 0] = 0
    return alpha.dropna()

def corwinSchultz(series, sl=1):
    beta = getBeta(series, sl)
    gamma = getGamma(series)
    alpha = getAlpha(beta, gamma)
    spread = 2 * (np.exp(alpha) - 1) / (1 + np.exp(alpha))
    startTime = pd.Series(series.index[0:spread.shape[0]], index=spread.index)
    spread = pd.concat([spread, startTime], axis=1)
    spread.columns = ['Spread', 'Start_Time']
    return spread

def getSigma(beta, gamma):
    k2 = (8 / np.pi) ** 0.5
    den = 3 - 2 * 2 ** 0.5
    sigma = (2 ** -0.5 - 1) * beta ** 0.5 / (k2 * den)
    sigma += (gamma / (k2 ** 2 * den)) ** 0.5
    sigma[sigma < 0] = 0
    return sigma


corwin_results = {}
for name, df_daily in resampled_daily.items():
    # Rename columns to match original code (High/Low instead of ask/bid)
    df_temp = df_daily.set_index('timestamp').copy()
    df_temp = df_temp.rename(columns={'ask': 'High', 'bid': 'Low'})
    
    cs_spread = corwinSchultz(df_temp, sl=1)
    avg_cs_spread = cs_spread['Spread'].mean()
    
    corwin_results[name] = {
        'corwin_spread': avg_cs_spread,
    }
    
    print(f"{name}: CS={avg_cs_spread:.6f}")

df_0: CS=0.001334
df_1: CS=0.000000
df_2: CS=0.001439


  hl = np.log(hl[:, 0] / hl[:, 1]) ** 2
  gamma = np.log(h2.values / l2.values) ** 2


# Roll's Measure

In [7]:
def process_data(path, name):
    """Load and clean data without resampling"""
    # Load data
    df = pd.read_parquet(path)
    # Filter data
    df = df[(df['ask'] > df['bid']) & (df['bid'] > 0) & (df['ask'] > 0)]
    # Calculate mid-price
    df['mid_price'] = (df['bid'] + df['ask']) / 2
    
    print(f"✅ {name}: {len(df):,} rows")
    return df

print("Loading and cleaning data (no resampling)")
print("=" * 40)

mid_price = {}
for name, path in FILES.items():
    mid_price[name] = process_data(path, name)

Loading and cleaning data (no resampling)
✅ df_0: 20,533,743 rows
✅ df_1: 22,582,671 rows
✅ df_2: 10,329,249 rows


In [8]:
def compute_price_diff(df):
    """
    Compute the difference between consecutive mid_price values.
    """
    df['price_diff'] = df['mid_price'].diff()
    df_clean = df.dropna(subset=['price_diff'])
    return df_clean

def autocovariance_lag1(price_diffs):
    """
    Calculate autocovariance of lag 1 for the price differences.
    """
    price_diffs = np.array(price_diffs)
    n = len(price_diffs)
    if n < 2:
        return np.nan
    gamma_1 = np.dot(price_diffs[:-1], price_diffs[1:]) / (n - 1)
    return gamma_1

def estimate_spread(price_diffs):
    """
    Estimate spread using autocovariance lag 1 of price differences.
    """
    gamma_1 = autocovariance_lag1(price_diffs)
    if gamma_1 < 0:
        return 2 * np.sqrt(-gamma_1)
    else:
        return np.nan  # or handle as needed

# Initialize a results dictionary
spread_results = {}

# Loop through each dataset and compute the spread using price differences
for name, df in mid_price.items():
    df_diff = compute_price_diff(df)
    price_diffs = df_diff['price_diff'].values
    gamma_1 = autocovariance_lag1(price_diffs)
    spread = estimate_spread(price_diffs)
    spread_results[name] = {
        'autocovariance_lag1': gamma_1,
        'estimated_spread': spread
    }
    print(f"{name}: Autocovariance lag 1 = {gamma_1:.6f}, Estimated Spread = {spread:.6f}")

# Final results
print("\nSummary of Spread Estimates based on Price Differences:")
for name, results in spread_results.items():
    print(f"{name}: Spread = {results['estimated_spread']:.6f}")

df_0: Autocovariance lag 1 = 0.000002, Estimated Spread = nan
df_1: Autocovariance lag 1 = -0.000248, Estimated Spread = 0.031467
df_2: Autocovariance lag 1 = -0.000076, Estimated Spread = 0.017468

Summary of Spread Estimates based on Price Differences:
df_0: Spread = nan
df_1: Spread = 0.031467
df_2: Spread = 0.017468


# Roll Model with CSS extensions

In [9]:
def resample_to_1min(df, name):
    """Resample data to 1-minute intervals"""
    df_1min = df.set_index('timestamp').resample('1T').agg({
        'bid': 'mean',
        'ask': 'mean', 
        'mid_price': 'mean'
    }).reset_index()
    
    print(f"✅ {name}: {len(df_1min):,} rows (1-min)")
    return df_1min

print("1-MINUTE RESAMPLING")
print("=" * 40)

resampled_1min = {}
for name, df in mid_price.items():
    resampled_1min[name] = resample_to_1min(df, name)

print(f"\n✅ All datasets resampled to 1-minute intervals")

1-MINUTE RESAMPLING


  df_1min = df.set_index('timestamp').resample('1T').agg({


✅ df_0: 46,940 rows (1-min)


  df_1min = df.set_index('timestamp').resample('1T').agg({


✅ df_1: 46,945 rows (1-min)


  df_1min = df.set_index('timestamp').resample('1T').agg({


✅ df_2: 46,940 rows (1-min)

✅ All datasets resampled to 1-minute intervals


Delta

In [10]:
def calculate_same_dir_prob(df):
    """Simplest version to calculate same direction probability"""
    price_changes = df['mid_price'].diff().dropna()
    
    # Get directions (1 for up, -1 for down, remove zeros)
    directions = np.sign(price_changes)
    directions = directions[directions != 0]  # Remove flat movements
    
    # Calculate same direction transitions
    same_dir = (directions.shift(1) == directions).sum()
    total_trans = len(directions) - 1  
    
    # delta is the probability of same direction
    delta = same_dir / total_trans if total_trans > 0 else 0
    
    return delta


deltas = {}
for name, df in resampled_1min.items(): 
    delta = calculate_same_dir_prob(df)
    deltas[name] = delta
    print(f"{name}: δ = {delta:.4f}")

df_0: δ = 0.5645
df_1: δ = 0.5649
df_2: δ = 0.5640


In [11]:
def resample_1min_bid_ask(df):
    """Resample data to 1-minute intervals"""
    return df.set_index('timestamp').resample('1T').agg({
        'bid': 'mean',
        'ask': 'mean', 
    }).reset_index()


resample_bid_ask = {}
for name, df in loaded_files.items():
    resample_bid_ask[name] = resample_1min_bid_ask(df)
    print(f"✅ Resampled {name}: {len(resampled_1min[name]):,} rows")

print(f"\n✅ All datasets resampled to 1-minute intervals")

  return df.set_index('timestamp').resample('1T').agg({


✅ Resampled df_0: 46,940 rows


  return df.set_index('timestamp').resample('1T').agg({


✅ Resampled df_1: 46,945 rows


  return df.set_index('timestamp').resample('1T').agg({


✅ Resampled df_2: 46,940 rows

✅ All datasets resampled to 1-minute intervals


In [12]:
def roll_css(df, delta):
    """
    Compute Roll model metrics for a dataset
    """      
    # Calculate actual spread from bid-ask
    actual_spread = (df['ask'] - df['bid']).mean()
        
    # Theoretical covariance with delta 
    CSS = -(actual_spread ** 2) * (1 - delta) ** 2

    return CSS

roll_css_results = {}
for name, df in resample_bid_ask.items():
    delta = deltas.get(name)
    results = roll_css(df, delta)
    roll_css_results[name] = results
    
    print(f"\n{name}:")
    print(f"  δ: {delta:.4f}")
    print(f"  The Roll Model with CSS extension: {results:.8f}")
    


df_0:
  δ: 0.5645
  The Roll Model with CSS extension: -0.00554989

df_1:
  δ: 0.5649
  The Roll Model with CSS extension: -0.04347072

df_2:
  δ: 0.5640
  The Roll Model with CSS extension: -0.00888808


N = 2

In [13]:
def roll_css(df, delta):
    """
    Compute Roll model metrics for a dataset
    """      
    # Calculate actual spread from bid-ask
    actual_spread = (df['ask'] - df['bid']).mean()
        
    # Theoretical covariance with delta 
    CSS = -(actual_spread ** 2) * (2 * (1 - delta) * delta) ** 2

    return CSS

roll_css_results = {}
for name, df in resample_bid_ask.items():
    delta = deltas.get(name)
    results = roll_css(df, delta)
    roll_css_results[name] = results
    
    print(f"\n{name}:")
    print(f"  δ: {delta:.4f}")
    print(f"  The Roll Model with CSS extension: {results:.8f}")
    


df_0:
  δ: 0.5645
  The Roll Model with CSS extension: -0.00707471

df_1:
  δ: 0.5649
  The Roll Model with CSS extension: -0.05548917

df_2:
  δ: 0.5640
  The Roll Model with CSS extension: -0.01131044


N = 3

In [14]:
def roll_css(df, delta):
    """
    Compute Roll model metrics for a dataset
    """      
    # Calculate actual spread from bid-ask
    actual_spread = (df['ask'] - df['bid']).mean()
        
    # Theoretical covariance with delta 
    CSS = -(actual_spread ** 2) * (3 * delta ** 2 * (1 - delta) + (1 - delta) ** 3) ** 2

    return CSS

roll_css_results = {}
for name, df in resample_bid_ask.items():
    delta = deltas.get(name)
    results = roll_css(df, delta)
    roll_css_results[name] = results
    
    print(f"\n{name}:")
    print(f"  δ: {delta:.4f}")
    print(f"  The Roll Model with CSS extension: {results:.8f}")
    


df_0:
  δ: 0.5645
  The Roll Model with CSS extension: -0.00728496

df_1:
  δ: 0.5649
  The Roll Model with CSS extension: -0.05715673

df_2:
  δ: 0.5640
  The Roll Model with CSS extension: -0.01164174


N = 4

In [15]:
def roll_css(df, delta):
    """
    Compute Roll model metrics for a dataset
    """      
    # Calculate actual spread from bid-ask
    actual_spread = (df['ask'] - df['bid']).mean()
        
    # Theoretical covariance with delta
    CSS = -(actual_spread ** 2) * (4 * delta ** 3 * (1 - delta) + 4 * delta * (1 - delta) ** 3) ** 2
        
    return CSS

roll_css_results = {}
for name, df in resample_bid_ask.items():
    delta = deltas.get(name)
    results = roll_css(df, delta)
    roll_css_results[name] = results
    
    print(f"\n{name}:")
    print(f"  δ: {delta:.4f}")
    print(f"  The Roll Model with CSS extension: {results:.8f}")
    


df_0:
  δ: 0.5645
  The Roll Model with CSS extension: -0.00731231

df_1:
  δ: 0.5649
  The Roll Model with CSS extension: -0.05737501

df_2:
  δ: 0.5640
  The Roll Model with CSS extension: -0.01168451


N = 5

In [16]:
def roll_css(df, delta):
    """
    Compute Roll model metrics for a dataset
    """      
    # Calculate actual spread from bid-ask
    actual_spread = (df['ask'] - df['bid']).mean()
        
    # Theoretical covariance with delta 
    CSS = -(actual_spread ** 2) * (5 * delta ** 4 * (1 - delta) + 10 * delta ** 2 * (1 - delta) ** 3 + (1 - delta)) ** 2
        
    return CSS

roll_css_results = {}
for name, df in resample_bid_ask.items():
    delta = deltas.get(name)
    results = roll_css(df, delta)
    roll_css_results[name] = results
    
    print(f"\n{name}:")
    print(f"  δ: {delta:.4f}")
    print(f"  The Roll Model with CSS extension: {results:.8f}")
    


df_0:
  δ: 0.5645
  The Roll Model with CSS extension: -0.02475938

df_1:
  δ: 0.5649
  The Roll Model with CSS extension: -0.19414105

df_2:
  δ: 0.5640
  The Roll Model with CSS extension: -0.03959747
