In [15]:
import pandas as pd
import numpy as np

def compute_adjusted_covariance(df1, df2, timestep, start_time, end_time, time_col, value_col):
    """
    Compute a single adjusted covariance value for two stocks over a time range.

    The estimator is the mean of the element-wise products over the times
    at which both series are observed:

        Cov(Y_i, Y_j) = 1/Q_ij * sum_{t in Q_ij} Y_it * Y_jt

    where df1 = stock_i, df2 = stock_j and Q_ij is the set of times in
    [start_time, end_time] at which both Y_it and Y_jt are present and not NaN.

    Parameters:
    - df1, df2: DataFrames holding integer-convertible times and numeric values
    - timestep: float or int, the increment between time steps. Kept for
      interface compatibility: matching rows on the raw integer times is
      equivalent to matching on the rescaled index (t - start_time) / timestep,
      so the rescaling itself is not needed to find the common times.
    - start_time, end_time: int, the inclusive time range to consider
    - time_col, value_col: int, *positional* column indices of the time and
      value columns (column labels are never consulted)

    Returns:
    - covariance: float, the adjusted covariance scalar, or 0.0 when the two
      series share no usable observation time inside the range.

    The input DataFrames are not modified.
    """

    def _window(df):
        # Build a small fresh two-column frame with fixed labels. Selecting by
        # position keeps the function working whatever the column labels are,
        # and avoids writing into a filtered slice of the caller's frame.
        times = np.asarray(df.iloc[:, time_col]).astype(int)
        values = np.asarray(df.iloc[:, value_col], dtype=float)
        in_range = (times >= start_time) & (times <= end_time)
        return pd.DataFrame({"t": times[in_range], "y": values[in_range]})

    # Inner join: only times present in both windows survive.
    merged = pd.merge(
        _window(df1),
        _window(df2),
        on="t",
        how="inner",
        suffixes=("_i", "_j"),
    )

    # A NaN value is a missing observation, so it must not count towards Q_ij.
    # Without this, .sum() would skip the pair while len() still counted it.
    merged = merged.dropna(subset=["y_i", "y_j"])

    # If no common times, return 0
    if merged.empty:
        return 0.0

    sum_products = (merged["y_i"] * merged["y_j"]).sum()
    covariance = sum_products / len(merged)
    return covariance



#### Compute covariance matrix from log-returns

In [3]:
import pandas as pd
import numpy as np

def compute_covariance(df1, df2):
    """
    Compute a single adjusted covariance value for two stocks over the available time range.

    The estimator is the mean of the element-wise products over the times
    at which both series are observed:

        Cov(Y_i, Y_j) = 1/Q_ij * sum_{t in Q_ij} Y_it * Y_jt

    where df1 = stock_i, df2 = stock_j and Q_ij is the set of 'index' values
    at which both Y_it and Y_jt are present and not NaN.

    Parameters:
    - df1, df2: DataFrames with an 'index' column (time indices) and a
      'log_return' column
    - all dfs should be over the same timeframe

    Returns:
    - covariance: float, the adjusted covariance scalar, or 0.0 when the two
      series share no usable observation time.
    """
    # Inner join: only rows whose 'index' appears in both frames are kept.
    merged = pd.merge(
        df1[['index', 'log_return']],
        df2[['index', 'log_return']],
        on='index',
        how='inner',
        suffixes=('_i', '_j'),
    )

    # A NaN log-return is a missing observation, so it must not count towards
    # Q_ij. Without this, .sum() would skip the pair while len() still counted
    # it, shrinking the estimate towards zero.
    merged = merged.dropna(subset=['log_return_i', 'log_return_j'])

    # If no common times, return 0
    if merged.empty:
        return 0.0

    sum_products = (merged['log_return_i'] * merged['log_return_j']).sum()
    covariance = sum_products / len(merged)
    return covariance



In [9]:
# Load the list of parquet file paths: one path per line, with surrounding
# whitespace removed and blank lines skipped.
with open("USD_60_filenames_parquet.txt", "r") as file:
    filepaths = list(filter(None, map(str.strip, file)))

['USD_60_2022-2025/BNTUSD_60.parquet', 'USD_60_2022-2025/REPUSD_60.parquet', 'USD_60_2022-2025/CTSIUSD_60.parquet', 'USD_60_2022-2025/KARUSD_60.parquet', 'USD_60_2022-2025/BNCUSD_60.parquet', 'USD_60_2022-2025/BANDUSD_60.parquet', 'USD_60_2022-2025/KEEPUSD_60.parquet', 'USD_60_2022-2025/OGNUSD_60.parquet', 'USD_60_2022-2025/LSKUSD_60.parquet', 'USD_60_2022-2025/REPV2USD_60.parquet', 'USD_60_2022-2025/GNOUSD_60.parquet', 'USD_60_2022-2025/KNCUSD_60.parquet', 'USD_60_2022-2025/GHSTUSD_60.parquet', 'USD_60_2022-2025/MLNUSD_60.parquet', 'USD_60_2022-2025/ICXUSD_60.parquet', 'USD_60_2022-2025/RARIUSD_60.parquet', 'USD_60_2022-2025/YFIUSD_60.parquet', 'USD_60_2022-2025/MIRUSD_60.parquet', 'USD_60_2022-2025/LPTUSD_60.parquet', 'USD_60_2022-2025/CQTUSD_60.parquet', 'USD_60_2022-2025/KILTUSD_60.parquet', 'USD_60_2022-2025/RENUSD_60.parquet', 'USD_60_2022-2025/QTUMUSD_60.parquet', 'USD_60_2022-2025/SDNUSD_60.parquet', 'USD_60_2022-2025/OXTUSD_60.parquet', 'USD_60_2022-2025/SRMUSD_60.parquet', 'U

In [11]:
# Sanity check on the first two assets: compute the adjusted covariance with
# the arguments in both orders so the two values can be compared.
df1 = pd.read_parquet(filepaths[0])
df2 = pd.read_parquet(filepaths[1])

cov12, cov21 = compute_covariance(df1, df2), compute_covariance(df2, df1)
print(cov12, cov21)

2.4857356274699452e-05 2.4857356274699452e-05


In [None]:
# Build the N x N covariance matrix from pairwise adjusted covariances
# (each entry averages over the times observed in both series).
import numpy as np

N = len(filepaths)
C = np.zeros((N, N))
dfs = [pd.read_parquet(fp) for fp in filepaths]

# Only the upper triangle (including the diagonal) is computed; each value is
# mirrored into the lower triangle so that C is symmetric.
for i in range(N):
    for j in range(i, N):
        cov = compute_covariance(dfs[i], dfs[j])
        C[i, j] = cov
        C[j, i] = cov

# This cell only builds and displays the matrix. It does not write anything to
# disk, so it no longer prints a "saved" message.
print("Symmetric covariance matrix:")
print(C)

Symmetric covariance matrix:
[[1.49556270e-04 2.48573563e-05 7.25334956e-05 ... 4.38941751e-05
  5.46671869e-05 3.13944912e-05]
 [2.48573563e-05 6.48042293e-04 2.59410302e-05 ... 1.27746143e-05
  1.62781736e-05 1.20186884e-05]
 [7.25334956e-05 2.59410302e-05 1.93590965e-04 ... 5.47049880e-05
  7.22196204e-05 4.05847298e-05]
 ...
 [4.38941751e-05 1.27746143e-05 5.47049880e-05 ... 5.42416254e-05
  5.44069788e-05 3.61869329e-05]
 [5.46671869e-05 1.62781736e-05 7.22196204e-05 ... 5.44069788e-05
  1.14522982e-04 4.14788090e-05]
 [3.13944912e-05 1.20186884e-05 4.05847298e-05 ... 3.61869329e-05
  4.14788090e-05 3.34184353e-05]]


In [15]:
# Write the covariance matrix C to disk in NumPy's binary .npy format.
np.save('USD_60_2022_01_01-2025_03_31_covariance_logreturn.npy', C)