# Compute Bray–Curtis distance matrix

In [3]:
import os
import numpy as np
import pandas as pd

In [4]:
# Output locations. Every artifact persisted by this notebook lives under
# OUTPUT_FOLDER, which is created here if it does not exist yet.
OUTPUT_FOLDER = "saved_matrices"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Bray–Curtis matrix (.npy), one feature name per line (.txt), and the table
# saved by an initial run so later runs can detect it (.csv).
MATRIX_FILE = os.path.join(OUTPUT_FOLDER, "braycurtis_matrix_columns.npy")
FEATURE_NAMES_FILE = os.path.join(OUTPUT_FOLDER, "feature_names.txt")
OLD_TABLE_FILE = os.path.join(OUTPUT_FOLDER, "old_asv_table.csv")

# DIM and new_ddf are ordinary notebook-level names assigned in later cells.
# A `global` statement at module level has no effect, so none is used here.


In [5]:
# Threshold below which the Bray–Curtis denominator sum(x + y) is treated as
# zero; compute_new_col then reports a distance of 0.0 instead of dividing.
EPSILON = 1e-12

def update_progress(task, percent):
    """Report progress for ``task`` on standard output.

    Writes a single line of the form
    ``Progress Update: <task> - <percent>% complete`` and flushes stdout
    right away so the message is not held back by buffering.

    Args:
        task: Description of the step being reported.
        percent: Completion percentage to show (formatted as given).
    """
    message = f"Progress Update: {task} - {percent}% complete"
    print(message, flush=True)

## Parallel functions for new table internal distances

In [6]:
# # Global variables for new table internal computations.
# new_table_global = None
# m_global = None

# def init_worker_new(table, m):
#     """Initializer for workers computing distances within new table."""
#     global new_table_global, m_global
#     new_table_global = table
#     m_global = m

def compute_new_col(i, table=None, epsilon=None):
    """Compute Bray–Curtis distances between column ``i`` and every column ``j > i``.

    For each pair the distance is ``sum(|x - y|) / sum(x + y)``, where ``x`` and
    ``y`` are the two columns. When the denominator is smaller than ``epsilon``
    the distance is reported as ``0.0`` so no division by (near) zero happens.

    Args:
        i: Positional index of the reference column.
        table: DataFrame whose columns are compared. Defaults to the
            notebook-level ``new_ddf`` so existing ``compute_new_col(i)`` calls
            keep working.
        epsilon: Zero-denominator threshold. Defaults to the notebook-level
            ``EPSILON``.

    Returns:
        ``(i, result_vector)`` where ``result_vector`` is a float64 NumPy array
        with one entry per column of ``table``. Entries at positions ``<= i``
        are left at zero; only positions ``j > i`` hold computed distances.
    """
    # Fall back to notebook globals only when the caller did not supply values,
    # so the function can also be used (and tested) without hidden state.
    if table is None:
        table = new_ddf
    if epsilon is None:
        epsilon = EPSILON

    # Take the width from the table itself rather than a separate global,
    # so the result length can never be out of sync with the data.
    n_cols = table.shape[1]
    row_result = np.zeros(n_cols, dtype=np.float64)
    x = table.iloc[:, i].values
    for j in range(i + 1, n_cols):
        y = table.iloc[:, j].values
        denom = np.sum(x + y)
        if denom < epsilon:
            # Both columns are (effectively) empty: define the distance as 0.
            d = 0.0
        else:
            d = np.sum(np.abs(x - y)) / denom
        row_result[j] = d
    return i, row_result

In [7]:
# Placeholders for the cross-computation state; each one starts out unset (None).
# m_global_for_cross is meant to hold the number of new features.
old_table_global = new_table_global_cross = n_old_global = m_global_for_cross = None

## Main code

In [9]:
# Load the tab-separated table; the first file column becomes the row index.
new_ddf = pd.read_csv("ASV_table_MA.txt", sep="\t", index_col=[0])

In [10]:
# Preview the first rows of the freshly loaded table.
new_ddf.head()

Unnamed: 0_level_0,SRR17045222,SRR17045223,SRR17045226,SRR17045227
#NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ASV_1,932,0,0,0
ASV_10,0,373,0,0
ASV_100,0,135,0,0
ASV_1000,0,0,0,32
ASV_10000,0,1,10,0


In [11]:
# Summarize the loaded table: column dtypes, non-null counts and memory usage.
new_ddf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17868 entries, ASV_1 to ASV_9999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   SRR17045222  17868 non-null  int64
 1   SRR17045223  17868 non-null  int64
 2   SRR17045226  17868 non-null  int64
 3   SRR17045227  17868 non-null  int64
dtypes: int64(4)
memory usage: 698.0+ KB


In [12]:


# Keep only the columns that contain no missing values. No numeric coercion is
# applied here; columns keep whatever dtypes they already have.
non_nan = new_ddf.notna().all(axis=0)
good_cols = [label for label, complete in non_nan.items() if complete]
new_ddf = new_ddf.loc[:, good_cols]
print("Dropped NaN columns in new table")

# The surviving column labels are the features; DIM is how many there are.
new_features = [*new_ddf.columns]
DIM = len(new_features)
print("New features:", new_features, flush=True)

Dropped NaN columns in new table
New features: ['SRR17045222', 'SRR17045223', 'SRR17045226', 'SRR17045227']


In [13]:
# Append mode requires all three persisted artifacts (old table, feature names,
# matrix); if any is missing this run builds the initial matrix from scratch.
if all(map(os.path.exists, (OLD_TABLE_FILE, FEATURE_NAMES_FILE, MATRIX_FILE))):
    print("Old table exists. Entering append mode.", flush=True)

    # Reload the previously saved table and the matrix stored alongside it.
    old_table = pd.read_csv(OLD_TABLE_FILE, index_col=0)
    old_features = [*old_table.columns]
    n_old = len(old_features)
    print(f"Loaded old table with {n_old} features.", flush=True)
    existing_matrix = np.load(MATRIX_FILE)
    # NOTE: M_new is NOT assigned on this path; only the initial-run branch
    # below creates it.
else:
    print("No old table found. This run will create the initial BC matrix.", flush=True)

    # Pairwise distances among the new table's columns only.
    M_new = np.zeros((DIM, DIM), dtype=np.float64)
    for col_idx in range(DIM):
        distances = compute_new_col(col_idx)[1]
        M_new[col_idx, :] = distances
        # Mirror the computed part of the row into the matching column.
        M_new[col_idx + 1:, col_idx] = distances[col_idx + 1:]
        update_progress(f"Processed new feature {col_idx+1} of {DIM}", int((col_idx+1)/DIM * 100))

    # Average with the transpose to enforce symmetry, then zero the diagonal.
    M_new = (M_new + M_new.transpose()) / 2
    np.fill_diagonal(M_new, 0)

    # Persist the matrix, the feature names (one per line), and the table
    # itself so that a later run finds them and enters append mode.
    np.save(MATRIX_FILE, M_new)
    with open(FEATURE_NAMES_FILE, "w") as f:
        f.writelines(feat + "\n" for feat in new_features)
    new_ddf.to_csv(OLD_TABLE_FILE)
    print("Initial BC matrix computed and saved.", flush=True)
    update_progress("All tasks complete", 100)

Old table exists. Entering append mode.
Loaded old table with 4 features.


In [14]:
# M_new is only assigned by the initial-run branch of the preceding cell; when
# that cell takes the append-mode branch, this line raises NameError.
M_new.shape

NameError: name 'M_new' is not defined

In [42]:
# Display the matrix. NOTE(review): the recorded execution count for this cell
# is out of sequence, so its output presumably comes from a kernel state in
# which M_new had already been assigned — confirm with Restart & Run All.
M_new

array([[0.        , 0.49173138, 0.48670194, 0.49478497],
       [0.49173138, 0.        , 0.48606561, 0.48278666],
       [0.48670194, 0.48606561, 0.        , 0.48015266],
       [0.49478497, 0.48278666, 0.48015266, 0.        ]])