In [1]:
import numpy as np
import os
import numpy as np
from scipy.stats import ttest_rel

In [2]:
def paired_ttests(A: np.ndarray,
                  B: np.ndarray,
                  alternative: str = "two-sided"):
    """
    Run one paired t-test per channel (axis=1) between A and B.

    Only the LAST entry along axis 2 (index -1) of each array is compared;
    the other entries of that axis are ignored.
    NOTE(review): which metric sits at index -1 is decided by whoever produced
    the arrays - confirm it is the one you intend to test.

    Parameters
    ----------
    A, B : np.ndarray
        Arrays of identical shape (N, 2, K) with K >= 1 (typically K == 3),
        holding scores from two models on the same N samples. Row i of A and
        row i of B must refer to the same sample for the test to be paired.
    alternative : {"two-sided", "greater", "less"}
        Type of test:
          - "two-sided": H0 is mean(A - B) == 0
          - "greater":   H0 is mean(A - B) <= 0 (A not better than B)
          - "less":      H0 is mean(A - B) >= 0 (A not worse than B)

    Returns
    -------
    t_vals : np.ndarray
        Shape (2,), t-statistics for channels 0 and 1.
    p_vals : np.ndarray
        Shape (2,), p-values for the chosen alternative.

    Raises
    ------
    ValueError
        If the shapes differ, the shape is not (N, 2, K>=1), or
        `alternative` is not one of the three supported values.
    """
    if A.shape != B.shape:
        raise ValueError(f"Shape mismatch: A{A.shape} vs B{B.shape}")
    if A.ndim != 3 or A.shape[1] != 2 or A.shape[2] < 1:
        raise ValueError("Expected shape (N, 2, 3) with last dim >= 1.")
    # Validate before doing any work, so a typo fails fast.
    if alternative not in ("two-sided", "greater", "less"):
        raise ValueError("alternative must be 'two-sided', 'greater', or 'less'")

    # Keep only the last entry of axis 2 -> shape (N, 2)
    a_last = A[:, :, -1]
    b_last = B[:, :, -1]

    # One vectorized call tests both channels (columns) at once; scipy
    # computes the one-sided p-value directly from the t distribution.
    t_stat, p_value = ttest_rel(a_last, b_last, axis=0, alternative=alternative)

    t_vals = np.asarray(t_stat, dtype=float)
    p_vals = np.asarray(p_value, dtype=float)

    return t_vals, p_vals


# Aggregate batches
def load_batches_results(base_path, experiment_foldername, folders_name):
    """
    Load and concatenate the saved test-set results of several batch folders.

    For every entry of `folders_name`, three files are read from
    `<base_path>/figures/<experiment_foldername>/attn_correlations/<folder>/`:
    `testset_corrs.npy`, `testset_labels.npy` and `testset_paths.npy`.

    Parameters
    ----------
    base_path : str
        Root output directory of one model.
    experiment_foldername : str
        Name of the experiment sub-folder below `figures`.
    folders_name : iterable of str
        Batch folder names, processed in the given order.

    Returns
    -------
    tuple of np.ndarray
        `(corrs, labels, paths)`: the correlation arrays stacked along the
        first axis with `np.vstack`, and the labels / paths of all folders
        collected element by element into one array each.
    """
    results_root = os.path.join(base_path, 'figures', experiment_foldername, 'attn_correlations')

    def _read(folder_name, filename):
        # Load a single .npy file belonging to one batch folder.
        return np.load(os.path.join(results_root, folder_name, filename))

    corr_batches, label_items, path_items = [], [], []
    for folder_name in folders_name:
        corr_batches.append(_read(folder_name, 'testset_corrs.npy'))
        label_items += list(_read(folder_name, 'testset_labels.npy'))
        path_items += list(_read(folder_name, 'testset_paths.npy'))

    return np.vstack(corr_batches), np.asarray(label_items), np.asarray(path_items)

def filter_by_overlapping_paths(corrs1, labels1, paths1, corrs2, labels2, paths2):
    """
    Restrict two result sets to the samples whose path occurs in both.

    Each triple `(corrs, labels, paths)` is masked with
    `np.isin(paths, shared_paths)`, so the original row order of every input
    is preserved and a path that appears several times in one input is kept
    every time it appears.

    Returns
    -------
    tuple
        `(corrs1, labels1, paths1, corrs2, labels2, paths2)`, all filtered.
    """
    shared_paths = np.array(list(set(paths1) & set(paths2)))

    def _keep_shared(corrs, labels, paths):
        # Boolean mask of the rows whose path is present in both inputs.
        keep = np.isin(paths, shared_paths)
        return corrs[keep], labels[keep], paths[keep]

    return _keep_shared(corrs1, labels1, paths1) + _keep_shared(corrs2, labels2, paths2)

def align_corrs(corrs, labels, paths):
    """
    Reorder `corrs`, `labels` and `paths` by ascending `paths`.

    Applying this to two result sets that contain the same paths puts their
    rows in the same order.

    Returns
    -------
    tuple
        `(corrs, labels, paths)`, each indexed with `np.argsort(paths)`.
    """
    order = np.argsort(paths)
    return tuple(values[order] for values in (corrs, labels, paths))

def get_unique_markers(labels):
    """
    Return the sorted, de-duplicated marker names found in `labels`.

    The marker of a label is the text before its first underscore (the whole
    label when it contains no underscore).
    """
    marker_names = []
    for label in labels:
        head, _sep, _tail = label.partition('_')
        marker_names.append(head)
    return np.unique(marker_names)

# NIH

In [3]:
# Root output directories of the two models being compared; results are read
# from <model_path>/figures/<experiment>/attn_correlations/<folder>/.
NOVA_model_path = "/home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen/"
pretrained_model_path = "/home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model/"

# Experiment sub-folder and the batch folders whose results are aggregated.
experiment_foldername = "NIH"
folders_name = ['batch1_all_reps_WT_Untreated_all_markers', 'batch2_all_reps_WT_Untreated_all_markers', 'batch3_all_reps_WT_Untreated_all_markers']

In [4]:
# Per-marker paired comparison of NOVA vs. the pretrained model on the NIH data.
# For every marker a one-sided paired t-test ("NOVA greater than pretrained")
# is run per channel, and the share of markers with p <= significance_level is reported.
significance_level = 0.05
nova_is_better_counter_nucleus, nova_is_better_counter_target = 0, 0

nova_corrs, nova_labels, nova_paths = load_batches_results(NOVA_model_path, experiment_foldername, folders_name)
print(nova_corrs.shape, nova_labels.shape, nova_paths.shape)

pretrained_corrs, pretrained_labels, pretrained_paths = load_batches_results(pretrained_model_path, experiment_foldername, folders_name)
print(pretrained_corrs.shape, pretrained_labels.shape, pretrained_paths.shape)

# Keep only samples present in both result sets, then sort both by path so
# that row i refers to the same sample in both models.
nova_corrs, nova_labels, nova_paths, pretrained_corrs, pretrained_labels, pretrained_paths = filter_by_overlapping_paths(nova_corrs, nova_labels, nova_paths,
                                                                                                                    pretrained_corrs, pretrained_labels, pretrained_paths)
nova_corrs, nova_labels, nova_paths = align_corrs(nova_corrs, nova_labels, nova_paths)
pretrained_corrs, pretrained_labels, pretrained_paths = align_corrs(pretrained_corrs, pretrained_labels, pretrained_paths)
print(nova_corrs.shape, nova_labels.shape, nova_paths.shape)
print(pretrained_corrs.shape, pretrained_labels.shape, pretrained_paths.shape)

# A paired test is only meaningful if the two sets are aligned sample by sample.
if not np.array_equal(nova_paths, pretrained_paths):
    raise ValueError("NOVA and pretrained paths are not aligned after filtering/sorting; "
                     "a paired t-test on these arrays would be invalid.")

all_markers = get_unique_markers(nova_labels)
# CD41 is left out of the comparison (and of the percentage denominators below).
# NOTE(review): the reason for excluding CD41 is not recorded here - document it.
unique_markers = np.asarray([m for m in all_markers if m != 'CD41'])
print(f"Unique markers: ({len(all_markers)})", all_markers)

for marker in unique_markers:
    # Match the marker as a label PREFIX, consistent with get_unique_markers;
    # a substring search could also hit "<marker>_" further inside a label.
    marker_prefix = f"{marker}_"
    n_mask = np.char.startswith(nova_labels, marker_prefix)
    p_mask = np.char.startswith(pretrained_labels, marker_prefix)

    n = nova_corrs[n_mask]
    p = pretrained_corrs[p_mask]

    t_stat, pval = paired_ttests(n, p, alternative='greater')

    # pval[0] / pval[1] are channels 0 / 1, reported below as nucleus / target.
    if pval[0] <= significance_level:
        nova_is_better_counter_nucleus += 1
    if pval[1] <= significance_level:
        nova_is_better_counter_target += 1


nova_is_better_nucleus_per = nova_is_better_counter_nucleus * 100.0 / len(unique_markers)
nova_is_better_target_per = nova_is_better_counter_target * 100.0 / len(unique_markers)
print(f"NOVA > Pretrained (nucleus channel): {nova_is_better_nucleus_per}%")
print(f"NOVA > Pretrained (target channel): {nova_is_better_target_per}%")


(300920, 2, 3) (300920,) (300920,)
(300920, 2, 3) (300920,) (300920,)
(300920, 2, 3) (300920,) (300920,)
(300920, 2, 3) (300920,) (300920,)
Unique markers: (27) ['ANAX11' 'CD41' 'CLTC' 'Calreticulin' 'DAPI' 'DCP1A' 'FMRP' 'FUS' 'G3BP1'
 'GM130' 'KIF5A' 'LAMP1' 'MitoTracker' 'NCL' 'NEMO' 'P54' 'PEX14' 'PML'
 'PSD95' 'PURA' 'Phalloidin' 'SNCA' 'SQSTM1' 'TDP43' 'TIA1' 'TOMM20'
 'TUJ1']
NOVA > Pretrained (nucleus channel): 73.07692307692308%
NOVA > Pretrained (target channel): 76.92307692307692%


AlyssaCoyne (old)

In [None]:
# Per-folder, per-marker paired t-tests (NOVA greater than pretrained) on the
# AlyssaCoyne results. To rerun on NIH instead, set experiment_foldername = "NIH"
# and use the batch{1,2,3}_all_reps_WT_Untreated_all_markers folders.
experiment_foldername = 'AlyssaCoyne'
folders = ['batch1_all_reps_c9orf72ALSPatients_Untreated_without_MERGED', 'batch1_all_reps_Controls_Untreated_without_MERGED', 'batch1_all_reps_sALSNegativeCytoTDP43_Untreated_without_MERGED', 'batch1_all_reps_sALSPositiveCytoTDP43_Untreated_without_MERGED']

nova_all_batches = []
pretrained_all_batches = []

nova_labels_all_batches = []
pretrained_labels_all_batches = []

for folder in folders:
    print(folder)

    # Same directory layout as load_batches_results, built from the model
    # roots defined in the configuration cell instead of repeated literals.
    nova_dir = os.path.join(NOVA_model_path, 'figures', experiment_foldername, 'attn_correlations', folder)
    pretrained_dir = os.path.join(pretrained_model_path, 'figures', experiment_foldername, 'attn_correlations', folder)

    nova = np.load(os.path.join(nova_dir, 'testset_corrs.npy'))
    pretrained = np.load(os.path.join(pretrained_dir, 'testset_corrs.npy'))

    nova_labels = np.load(os.path.join(nova_dir, 'testset_labels.npy'))
    pretrained_labels = np.load(os.path.join(pretrained_dir, 'testset_labels.npy'))

    # NOTE(review): sorting by label groups samples but does not by itself
    # guarantee that row i is the same sample in both models; the NIH cell
    # aligns on testset_paths.npy instead. Confirm the pairing before
    # relying on these p-values.
    nova_idx = np.argsort(nova_labels)
    pretrained_idx = np.argsort(pretrained_labels)

    nova_labels = nova_labels[nova_idx]
    nova = nova[nova_idx]
    pretrained_labels = pretrained_labels[pretrained_idx]
    pretrained = pretrained[pretrained_idx]

    markers_unique = get_unique_markers(nova_labels)

    for marker in markers_unique:
        print(marker)
        # Match the marker as a label PREFIX. A substring search for e.g.
        # "TDP43_" would presumably also hit labels that merely contain a
        # cell-line name ending in "TDP43" (see the folder names above).
        marker_prefix = f"{marker}_"
        n_mask = np.char.startswith(nova_labels, marker_prefix)
        p_mask = np.char.startswith(pretrained_labels, marker_prefix)

        n = nova[n_mask]
        p = pretrained[p_mask]

        print("# samples:", len(n), len(p))

        t_stat, pval = paired_ttests(n, p, alternative='greater')
        print(f"paired_ttest [NOVA greater than Pretrained] ({marker}):", "t_stat:", t_stat, "pval:", pval)
        print()