In [7]:
import os
import sys
import pandas as pd
import numpy as np

# Set the NOVA environment variables for this kernel session.
# NOTE(review): presumably read by the project modules imported below — confirm.
os.environ['NOVA_HOME'] = '/home/projects/hornsteinlab/Collaboration/NOVA/'
os.environ['NOVA_DATA_HOME'] = '/home/projects/hornsteinlab/Collaboration/NOVA/input'

# Put the NOVA checkout on the import path (position 1, right after the first entry),
# presumably so that the `manuscript` and `src` packages imported below resolve.
sys.path.insert(1, os.getenv("NOVA_HOME"))
print(f"NOVA_HOME: {os.getenv('NOVA_HOME')}")

from manuscript.effects_config import NeuronsEffectWTBaselineConfig
from src.datasets.dataset_config import DatasetConfig
from src.embeddings.embeddings_utils import load_embeddings



NOVA_HOME: /home/projects/hornsteinlab/Collaboration/NOVA/


In [8]:
class NeuronsDay8NEWConfig(DatasetConfig):
    """
    Dataset configuration for the 'neuronsDay8_new' experiment.

    Overrides the DatasetConfig defaults with six batch folders, seven cell
    lines and the 'Untreated' condition only. How these attributes are
    consumed is defined by DatasetConfig and the embedding loader (not shown here).
    """
    def __init__(self):
        super().__init__()

        # Batch folders to include.
        self.INPUT_FOLDERS = ["batch1", 'batch2', 'batch3', 'batch7', 'batch8', 'batch9']
        self.EXPERIMENT_TYPE = 'neuronsDay8_new'

        # Cell lines and conditions to include.
        self.CELL_LINES = ['WT', 'OPTN', 'TBK1', 'FUSHomozygous', 'FUSHeterozygous', 'FUSRevertant', 'TDP43']
        self.CONDITIONS = ['Untreated']

class dNLSConfig(DatasetConfig):
    """
    Dataset configuration for the 'dNLS' experiment.

    Overrides the DatasetConfig defaults with five batch folders and the single
    'dNLS' cell line. CONDITIONS is not set here, so whatever DatasetConfig
    provides applies (the recorded output in this notebook shows both 'DOX'
    and 'Untreated' rows for this config).
    """
    def __init__(self):
        super().__init__()
        # Batch folders to include.
        self.INPUT_FOLDERS = ["batch1", 'batch2', 'batch4', 'batch5', 'batch6']
        self.EXPERIMENT_TYPE = 'dNLS'

        # Cell lines to include.
        self.CELL_LINES = ['dNLS']

In [11]:

def prepare_embeddings_df(embeddings: np.ndarray,
                          labels: np.ndarray, paths) -> pd.DataFrame:
    """
    Create a DataFrame with embeddings and metadata parsed from sample labels and paths.

    Each label of the form "marker_cellline_condition_batch_rep" is split on '_'
    into separate columns. The site and tile number are taken from the last two
    components of each path (split on ``os.sep``).

    Args:
        embeddings (np.ndarray):  Embeddings array of shape (n_samples, n_features).
        labels (np.ndarray):      Array of label strings, one per embedding row.
        paths:                    Iterable of path strings, one per embedding row. The
                                  second-to-last component becomes 'site'; the last
                                  component must be parseable by ``int`` and becomes
                                  'tile_number'.

    Returns:
        pd.DataFrame: DataFrame containing the embedding columns plus the columns
            ['label', 'site', 'tile_number',
             'marker', 'cell_line', 'condition', 'batch', 'rep'].

    Raises:
        ValueError: If any label string does not contain exactly 5 underscore-separated parts.
    """
    # NOTE: the annotations use plain ``np.ndarray``; ``np.ndarray[float]`` is not a
    # valid dtype parametrisation and may be rejected by some NumPy versions.
    label_columns = ['marker', 'cell_line', 'condition', 'batch', 'rep']
    expected_parts = len(label_columns)

    df = pd.DataFrame(embeddings)
    df['label'] = labels
    df['site'] = [p.split(os.sep)[-2] for p in paths]
    df['tile_number'] = [int(p.split(os.sep)[-1]) for p in paths]

    # With expand=True the result is as wide as the longest label; shorter
    # labels are padded with missing values.
    split_labels = df['label'].str.split('_', expand=True)

    # Validate every label individually. Checking only the width of
    # `split_labels` would miss short labels whenever some other label has
    # 5 parts, and would blame the valid labels when one label is too long.
    parts_per_label = split_labels.notna().sum(axis=1)
    invalid_mask = parts_per_label != expected_parts
    if invalid_mask.any():
        invalid_labels = df.loc[invalid_mask, 'label'].tolist()
        raise ValueError(
            f"Some label strings are invalid (expected 5 parts separated by '_').\n"
            f"Example invalid labels: {invalid_labels[:5]}"
        )

    df[label_columns] = split_labels
    return df

def count_sites_all(df: pd.DataFrame) -> pd.DataFrame:
    """
    Tally the number of distinct sites per (marker, cell_line, condition, batch) group.

    Args:
        df: DataFrame holding at least the columns
            ['marker', 'cell_line', 'condition', 'batch', 'site'].

    Returns:
        DataFrame with one row per group and the columns
        ['marker', 'cell_line', 'condition', 'batch', 'n_sites'],
        ordered by marker and then batch.
    """
    group_keys = ["marker", "cell_line", "condition", "batch"]

    # Distinct site names within each group.
    sites_per_group = df.groupby(group_keys)["site"].nunique()

    # Flatten the group index back into columns and name the count column.
    counts = sites_per_group.reset_index(name="n_sites")
    return counts.sort_values(by=["marker", "batch"])

def show_filtered_out(df: pd.DataFrame, min_sites_count:int):
    """
    Print the rows whose 'n_sites' value is below a minimum.

    Args:
        df: DataFrame with an 'n_sites' column (e.g. the output of count_sites_all).
        min_sites_count: Rows with n_sites strictly below this value are reported.

    Returns:
        None. The report is written to stdout.
    """
    below_threshold = df.loc[df["n_sites"] < min_sites_count]

    # Nothing below the threshold: report that and stop.
    if below_threshold.empty:
        print(f"All samples meet the minimum site count of {min_sites_count}.")
        return

    print(f"Filtered out samples with less than {min_sites_count} sites:")
    print(below_threshold)

# Default model output folder from which embeddings are loaded.
DEFAULT_EMBEDDINGS_OUTPUT_FOLDER = (
    "/home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/"
    "finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen"
)


def get_counts_sites(config_data, output_folder_path: str = DEFAULT_EMBEDDINGS_OUTPUT_FOLDER):
    """
    Load embeddings for a dataset config and count unique sites per group.

    Args:
        config_data: Dataset configuration object passed through to ``load_embeddings``.
        output_folder_path: Model output folder passed to ``load_embeddings``.
            Defaults to the folder this notebook was originally written against,
            so existing single-argument calls behave as before.

    Returns:
        pd.DataFrame: Result of ``count_sites_all`` on the prepared embeddings
        DataFrame, i.e. one row per (marker, cell_line, condition, batch)
        with an 'n_sites' column.
    """
    embeddings, labels, paths = load_embeddings(output_folder_path, config_data)
    print(f"Loaded embeddings: {embeddings.shape}, labels: {labels.shape}")

    embeddings_df = prepare_embeddings_df(embeddings, labels, paths)
    print(f"Prepared embeddings DataFrame: {embeddings_df.shape}")

    return count_sites_all(embeddings_df)



In [5]:
# Lift the pandas row-display limit so the count tables below are shown in full;
# the option is restored by pd.reset_option("display.max_rows") in the last cell.
pd.set_option("display.max_rows", None)

In [12]:
# Unique-site counts per (marker, cell_line, condition, batch) for the dNLS config.
dNLS_counts = get_counts_sites(dNLSConfig())
dNLS_counts

Loaded embeddings: (1134025, 192), labels: (1134025,)
Prepared embeddings DataFrame: (1134025, 200)


Unnamed: 0,marker,cell_line,condition,batch,n_sites
0,ANXA11,dNLS,DOX,batch1,627
5,ANXA11,dNLS,Untreated,batch1,652
1,ANXA11,dNLS,DOX,batch2,331
6,ANXA11,dNLS,Untreated,batch2,260
2,ANXA11,dNLS,DOX,batch4,712
7,ANXA11,dNLS,Untreated,batch4,683
3,ANXA11,dNLS,DOX,batch5,749
8,ANXA11,dNLS,Untreated,batch5,745
4,ANXA11,dNLS,DOX,batch6,397
9,ANXA11,dNLS,Untreated,batch6,233


In [13]:
# Print the dNLS groups that have fewer than 80 unique sites.
show_filtered_out(dNLS_counts, 80)

Filtered out samples with less than 80 sites:
    marker cell_line  condition   batch  n_sites
45   DCP1A      dNLS  Untreated  batch1       20
49   DCP1A      dNLS  Untreated  batch6       40
145   NEMO      dNLS  Untreated  batch1       20
149   NEMO      dNLS  Untreated  batch6       40
164  PEX14      dNLS        DOX  batch6       72
169  PEX14      dNLS  Untreated  batch6       30
176    PML      dNLS  Untreated  batch2       38
179    PML      dNLS  Untreated  batch6       33
214   SNCA      dNLS        DOX  batch6       71
219   SNCA      dNLS  Untreated  batch6       30
246  TDP43      dNLS  Untreated  batch2       38
249  TDP43      dNLS  Untreated  batch6       33


In [14]:
# Unique-site counts per (marker, cell_line, condition, batch) for the neuronsDay8_new config.
neuronsDay8_new_counts = get_counts_sites(NeuronsDay8NEWConfig())
neuronsDay8_new_counts

Loaded embeddings: (6420755, 192), labels: (6420755,)
Prepared embeddings DataFrame: (6420755, 200)


Unnamed: 0,marker,cell_line,condition,batch,n_sites
0,ANXA11,FUSHeterozygous,Untreated,batch1,480
6,ANXA11,FUSHomozygous,Untreated,batch1,377
12,ANXA11,FUSRevertant,Untreated,batch1,427
18,ANXA11,OPTN,Untreated,batch1,415
24,ANXA11,TBK1,Untreated,batch1,438
30,ANXA11,TDP43,Untreated,batch1,485
36,ANXA11,WT,Untreated,batch1,447
1,ANXA11,FUSHeterozygous,Untreated,batch2,425
7,ANXA11,FUSHomozygous,Untreated,batch2,431
13,ANXA11,FUSRevertant,Untreated,batch2,421


In [15]:
# Print the neuronsDay8_new groups that have fewer than 80 unique sites.
show_filtered_out(neuronsDay8_new_counts, 80)

Filtered out samples with less than 80 sites:
    marker        cell_line  condition   batch  n_sites
255    FUS  FUSHeterozygous  Untreated  batch7        3
261    FUS    FUSHomozygous  Untreated  batch7        5
267    FUS     FUSRevertant  Untreated  batch7        5
273    FUS             OPTN  Untreated  batch7        1
279    FUS             TBK1  Untreated  batch7        5
285    FUS            TDP43  Untreated  batch7        3
291    FUS               WT  Untreated  batch7        4


In [None]:

# Restore the pandas default row-display limit that was lifted earlier with pd.set_option.
pd.reset_option("display.max_rows")