In [36]:
import numpy as np
from collections import Counter

import pandas as pd

In [52]:
# Root directory of the embeddings outputs; kept identical to the location that
# was previously hard-coded inside get_df so existing calls load the same files.
DEFAULT_EMBEDDINGS_DIR = "/home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen/embeddings"


def get_df(experiment_name, batches, embeddings_dir=DEFAULT_EMBEDDINGS_DIR):
    """Load and concatenate the test-set label arrays of several batches.

    Despite the name, this returns a NumPy array (the concatenation of the
    per-batch ``testset_labels.npy`` files), not a DataFrame.

    Args:
        experiment_name: Name of the experiment sub-directory under
            ``embeddings_dir``.
        batches: Iterable of batch identifiers; each ``b`` is resolved to
            ``<embeddings_dir>/<experiment_name>/batch<b>/testset_labels.npy``.
        embeddings_dir: Root directory of the embeddings outputs. Defaults to
            the original hard-coded location.

    Returns:
        numpy.ndarray: Labels of all requested batches, in the order the
        batches were given.

    Raises:
        ValueError: If ``batches`` is empty.
        OSError: Propagated from ``np.load`` when a label file is missing.
    """
    batches = list(batches)
    if not batches:
        # np.concatenate would also raise ValueError here, but with a message
        # that does not point at the actual cause.
        raise ValueError("get_df needs at least one batch to load")

    data_batches = [
        np.load(f"{embeddings_dir}/{experiment_name}/batch{b}/testset_labels.npy")
        for b in batches
    ]
    return np.concatenate(data_batches)

def get_counts(df, markers=None, cell_lines_conditions=None, batches=None, combine_batches=True, combine_reps=True):
    """Count labels per marker and cell-line/condition.

    Every label is split on ``"_"`` into exactly five fields, in this order:
    ``marker``, ``cellline``, ``condition``, ``batch``, ``rep``.

    Args:
        df: 1-D sequence/array of label strings (despite the name, this is
            not a DataFrame on input).
        markers: Optional collection of marker names to keep.
        cell_lines_conditions: Optional collection of
            ``"<cellline>_<condition>"`` strings to keep.
        batches: Optional batch filter, compared against the ``batch`` field
            of the labels. A single value or a collection is accepted.
            Integer ids ``N`` are matched as both ``"N"`` and ``"batchN"`` so
            the same ids passed to ``get_df`` can be reused here.
            NOTE(review): the ``"batchN"`` spelling is assumed from the
            ``batch{b}`` directory names used by ``get_df`` — confirm against
            the actual label strings.
        combine_batches: If False, counts are additionally broken down by
            ``batch``.
        combine_reps: If False, counts are additionally broken down by
            ``rep``.

    Returns:
        pandas.DataFrame: One row per group with the grouping columns
        (``marker``, ``cellline_condition`` and optionally ``batch``/``rep``)
        followed by ``count``. Requested markers/conditions that never occur
        in the labels simply produce no row.

    Raises:
        ValueError: If the labels do not split into exactly five fields.
    """
    label_fields = ["marker", "cellline", "condition", "batch", "rep"]

    groupby_fields = ["marker", "cellline_condition"]
    if not combine_batches:
        groupby_fields.append("batch")
    if not combine_reps:
        groupby_fields.append("rep")

    labels = pd.Series(df)
    if labels.empty:
        # Nothing to split; return the same column layout a non-empty call has.
        return pd.DataFrame(columns=groupby_fields + ["count"])

    # Split each label into its fields, one column per field.
    parts = labels.str.split("_", expand=True)
    if parts.shape[1] != len(label_fields):
        raise ValueError(
            f"expected labels with {len(label_fields)} '_'-separated fields "
            f"({', '.join(label_fields)}), got {parts.shape[1]}"
        )
    parts.columns = label_fields

    # Combine cellline and condition into one field
    parts["cellline_condition"] = parts["cellline"] + "_" + parts["condition"]

    if markers is not None:
        parts = parts[parts["marker"].isin(markers)]
    if cell_lines_conditions is not None:
        parts = parts[parts["cellline_condition"].isin(cell_lines_conditions)]
    if batches is not None:
        if isinstance(batches, (str, int, np.integer)):
            batches = [batches]
        wanted_batches = []
        for b in batches:
            if isinstance(b, (int, np.integer)):
                # The batch column holds strings, so a raw integer can never
                # match; translate it to the string spellings instead.
                wanted_batches.extend((str(b), f"batch{b}"))
            else:
                wanted_batches.append(b)
        parts = parts[parts["batch"].isin(wanted_batches)]

    counts = (
        parts.groupby(groupby_fields)
        .size()
        .reset_index(name="count")
    )

    return counts

# NIH

In [54]:
# Test-set labels of the NIH experiment, batches 1-3, counted per marker for
# the WT untreated/stress conditions (batches and reps pooled).
NIH_df = get_df("NIH", batches=[1, 2, 3])
markers = [
    'G3BP1', 'FMRP', 'MitoTracker', 'PML',
    'TOMM20', 'DCP1A', 'PURA',
]
# NOTE(review): the recorded output has no TOMM20 or PURA rows; markers that
# do not occur in the labels are dropped without any warning.
get_counts(
    NIH_df,
    markers=markers,
    cell_lines_conditions=['WT_Untreated', 'WT_stress'],
)


Unnamed: 0,marker,cellline_condition,count
0,DCP1A,WT_Untreated,6584
1,DCP1A,WT_stress,7347
2,FMRP,WT_Untreated,7013
3,FMRP,WT_stress,7391
4,G3BP1,WT_Untreated,7491
5,G3BP1,WT_stress,7290
6,MitoTracker,WT_Untreated,7059
7,MitoTracker,WT_stress,6996
8,PML,WT_Untreated,5738
9,PML,WT_stress,6361


# dNLS

In [55]:
# Test-set labels of the dNLS experiment (batches 1, 2, 4, 5, 6), counted per
# marker for the untreated and DOX conditions with batches and reps pooled.
dNLS_df = get_df("dNLS", batches=[1, 2, 4, 5, 6])
markers = ['TDP43', 'LSM14A', 'DCP1A']
get_counts(
    dNLS_df,
    markers=markers,
    cell_lines_conditions=['dNLS_Untreated', 'dNLS_DOX'],
)


Unnamed: 0,marker,cellline_condition,count
0,DCP1A,dNLS_DOX,8617
1,DCP1A,dNLS_Untreated,9691
2,LSM14A,dNLS_DOX,9510
3,LSM14A,dNLS_Untreated,8583
4,TDP43,dNLS_DOX,12530
5,TDP43,dNLS_Untreated,8507


# AlyssaCoyne Pilot

In [68]:
# AlyssaCoyne pilot, batch 1: per-replicate counts for the selected markers
# across every cell line/condition present in the labels.
alyssaCoyne_df = get_df("AlyssaCoyne", batches=[1])
markers = ['TDP43', 'DCP1A', 'DAPI', 'Map2']
alyssaCoyne_counts = get_counts(
    alyssaCoyne_df,
    markers=markers,
    combine_reps=False,
)
print(alyssaCoyne_counts)
# Summary statistics of the per-replicate counts.
alyssaCoyne_counts.describe()


   marker               cellline_condition   rep  count
0    DAPI               Controls_Untreated  rep1    129
1    DAPI               Controls_Untreated  rep2    124
2    DAPI               Controls_Untreated  rep3     72
3    DAPI               Controls_Untreated  rep4    158
4    DAPI               Controls_Untreated  rep5    140
..    ...                              ...   ...    ...
79  TDP43  sALSPositiveCytoTDP43_Untreated  rep5    101
80  TDP43  sALSPositiveCytoTDP43_Untreated  rep6    111
81  TDP43  sALSPositiveCytoTDP43_Untreated  rep7    127
82  TDP43  sALSPositiveCytoTDP43_Untreated  rep8     98
83  TDP43  sALSPositiveCytoTDP43_Untreated  rep9     94

[84 rows x 4 columns]


Unnamed: 0,count
count,84.0
mean,108.547619
std,26.27675
min,62.0
25%,86.0
50%,106.0
75%,129.0
max,158.0


# AlyssaCoyne New

In [73]:
# AlyssaCoyne_new, batch 1: per-replicate counts of TDP43 and DCP1A across
# every cell line/condition present in the labels.
alyssaCoyne_new_df = get_df("AlyssaCoyne_new", batches=[1])
markers = ['TDP43', 'DCP1A']
alyssaCoyne_new_counts = get_counts(
    alyssaCoyne_new_df,
    markers=markers,
    combine_reps=False,
)
print(alyssaCoyne_new_counts)
# Summary statistics of the per-replicate counts.
alyssaCoyne_new_counts.describe()


   marker             cellline_condition   rep  count
0   DCP1A            C9-CS2YNL_Untreated  rep1     90
1   DCP1A            C9-CS2YNL_Untreated  rep2     81
2   DCP1A            C9-CS7VCZ_Untreated  rep1     57
3   DCP1A            C9-CS7VCZ_Untreated  rep2     65
4   DCP1A            C9-CS8RFT_Untreated  rep1     72
5   DCP1A            C9-CS8RFT_Untreated  rep2     63
6   DCP1A          Ctrl-EDi022_Untreated  rep1     57
7   DCP1A          Ctrl-EDi022_Untreated  rep2     68
8   DCP1A          Ctrl-EDi029_Untreated  rep1     65
9   DCP1A          Ctrl-EDi029_Untreated  rep2     62
10  DCP1A          Ctrl-EDi037_Untreated  rep1     40
11  DCP1A          Ctrl-EDi037_Untreated  rep2     47
12  DCP1A  SALSNegative-CS0ANK_Untreated  rep1     42
13  DCP1A  SALSNegative-CS0ANK_Untreated  rep2     48
14  DCP1A  SALSNegative-CS0JPP_Untreated  rep1     38
15  DCP1A  SALSNegative-CS0JPP_Untreated  rep2     36
16  DCP1A  SALSNegative-CS6ZU8_Untreated  rep1     44
17  DCP1A  SALSNegative-CS6Z

Unnamed: 0,count
count,48.0
mean,57.979167
std,16.188372
min,32.0
25%,43.5
50%,61.0
75%,68.25
max,90.0


In [77]:
# AlyssaCoyne_new, batch 1: per-replicate counts of the organelle markers,
# restricted to the C9 and Ctrl lines listed below.
alyssaCoyne_new_df = get_df("AlyssaCoyne_new", batches=[1])
markers = [
    'EEA1', 'GM130', 'NCL', 'LAMP1', 'CLTC',
    'NEMO', 'Phalloidin', 'POM121', 'Calreticulin',
]
alyssaCoyne_new_counts = get_counts(
    alyssaCoyne_new_df,
    markers=markers,
    cell_lines_conditions=[
        'C9-CS2YNL_Untreated',
        'C9-CS7VCZ_Untreated',
        'C9-CS8RFT_Untreated',
        'Ctrl-EDi022_Untreated',
        'Ctrl-EDi029_Untreated',
        'Ctrl-EDi037_Untreated',
    ],
    combine_reps=False,
)
print(alyssaCoyne_new_counts)
# Summary statistics of the per-replicate counts.
alyssaCoyne_new_counts.describe()



        marker     cellline_condition   rep  count
0         CLTC    C9-CS2YNL_Untreated  rep1     59
1         CLTC    C9-CS2YNL_Untreated  rep2     54
2         CLTC    C9-CS7VCZ_Untreated  rep1     68
3         CLTC    C9-CS7VCZ_Untreated  rep2     70
4         CLTC    C9-CS8RFT_Untreated  rep1     62
..         ...                    ...   ...    ...
91  Phalloidin  Ctrl-EDi022_Untreated  rep2     73
92  Phalloidin  Ctrl-EDi029_Untreated  rep1     51
93  Phalloidin  Ctrl-EDi029_Untreated  rep2     31
94  Phalloidin  Ctrl-EDi037_Untreated  rep1     30
95  Phalloidin  Ctrl-EDi037_Untreated  rep2     32

[96 rows x 4 columns]


Unnamed: 0,count
count,96.0
mean,57.489583
std,14.733417
min,27.0
25%,47.5
50%,57.5
75%,67.0
max,93.0


# NeuronsDay8_new

In [None]:
# neuronsDay8_new, FUS marker: counts per cell line and batch (reps pooled)
# for batches 1, 2, 3, 8 and 9.
neuronsDay8_new_df = get_df("neuronsDay8_new", batches=[1, 2, 3, 8, 9])
markers = ['FUS']
cell_lines_conditions = [
    'TBK1_Untreated',
    'OPTN_Untreated',
    'FUSHomozygous_Untreated',
    'TDP43_Untreated',
    'FUSRevertant_Untreated',
]
neuronsDay8_new_counts = get_counts(
    neuronsDay8_new_df,
    markers=markers,
    cell_lines_conditions=cell_lines_conditions,
    combine_batches=False,
)

# Summary statistics of the per-batch counts.
neuronsDay8_new_counts.describe()

Unnamed: 0,count
count,25.0
mean,4802.28
std,1238.382632
min,2459.0
25%,3844.0
50%,5017.0
75%,5689.0
max,7111.0


In [None]:
# neuronsDay8_new, LSM14A marker: counts per cell line and batch (reps pooled)
# for batches 1, 2, 3, 7, 8 and 9.
neuronsDay8_new_df = get_df("neuronsDay8_new", batches=[1, 2, 3, 7, 8, 9])
markers = ['LSM14A']
cell_lines_conditions = ['TBK1_Untreated', 'OPTN_Untreated', 'TDP43_Untreated']
neuronsDay8_new_counts = get_counts(
    neuronsDay8_new_df,
    markers=markers,
    cell_lines_conditions=cell_lines_conditions,
    combine_batches=False,
)

# Only the last expression of a cell is rendered, so the bare
# `neuronsDay8_new_counts` statement that used to sit here never displayed
# anything and was removed. Show the full table in its own cell if needed.
neuronsDay8_new_counts.describe()

Unnamed: 0,count
count,18.0
mean,4613.388889
std,936.992318
min,3305.0
25%,3990.75
50%,4486.5
75%,5202.75
max,6625.0


In [67]:
# neuronsDay8_new, LAMP1 marker: counts per cell line and batch (reps pooled)
# for batches 1, 2, 3, 7, 8 and 9.
neuronsDay8_new_df = get_df("neuronsDay8_new", batches=[1, 2, 3, 7, 8, 9])
markers = ['LAMP1']
cell_lines_conditions = ['TBK1_Untreated', 'OPTN_Untreated', 'FUSHomozygous_Untreated']
neuronsDay8_new_counts = get_counts(
    neuronsDay8_new_df,
    markers=markers,
    cell_lines_conditions=cell_lines_conditions,
    combine_batches=False,
)

# Only the last expression of a cell is rendered, so the bare
# `neuronsDay8_new_counts` statement that used to sit here never displayed
# anything and was removed. Show the full table in its own cell if needed.
neuronsDay8_new_counts.describe()

Unnamed: 0,count
count,18.0
mean,1799.777778
std,662.376875
min,619.0
25%,1182.75
50%,1901.5
75%,2362.75
max,2729.0
