In [None]:
import re
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn
import sklearn.decomposition
import sklearn.metrics
import sklearn.preprocessing


In [None]:
def add_total_intens(df, channels, name):
    """Add total-intensity columns (mean intensity times area) to ``df``.

    For each channel a column ``total_intens_{channel}_{name}`` is written,
    equal to ``Intensity_MeanIntensity_{channel} * AreaShape_Area``.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-object table; it is modified in place.
    channels : str or list of str
        One channel name or a list of channel names.
    name : str
        Suffix appended to the new column names.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    channel_list = [channels] if isinstance(channels, str) else channels
    for ch in channel_list:
        mean_col = f'Intensity_MeanIntensity_{ch}'
        df[f'total_intens_{ch}_{name}'] = df[mean_col].mul(df['AreaShape_Area'])
    return df

def merge_reduced(result_df, df, map_cols, reduction, how='inner'):
    """Aggregate ``df`` per cell and merge the aggregate into ``result_df``.

    Rows of ``df`` are grouped by ``ImageNumber`` and ``Parent_DilatedGC``, the
    columns named by the keys of ``map_cols`` are reduced with ``reduction``
    (e.g. ``'sum'``, ``'mean'``, ``'count'``), renamed to the corresponding
    values of ``map_cols`` and merged onto ``result_df`` on the two group keys.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Table to extend; must contain ``ImageNumber`` and ``Parent_DilatedGC``.
    df : pandas.DataFrame
        Per-object table providing the columns to reduce.
    map_cols : dict
        Maps input column name to output column name.
    reduction : str or callable
        Passed to ``GroupBy.aggregate``.
    how : str, default 'inner'
        Merge type passed to ``DataFrame.merge``.

    Returns
    -------
    pandas.DataFrame
        The merged table (a new frame).
    """
    keys = ['ImageNumber', 'Parent_DilatedGC']
    per_cell = df.groupby(keys)[list(map_cols)].aggregate(reduction)
    per_cell = per_cell.reset_index().rename(columns=map_cols)
    return result_df.merge(per_cell, on=keys, how=how)

def build_initial_data(directory, common, regex):
    """Build the base per-cell table from ``InitialGC.csv`` and ``Image.csv``.

    The result has one row per distinct ``(ImageNumber, Parent_DilatedGC)``
    pair found in ``InitialGC.csv``. ``Parent_DilatedGC`` is exposed as
    ``CellNumber``; the original ``Parent_DilatedGC`` column is also kept
    because the later per-cell merges join on it.

    Parameters
    ----------
    directory : pathlib.Path
        Folder containing ``InitialGC.csv`` and ``Image.csv``.
    common : list of str
        Columns to read from ``InitialGC.csv``.
    regex : str or None
        If truthy, applied to ``Metadata_FileLocation`` with ``str.extract``;
        the captured groups become extra columns.

    Returns
    -------
    pandas.DataFrame
        Cell identifiers, image metadata (plus regex groups) and ``Count_GC``,
        the number of ``InitialGC.csv`` rows per cell.
    """
    gc = pd.read_csv(directory / 'InitialGC.csv', usecols=common)

    # image metadata, mainly file location
    images = pd.read_csv(
        directory / 'Image.csv',
        usecols=['Metadata_FileLocation', 'ImageNumber', 'Metadata_Series'],
    )
    if regex:
        # filename-derived annotations (time, treatment, etc.)
        parsed = images['Metadata_FileLocation'].str.extract(regex)
        images = images.join(parsed)

    # one row per (image, cell); the GC parent id is renamed to CellNumber
    cell_keys = gc[['ImageNumber', 'Parent_DilatedGC']].drop_duplicates()
    result = cell_keys.rename(columns={'Parent_DilatedGC': 'CellNumber'})
    result = result.merge(images, on='ImageNumber', how="left")

    # number of GC objects per cell; joined via CellNumber because the key
    # column was renamed above
    gc_counts = (
        gc.groupby(['ImageNumber', 'Parent_DilatedGC'])['AreaShape_Area']
        .count()
        .rename('Count_GC')
        .reset_index()
    )
    return result.merge(
        gc_counts,
        left_on=['ImageNumber', 'CellNumber'],
        right_on=['ImageNumber', 'Parent_DilatedGC'],
    )

def add_initial_gc(result, directory, common, dfc_intens=False):
    """Add per-cell GC shape, position, area and intensity summaries.

    Reads ``InitialGC.csv`` and merges onto ``result``:

    * means over a cell's GC objects: ``mean_GC_eccentricity``,
      ``mean_GC_circularity`` (``4*pi*area / perimeter**2``), ``center_x``,
      ``center_y``;
    * sums over a cell's GC objects: ``total_GC_intens``,
      ``total_Probe_intens_GC``, ``GC_area`` (and ``total_DFC_intens_GC``
      when ``dfc_intens`` is true);
    * area-weighted mean intensities ``mean_mean_*_intens`` = total / area.

    Parameters
    ----------
    result : pandas.DataFrame
        Per-cell table with ``ImageNumber`` and ``Parent_DilatedGC``.
    directory : pathlib.Path
        Folder containing ``InitialGC.csv``.
    common : list of str
        Identifier/area columns read alongside the measurements.
    dfc_intens : bool, default False
        Also read and summarise the DFC channel intensity.

    Returns
    -------
    pandas.DataFrame
    """
    dfc_channels = ['DFC',] if dfc_intens else []
    dfc_columns = ['Intensity_MeanIntensity_DFC',] if dfc_intens else []
    measurement_cols = [
        'Intensity_MeanIntensity_GC', 'Intensity_MeanIntensity_Probe',
        'AreaShape_Eccentricity', 'AreaShape_Perimeter',
        'Location_CenterMassIntensity_X_GC', 'Location_CenterMassIntensity_Y_GC',
    ]
    gc = pd.read_csv(
        directory / 'InitialGC.csv',
        usecols=common + measurement_cols + dfc_columns,
    )

    # intensities may differ between folders -> normalize to control per folder
    gc = add_total_intens(gc, ['GC', 'Probe'] + dfc_channels, 'GC')

    # circularity = 4 pi area / perimeter ** 2
    gc['AreaShape_Cicularity'] = 4 * np.pi * gc['AreaShape_Area'] / gc['AreaShape_Perimeter'] ** 2

    # shape and position: plain mean over a cell's GC objects
    averaged = {
        'AreaShape_Eccentricity': 'mean_GC_eccentricity',
        'AreaShape_Cicularity': 'mean_GC_circularity',
        'Location_CenterMassIntensity_X_GC': 'center_x',
        'Location_CenterMassIntensity_Y_GC': 'center_y',
    }
    result = merge_reduced(result, gc, averaged, 'mean')

    # intensity and area: summed over a cell's GC objects
    summed = {
        'total_intens_GC_GC': 'total_GC_intens',
        'total_intens_Probe_GC': 'total_Probe_intens_GC',
        'AreaShape_Area': 'GC_area',
    }
    if dfc_intens:
        summed['total_intens_DFC_GC'] = 'total_DFC_intens_GC'
    result = merge_reduced(result, gc, summed, 'sum')

    gc_area = result['GC_area']
    result['mean_mean_GC_intens'] = result['total_GC_intens'] / gc_area
    result['mean_mean_Probe_intens'] = result['total_Probe_intens_GC'] / gc_area
    if dfc_intens:
        result['mean_mean_DFC_intens'] = result['total_DFC_intens_GC'] / gc_area

    return result

def add_initial_fc(result, directory, common):
    """Add per-cell FC counts, areas, intensities and density.

    Reads ``InitialFC.csv`` and ``ExtraNucleolarFCs.csv`` and merges onto
    ``result``: ``Count_FC``, ``total_FC_intens``, ``total_Probe_intens_FC``,
    ``FC_area``, ``FC_density`` (``Count_FC / GC_area``) and
    ``Count_Nucleoplasmic_FC`` (0 for cells with no extra-nucleolar FC rows).

    ``result`` must already contain ``GC_area`` (added by ``add_initial_gc``).

    Parameters
    ----------
    result : pandas.DataFrame
        Per-cell table with ``ImageNumber``, ``Parent_DilatedGC``, ``GC_area``.
    directory : pathlib.Path
        Folder containing the two CSV files.
    common : list of str
        Identifier/area columns read alongside the measurements.

    Returns
    -------
    pandas.DataFrame
    """
    # FC intensities and positions (positions are read but not used yet)
    fc = pd.read_csv(
        directory / 'InitialFC.csv',
        usecols=common + ['Intensity_MeanIntensity_FC', 'Intensity_MeanIntensity_Probe',
                          'Location_CenterMassIntensity_X_FC', 'Location_CenterMassIntensity_Y_FC',],
    )

    # total FC-channel and Probe-channel intensity inside each FC object
    fc = add_total_intens(fc, ['FC', 'Probe'], 'FC')

    # number of FCs per cell
    result = merge_reduced(result, fc, {"AreaShape_Area": "Count_FC"}, 'count')

    # The original dict repeated the key 'total_intens_FC_FC', so the FC total
    # was dropped and the "Probe" column actually held FC-channel intensity.
    map_cols = {
        'total_intens_FC_FC': 'total_FC_intens',
        'total_intens_Probe_FC': 'total_Probe_intens_FC',
        'AreaShape_Area': 'FC_area',
    }
    result = merge_reduced(result, fc, map_cols, 'sum')
    result['FC_density'] = result['Count_FC'] / result['GC_area']

    # FCs outside of the GC; left merge so cells without any are kept
    extra_fc = pd.read_csv(
        directory / 'ExtraNucleolarFCs.csv',
        usecols=common,
    )
    result = merge_reduced(result, extra_fc, {"AreaShape_Area": "Count_Nucleoplasmic_FC"}, 'count', how='left')
    result['Count_Nucleoplasmic_FC'] = result['Count_Nucleoplasmic_FC'].fillna(0)

    return result


def add_rim(result, directory, common, dfc, bins=1, total=10):
    """Add per-cell rim-enrichment scores from the radial-distribution columns.

    For each GC object the ``RadialDistribution_FracAtD_*_{k}of{total}`` values
    of the outermost ``bins`` bins (``k = total, total-1, ...``) are summed.
    ``fc_rim_enrichment`` is the FC sum divided by the ``GCObjectImage`` sum;
    ``dfc_rim_enrichment`` is the same for DFC when ``dfc`` is true. The
    per-object scores are averaged per cell and merged onto ``result``.

    Parameters
    ----------
    result : pandas.DataFrame
        Per-cell table with ``ImageNumber`` and ``Parent_DilatedGC``.
    directory : pathlib.Path
        Folder containing ``InitialGC.csv``.
    common : list of str
        Identifier/area columns read alongside the measurements.
    dfc : bool
        Whether DFC radial columns are present and should be used.
    bins : int, default 1
        Number of outermost radial bins treated as the rim.
    total : int, default 10
        Total number of radial bins in the column names.

    Returns
    -------
    pandas.DataFrame
    """
    gc_path = directory / 'InitialGC.csv'
    # Let pandas parse the header: no file handle is left open and the last
    # column name does not carry a trailing newline (a raw readline/split does).
    header = pd.read_csv(gc_path, nrows=0).columns
    radial_cols = [c for c in header if c.startswith('RadialDistribution_FracAtD')]
    if not dfc:
        radial_cols = [c for c in radial_cols if 'DFC' not in c]
    distributions = pd.read_csv(gc_path, usecols=common + radial_cols)

    # outermost bins first: total, total-1, ...
    rim_bins = list(range(total, total - bins, -1))

    def rim_fraction(channel):
        """Sum the rim-bin FracAtD columns of one channel for every object."""
        names = [f'RadialDistribution_FracAtD_{channel}_{b}of{total}' for b in rim_bins]
        return distributions[names].sum(axis=1)

    relative_areas = rim_fraction('GCObjectImage')
    distributions['fc_rim_enrichment'] = rim_fraction('FC') / relative_areas
    map_cols = {'fc_rim_enrichment': 'fc_rim_enrichment'}
    if dfc:
        distributions['dfc_rim_enrichment'] = rim_fraction('DFC') / relative_areas
        map_cols['dfc_rim_enrichment'] = 'dfc_rim_enrichment'

    return merge_reduced(result, distributions, map_cols, 'mean')

def add_correlation(result, directory, common):
    """Add per-cell, area-weighted correlation and overlap measures.

    Every ``Correlation_Correlation*`` / ``Correlation_Overlap*`` column of
    ``CombinedObjects.csv`` is multiplied by the object's ``AreaShape_Area``,
    summed per cell and divided by the cell's summed area
    (``combined_area``). Output columns drop the leading ``Correlation_``
    prefix, e.g. ``Correlation_Correlation_X`` becomes ``Correlation_X``.

    Parameters
    ----------
    result : pandas.DataFrame
        Per-cell table with ``ImageNumber`` and ``Parent_DilatedGC``.
    directory : pathlib.Path
        Folder containing ``CombinedObjects.csv``.
    common : list of str
        Identifier/area columns read alongside the measurements.

    Returns
    -------
    pandas.DataFrame
    """
    combined_path = directory / 'CombinedObjects.csv'
    # Let pandas parse the header: no file handle is left open and the last
    # column name does not carry a trailing newline (a raw readline/split does).
    header = pd.read_csv(combined_path, nrows=0).columns
    cols = [c for c in header
            if c.startswith(('Correlation_Correlation', 'Correlation_Overlap'))]
    corr = pd.read_csv(combined_path, usecols=common + cols)

    # weight every measure by the object's area
    corr[cols] *= corr['AreaShape_Area'].to_numpy()[:, None]

    # sum per cell; strip the leading 'Correlation_' from the output names
    prefix = 'Correlation_'
    map_cols = {c: c[len(prefix):] for c in cols}
    map_cols['AreaShape_Area'] = 'combined_area'
    result = merge_reduced(result, corr, map_cols, 'sum')

    # divide by the total area to finish the weighted mean
    measure_cols = [c for c in map_cols.values() if c != 'combined_area']
    result[measure_cols] /= result['combined_area'].to_numpy()[:, None]
    return result
    
def read_data(directory, regex=None, dfc=True, bins=4, dfc_intens=False, total=20):
    """Assemble the per-cell feature table for one output folder.

    Runs, in order, ``build_initial_data``, ``add_initial_gc``,
    ``add_initial_fc``, ``add_rim`` and ``add_correlation`` and returns the
    result without the internal ``Parent_DilatedGC`` join key.

    Parameters
    ----------
    directory : str or pathlib.Path
        Folder with the CSV outputs (``InitialGC.csv``, ``Image.csv``, ...).
    regex : str or None, default None
        Pattern with named groups applied to ``Metadata_FileLocation``.
    dfc : bool, default True
        Whether DFC radial-distribution columns are available.
    bins : int, default 4
        Number of outermost radial bins used for rim enrichment.
    dfc_intens : bool, default False
        Also summarise DFC intensity inside the GC.
    total : int, default 20
        Total number of radial bins in the radial-distribution column names
        (previously hard-coded to 20 inside this function).

    Returns
    -------
    pandas.DataFrame
        One row per cell.
    """
    directory = Path(directory)
    # Image and object number are unique identifiers. Area is used a lot, and
    # Parent_DilatedGC should correspond to a single cell.
    common = ['ImageNumber', 'ObjectNumber', 'AreaShape_Area', 'Parent_DilatedGC']

    result = build_initial_data(directory, common, regex)
    result = add_initial_gc(result, directory, common, dfc_intens)
    result = add_initial_fc(result, directory, common)
    result = add_rim(result, directory, common, dfc, bins, total=total)
    result = add_correlation(result, directory, common)

    # TODO: fc position (stringyness?, graph morphology of fcs)
    return result.drop(columns='Parent_DilatedGC')

# Load the 240820 FISH experiment; the probe name is parsed from the filename.
full_data = read_data(
    'morphology/240820_FISH/outputs',
    regex=r'/.*_10A_(?P<probe>[^_0]+)(?:\d{3}\d?)?.nd2',
    bins=4,
)
# Files contributing rows with any missing value (not shown: only the last
# expression of a cell is displayed).
full_data.loc[full_data.isna().any(axis=1), 'Metadata_FileLocation'].unique()
# Probes that were recognised.
full_data['probe'].unique()

## Missing Fib data

In [None]:
# Fib 15c control folder: treatment and time come from the filename.
data = read_data(
    'morphology/Fib_15c_control/outputs',
    regex=r'/[A-G]\d+_MCF10A(?P<treatment>FibKD|ctl)_30p(?P<time>\d+)c.*nd2',
).assign(exp='Fib')
data.to_csv('fib_15c_control.csv')

# Initial analyses

In [None]:
# CX EU folder read without DFC radial columns.
dat = read_data(
    'morphology/CX_EU/outputs',
    regex=r'/10A_(?P<treatment>CX|ctl)_15p(?P<time>\d+)c.*nd2',
    dfc=False,
)

# Files contributing rows with any missing value.
rows_with_nan = dat.isna().any(axis=1)
dat.loc[rows_with_nan, 'Metadata_FileLocation'].unique()
# dat['Metadata_FileLocation'].unique()
# dat.loc[dat.isna().any(axis=1)].T

In [None]:
# need to parse out 5' ETS from CX
cx = read_data('morphology/CX/outputs', r'/[A-G]\d+_(?P<treatment>CX)(?P<time>\d+)min_(?P<probe>Junction1|18S|45S|Junction2|Junction4|ITS1|28S).*nd2').assign(exp='CX')
# Rows whose filename did not match the probe regex have no parsed probe.
# Test that column only: a NaN in any other column (e.g. a correlation) must
# not relabel a correctly parsed row as 5'ETS and overwrite its time.
missing = cx['probe'].isna()
parsed = cx.loc[missing, 'Metadata_FileLocation'].str.extract(r'/.*ETS_CX(?P<time>\d+)min.*nd2')
cx.loc[missing, 'treatment'] = 'CX'
cx.loc[missing, 'probe'] = '5ETS'
cx.loc[missing, 'time'] = parsed['time']

# One frame per experiment, tagged with `exp`, stacked into a single table.
# NOTE(review): CX_EU is read here with the default dfc=True, while the check
# cell reads the same folder with dfc=False; confirm which is intended.
most_data = pd.concat([
    cx,
    read_data('morphology/Fib/outputs', r'/[A-G]\d+_MCF10A(?P<treatment>FibKD|ctl)_30p(?P<time>\d+)c.*nd2').assign(exp='Fib'),
    read_data('morphology/FVP_FISH/outputs', r'/[A-G]\d+_(?P<treatment>FVP)(?P<time>\d+)min_.*_(?P<probe>Junction1|18S|45S|Junction2|Junction4|ITS1|28S|3ETStile|ITS2b).*nd2').assign(exp='FVP_FISH'),
    read_data('morphology/FVP_EU/outputs', r'/[A-G]\d+_(?P<treatment>FVP|DMSO)_15p(?P<time>\d+)c.*nd2').assign(exp='FVP_EU'),
    read_data('morphology/RPL5/outputs', r'/[A-G]\d+_(?P<treatment>SCR|RPL5KD)_15p(?P<time>\d+)c.*nd2').assign(exp='RPL5'),
    read_data('morphology/RPL5_FISH/outputs', r'/[A-G]\d+_(?P<treatment>RPL5KD|SCR)_(?P<probe>Junction1|18S|45S|Junction2|Junction4|ITS1|28S|3ETStile|ITS2b|5ETS|5S|Junction3).*nd2').assign(exp='RPL5_FISH'),
    read_data('morphology/U3U8_EU/outputs', r'/[A-G]\d+__?(?P<treatment>U3ASO|ctlASO|U8ASO)_15p(?P<time>\d+)c.*nd2').assign(exp='U3U8_EU'),
    read_data('morphology/U3U8/outputs', r'/[A-G]\d+_(?P<treatment>U3ASO|SCR|U8ASO)_(?P<probe>U3FISH|U8FISH).*nd2').assign(exp='U3U8'),
    read_data('morphology_nodfc/FVP_nodfc/outputs', r'/[A-G]\d+_(?P<treatment>DMSO|FVP2uM)_15p(?P<time>\d+)c.*nd2', dfc=False).assign(exp='FVP_nodfc'),
    read_data('morphology/CX_EU/outputs', r'/10A_(?P<treatment>CX|ctl)_15p(?P<time>\d+)c.*nd2').assign(exp='CX_EU'),
]).reset_index(drop=True)

In [None]:
# Build full_data as a new frame instead of aliasing most_data and mutating it
# in place, so the loader output stays untouched and this cell is idempotent.
# `time` is parsed from filenames as text; convert it to float (NaN where absent).
full_data = most_data.assign(time=most_data['time'].astype(float))

In [None]:
# full_data.to_csv('morphology.csv', index=False)
# number of cells per experiment and treatment
full_data.groupby(['exp', 'treatment'])['Count_GC'].count()

## Look at untreated cells

In [None]:
# Control/untreated cells from every experiment.
df = full_data[full_data.treatment.isin(('DMSO', 'SCR', 'ctlASO', 'ctl',))].copy()
# Features to plot: skip totals, centroids, filename and Probe-channel columns
# and the annotation columns. ('treatement' was a typo, so the treatment column
# was not actually excluded by name.)
toplot = [c for c in df.columns[6:]
          if not c.startswith(('total', 'center', 'filename'))
          and 'Probe' not in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]
print(df.groupby(['treatment', 'exp']).CellNumber.count())

# grid of ECDFs, one panel per feature, one curve per experiment
n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    sns.ecdfplot(data=df, x=col, hue='exp', ax=ax)

## CX

In [None]:
# CX experiment: feature distributions per time point.
df = full_data[full_data.exp == 'CX'].copy()
df['time'] = df['time'].astype(int)
# Features to plot ('treatement' was a typo, so the treatment column was not
# actually excluded by name).
toplot = [c for c in df.columns[6:]
          if not c.startswith(('total', 'center'))
          and 'median' not in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]
print(df.groupby(['exp', 'treatment', 'time']).CellNumber.count())

# grid of ECDFs, one panel per feature, one curve per time point
n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    sns.ecdfplot(data=df, x=col, hue='time', ax = ax)

## CX EU

In [None]:
# CX EU experiment: feature distributions per treatment and time point.
df = full_data[full_data.exp == 'CX_EU'].copy()
df['time'] = df['time'].astype(int)
# Features to plot ('treatement' was a typo, so the treatment column was not
# actually excluded by name).
toplot = [c for c in df.columns[6:]
          if not c.startswith(('total', 'center'))
          and 'median' not in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]
print(df.groupby(['treatment', 'time']).CellNumber.count())

n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    # This experiment's treatments are 'CX' and 'ctl' (see its filename regex);
    # the copied 'FibKD' label selected no rows here.
    for treatment, style in zip(('CX', 'ctl'), ('-', '--')):
        sns.ecdfplot(data=df[df.treatment == treatment], x=col, hue='time', linestyle=style, ax=ax, label=treatment)

# Probe correlation with each compartment over time, split by treatment.
# (RWC_*_Probe columns were tried as an alternative y.)
fig, axes = plt.subplots()
for region in ('FC', 'DFC', 'GC'):
    sns.lineplot(data=df, x='time', y=f'Correlation_{region}_Probe', label=f'{region}_Probe', style='treatment', ax=axes)
fig.savefig('EU_CX.pdf')

## Fib

In [None]:
# Fib experiment: feature distributions per treatment and time point.
df = full_data[full_data.exp == 'Fib'].copy()
df['time'] = df['time'].astype(int)
# Features to plot ('treatement' was a typo, so the treatment column was not
# actually excluded by name).
toplot = [c for c in df.columns[6:]
          if not c.startswith(('total', 'center'))
          and 'median' not in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]
print(df.groupby(['treatment', 'time']).CellNumber.count())

n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    # solid = knockdown, dashed = control
    for treatment, style in zip(('FibKD', 'ctl'), ('-', '--')):
        sns.ecdfplot(data=df[df.treatment == treatment], x=col, hue='time', linestyle=style, ax=ax, label=treatment)

# Probe correlation with each compartment over time, split by treatment.
# (RWC_*_Probe columns were tried as an alternative y.)
fig, axes = plt.subplots()
for region in ('FC', 'DFC', 'GC'):
    sns.lineplot(data=df, x='time', y=f'Correlation_{region}_Probe', label=f'{region}_Probe', style='treatment', ax=axes)
fig.savefig('EU_fib.pdf')

## FVP EU

In [None]:
# FVP EU experiment: feature distributions per treatment and time point.
df = full_data[full_data.exp == 'FVP_EU'].copy()
# drop some data from another experiment...
df = df[~df.time.isna()]
df['time'] = df['time'].astype(int)
# Features to plot ('treatement' was a typo, so the treatment column was not
# actually excluded by name).
toplot = [c for c in df.columns[6:]
          if not c.startswith(('total', 'center'))
          and 'median' not in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]
print(df.groupby(['exp', 'treatment', 'time']).CellNumber.count())

n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    # solid = FVP, dotted = DMSO control
    for treatment, style in zip(('FVP', 'DMSO'), ('-', 'dotted')):
        sns.ecdfplot(data=df[df.treatment == treatment], x=col, hue='time', linestyle=style, ax=ax, label=treatment)

# Probe correlation with each compartment over time, split by treatment.
fig, axes = plt.subplots()
for region in ('FC', 'DFC', 'GC'):
    sns.lineplot(data=df, x='time', y=f'Correlation_{region}_Probe', label=f'{region}_Probe', style='treatment', ax=axes)
fig.savefig('EU_fvp.pdf')

## FVP no DFC

In [None]:
# FVP experiment imaged without a DFC channel.
df = full_data[full_data.exp == 'FVP_nodfc'].copy()
df['time'] = df['time'].astype(int)
# Features to plot; DFC-derived columns are skipped for this experiment.
# ('treatement' was a typo, so the treatment column was not actually excluded
# by name.)
toplot = [c for c in df.columns[6:]
          if not c.startswith(('total', 'center'))
          and 'DFC' not in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]
print(df.groupby(['exp', 'treatment', 'time']).CellNumber.count())

n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    # solid = FVP 2uM, dotted = DMSO control
    for treatment, style in zip(('FVP2uM', 'DMSO'), ('-', 'dotted')):
        sns.ecdfplot(data=df[df.treatment == treatment], x=col, hue='time', linestyle=style, ax=ax, label=treatment)

# Probe correlation with FC and GC over time, split by treatment.
fig, axes = plt.subplots()
for region in ('FC', 'GC'):
    sns.lineplot(data=df, x='time', y=f'Correlation_{region}_Probe', label=f'{region}_Probe', style='treatment', ax=axes)
fig.savefig('EU_fvp_nodfc.pdf')

## FVP FISH

In [None]:
# FVP FISH experiment: feature distributions per time point and probe.
df = full_data[full_data.exp == 'FVP_FISH'].copy()
df['time'] = df['time'].astype(int)
# Features to plot ('treatement' was a typo, so the treatment column was not
# actually excluded by name).
toplot = [c for c in df.columns[6:]
          if not c.startswith(('total', 'center'))
          and 'median' not in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]
print(df.groupby(['exp', 'treatment', 'time', 'probe']).CellNumber.count())

n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    # NOTE(review): the FVP_FISH filename regex only captures treatment 'FVP',
    # so the 'DMSO' subset is presumably empty here; confirm.
    for treatment, style in zip(('FVP', 'DMSO'), ('-', 'dotted')):
        sns.ecdfplot(data=df[df.treatment == treatment], x=col, hue='time', linestyle=style, ax=ax, label=treatment)

# Probe correlation with each compartment over time, one line style per probe.
fig, axes = plt.subplots()
for region in ('FC', 'DFC', 'GC'):
    sns.lineplot(data=df, x='time', y=f'Correlation_{region}_Probe', label=f'{region}_Probe', style='probe', ax=axes)

## RPL5

In [None]:
# RPL5 experiment: feature distributions per treatment and time point.
df = full_data[full_data.exp == 'RPL5'].copy()
df['time'] = df['time'].astype(int)
# Features to plot ('treatement' was a typo, so the treatment column was not
# actually excluded by name).
toplot = [c for c in df.columns[6:]
          if not c.startswith(('total', 'center'))
          and 'median' not in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]
print(df.groupby(['treatment', 'time']).CellNumber.count())

n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    # solid = knockdown, dashed = scrambled control
    for treatment, style in zip(('RPL5KD', 'SCR'), ('-', '--')):
        sns.ecdfplot(data=df[df.treatment == treatment], x=col, hue='time', linestyle=style, ax=ax, label=treatment)

# Probe correlation with each compartment over time, split by treatment.
# (RWC_*_Probe columns were tried as an alternative y.)
fig, axes = plt.subplots()
for region in ('FC', 'DFC', 'GC'):
    sns.lineplot(data=df, x='time', y=f'Correlation_{region}_Probe', label=f'{region}_Probe', style='treatment', ax=axes)
fig.savefig('EU_rpl5.pdf')

## RPL5_FISH

In [None]:
# RPL5 FISH experiment: feature distributions per treatment and probe.
# (A stray bare `df` expression, which displayed nothing mid-cell, was removed.)
df = full_data[full_data.exp == 'RPL5_FISH'].copy()
# Features to plot ('treatement' was a typo, so the treatment column was not
# actually excluded by name).
toplot = [c for c in df.columns[6:]
          if not c.startswith(('total', 'center'))
          and 'median' not in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]
print(df.groupby(['treatment', 'probe']).CellNumber.count())

n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    # solid = knockdown, dashed = scrambled control
    for treatment, style in zip(('RPL5KD', 'SCR'), ('-', '--')):
        sns.ecdfplot(data=df[df.treatment == treatment], x=col, hue='probe', linestyle=style, ax=ax, label=treatment)

# One bar chart per compartment: probe correlation by probe and treatment.
fig, axes = plt.subplots()
sns.barplot(data=df, x='probe', y='Correlation_FC_Probe', hue='treatment')
plt.xticks(rotation=60)
fig, axes = plt.subplots()
sns.barplot(data=df, x='probe', y='Correlation_DFC_Probe', hue='treatment')
plt.xticks(rotation=60)
fig, axes = plt.subplots()
sns.barplot(data=df, x='probe', y='Correlation_GC_Probe', hue='treatment')
plt.xticks(rotation=60)

## U8 with FISH probe

In [None]:
# Cells imaged with the U8 FISH probe.
df = full_data[full_data.probe == 'U8FISH'].copy()
# Features to plot ('treatement' was a typo, so the treatment column was not
# actually excluded by name).
toplot = [c for c in df.columns[6:]
          if not c.startswith(('total', 'center'))
          and 'median' not in c
          # and not 'Probe' in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]
# Keep SCR controls plus cells whose mean probe intensity is below the
# knockdown threshold (0.002 for U8).
df = df[(df.mean_mean_Probe_intens < 0.002) | (df.treatment == 'SCR')]
print(df.groupby(['exp', 'treatment', 'probe']).CellNumber.count())

n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    sns.ecdfplot(data=df, x=col, hue='treatment', ax=ax)

# probe intensity of the non-control cells
fig, axes = plt.subplots()
sns.ecdfplot(data=df[df.treatment != 'SCR'], x='mean_mean_Probe_intens', hue='treatment', ax=axes)
axes.set_xlim(0, 0.0025)
# DFC rim enrichment by treatment
fig, axes = plt.subplots()
# sns.kdeplot(data=df, x='dfc_rim_enrichment', hue='treatment', ax=axes)
sns.ecdfplot(data=df, x='dfc_rim_enrichment', hue='treatment', ax=axes)

## U3 with FISH probe

In [None]:
# Cells imaged with the U3 FISH probe.
df = full_data[full_data.probe == 'U3FISH'].copy()
# Features to plot ('treatement' was a typo, so the treatment column was not
# actually excluded by name).
toplot = [c for c in df.columns[6:]
          if not c.startswith(('total', 'center'))
          and 'median' not in c
          # and not 'Probe' in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]
# Keep SCR controls plus cells whose mean probe intensity is below the
# knockdown threshold (0.005 for U3).
df = df[(df.mean_mean_Probe_intens < 0.005) | (df.treatment == 'SCR')]
print(df.groupby(['exp', 'treatment', 'probe']).CellNumber.count())

n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    sns.ecdfplot(data=df, x=col, hue='treatment', ax=ax)

# probe intensity of the non-control cells
fig, axes = plt.subplots()
sns.ecdfplot(data=df[df.treatment != 'SCR'], x='mean_mean_Probe_intens', hue='treatment', ax=axes)
# FC rim enrichment by treatment
fig, axes = plt.subplots()
sns.ecdfplot(data=df, x='fc_rim_enrichment', hue='treatment', ax=axes)

## U3 U8 EU

In [None]:
# U3/U8 ASO experiment with EU labelling.
df = full_data[full_data.exp == 'U3U8_EU'].copy()
# Features to plot ('treatement' was a typo, so the treatment column was not
# actually excluded by name).
toplot = [c for c in df.columns[6:]
          if not c.startswith(('total', 'center'))
          and 'median' not in c
          # and not 'Probe' in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]
print(df.groupby(['exp', 'treatment']).CellNumber.count())

n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    sns.ecdfplot(data=df, x=col, hue='treatment', ax=ax)

# FC rim enrichment density on its own figure (explicit axes instead of
# relying on the current-axes state).
rim_fig, rim_ax = plt.subplots()
sns.kdeplot(data=df, x='fc_rim_enrichment', hue='treatment', ax=rim_ax)

## Normalize each experiment by the "control" means

- CX: t=0 is no treatment and t=120 is a 2 hr treatment, so it makes sense to compare them; this should give the biggest changes.
- RPL5: all time points should show equal morphology changes, so all time points can be averaged for the morphology measurements (or pick one time point, if it has enough cells and the time points look close enough). For the probe channel, which is EU, we need its correlation with GC over time.
- FVP: there is a 1 hr pretreatment and morphology progresses with time. For morphology purposes, pick one time point and compare DMSO vs FVP, e.g. the 60 min time point.
- U3 and U8 should be two separate datasets, and SCR can be divided in two based on whether it belongs to U3 or U8. Knockdown thresholds: 0.002 for U8, 0.005 for U3.

In [None]:
# experiments present in the combined table
full_data['exp'].unique()

In [None]:
# normalize by control fold change
# Features to normalize: skip totals, centroids, Probe-channel columns and the
# annotation columns. ('treatement' was a typo, so the treatment column was not
# actually excluded by name.)
toplot = [c for c in full_data.columns[6:]
          if not c.startswith(('total', 'center'))
          and 'Probe' not in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]

def normalize(df, experiment, columns, loc_col, control, treated):
    """Return the treated rows of one experiment as fold change over control.

    Rows with ``exp == experiment`` are selected; for the rows where
    ``loc_col == treated``, every column in ``columns`` is divided by the mean
    of that column over the rows where ``loc_col == control``.

    The division is done on a copy of the treated rows and assigned as whole
    columns, so integer-typed columns (e.g. counts) become float directly
    instead of relying on silent upcasting through a ``.loc`` assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``exp`` column; not modified.
    experiment : str
        Value of ``exp`` to select.
    columns : list of str
        Numeric columns to normalize.
    loc_col : str
        Column that separates control from treated rows.
    control, treated : scalar
        Values of ``loc_col`` marking control and treated rows.

    Returns
    -------
    pandas.DataFrame
        The treated rows only, with ``columns`` replaced by fold changes.
    """
    experiment_rows = df[df.exp == experiment]
    control_mean = experiment_rows.loc[experiment_rows[loc_col] == control, columns].mean()
    treated_rows = experiment_rows[experiment_rows[loc_col] == treated].copy()
    treated_rows[columns] = treated_rows[columns].div(control_mean, axis='columns')
    return treated_rows

# Fold-change table: one normalize() call per experiment (control vs. treated).
# The U3 and U8 subsets keep SCR controls plus cells below the probe-intensity
# knockdown threshold and are relabelled as separate experiments.
normalized = pd.concat(
    [
        normalize(full_data, 'CX', toplot, loc_col='time', control=0, treated=120),
        normalize(full_data, 'Fib', toplot, loc_col='treatment', control='ctl', treated='FibKD'),
        normalize(
            full_data[full_data['time'] == 15],
            'FVP_EU', toplot, loc_col='treatment', control='DMSO', treated='FVP',
        ),
        normalize(
            full_data[(full_data['probe'] == 'U3FISH')
                      & ((full_data['mean_mean_Probe_intens'] < 0.005) | (full_data['treatment'] == 'SCR'))],
            'U3U8', toplot, loc_col='treatment', control='SCR', treated='U3ASO',
        ).assign(exp='U3'),
        normalize(
            full_data[(full_data['probe'] == 'U8FISH')
                      & ((full_data['mean_mean_Probe_intens'] < 0.002) | (full_data['treatment'] == 'SCR'))],
            'U3U8', toplot, loc_col='treatment', control='SCR', treated='U8ASO',
        ).assign(exp='U8'),
        normalize(full_data, 'RPL5', toplot, loc_col='treatment', control='SCR', treated='RPL5KD'),
    ],
    ignore_index=True,
)
normalized.to_csv('normalized_fold.csv', index=False)

# df = full_data[(full_data.exp == 'U3U8') & (full_data.probe == 'U3FISH') & ((full_data.mean_mean_Probe_intens < 0.005) | (full_data.treatment == 'SCR'))].copy()
# df.loc[df.treatment == 'FVP2uM', toplot] = np.log2(df.loc[df.treatment == 'FVP2uM', toplot] / df.loc[df.treatment == 'DMSO', toplot].mean())
# df = df[df.treatment == 'FVP2uM']
# df

normalized['exp'].unique()

In [None]:
# normalize by control z scores
# Features to normalize: skip totals, centroids, Probe-channel columns and the
# annotation columns. ('treatement' was a typo, so the treatment column was not
# actually excluded by name.)
toplot = [c for c in full_data.columns[6:]
          if not c.startswith(('total', 'center'))
          and 'Probe' not in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]

def normalize(df, experiment, columns, loc_col, control, treated):
    """Return the treated rows of one experiment as z-scores against control.

    Rows with ``exp == experiment`` are selected; for the rows where
    ``loc_col == treated``, every column in ``columns`` becomes
    ``(value - control mean) / control std``, both taken over the rows where
    ``loc_col == control``. This definition replaces any earlier ``normalize``
    in the notebook namespace.

    The arithmetic is done on a copy of the treated rows and assigned as whole
    columns, so integer-typed columns (e.g. counts) become float directly
    instead of relying on silent upcasting through a ``.loc`` assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``exp`` column; not modified.
    experiment : str
        Value of ``exp`` to select.
    columns : list of str
        Numeric columns to normalize.
    loc_col : str
        Column that separates control from treated rows.
    control, treated : scalar
        Values of ``loc_col`` marking control and treated rows.

    Returns
    -------
    pandas.DataFrame
        The treated rows only, with ``columns`` replaced by z-scores.
    """
    experiment_rows = df[df.exp == experiment]
    cntr = experiment_rows.loc[experiment_rows[loc_col] == control, columns]
    treated_rows = experiment_rows[experiment_rows[loc_col] == treated].copy()
    centered = treated_rows[columns].sub(cntr.mean(), axis='columns')
    treated_rows[columns] = centered.div(cntr.std(), axis='columns')
    return treated_rows

# Z-score table: one normalize() call per experiment (control vs. treated).
# The U3 and U8 subsets keep SCR controls plus cells below the probe-intensity
# knockdown threshold and are relabelled as separate experiments.
normalized = pd.concat(
    [
        normalize(full_data, 'CX', toplot, loc_col='time', control=0, treated=120),
        normalize(full_data, 'Fib', toplot, loc_col='treatment', control='ctl', treated='FibKD'),
        normalize(
            full_data[full_data['time'] == 15],
            'FVP_EU', toplot, loc_col='treatment', control='DMSO', treated='FVP',
        ),
        normalize(
            full_data[(full_data['probe'] == 'U3FISH')
                      & ((full_data['mean_mean_Probe_intens'] < 0.005) | (full_data['treatment'] == 'SCR'))],
            'U3U8', toplot, loc_col='treatment', control='SCR', treated='U3ASO',
        ).assign(exp='U3'),
        normalize(
            full_data[(full_data['probe'] == 'U8FISH')
                      & ((full_data['mean_mean_Probe_intens'] < 0.002) | (full_data['treatment'] == 'SCR'))],
            'U3U8', toplot, loc_col='treatment', control='SCR', treated='U8ASO',
        ).assign(exp='U8'),
        normalize(full_data, 'RPL5', toplot, loc_col='treatment', control='SCR', treated='RPL5KD'),
    ],
    ignore_index=True,
)
normalized.to_csv('normalized_z.csv', index=False)

# df = full_data[(full_data.exp == 'U3U8') & (full_data.probe == 'U3FISH') & ((full_data.mean_mean_Probe_intens < 0.005) | (full_data.treatment == 'SCR'))].copy()
# df.loc[df.treatment == 'FVP2uM', toplot] = np.log2(df.loc[df.treatment == 'FVP2uM', toplot] / df.loc[df.treatment == 'DMSO', toplot].mean())
# df = df[df.treatment == 'FVP2uM']
# df

normalized['exp'].unique()

In [None]:
# cells per treatment and time point in the FVP EU experiment
full_data[full_data['exp'] == 'FVP_EU'].groupby(['treatment', 'time'])['Count_FC'].count()

In [None]:
# Panel grid sized to hold one panel per normalized feature.
n_cols = 4
n_rows = -(-len(toplot) // n_cols)  # ceiling division

figsize = (5 * n_cols + 5, 5 * n_rows)

# ECDF of each normalized feature, one curve per experiment
fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flat):
    sns.ecdfplot(data=normalized, x=col, hue='exp', ax=ax)

# the same features as violins, one violin per experiment
fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flat):
    sns.violinplot(data=normalized, y=col, x='exp', ax=ax)

In [None]:
# colour per experiment (only needed if row_colors is re-enabled)
lut = dict(zip(normalized['exp'].unique(), sns.color_palette()))
row_colors = normalized['exp'].map(lut)

# Clustered heat map of the per-experiment median of every normalized feature
# (features as rows, experiments as columns); features with a missing median
# are dropped. Alternatives tried: mean instead of median, log2 of the
# fold-change medians, per-cell values clipped to [-10, 10].
sns.clustermap(
    data=normalized.groupby('exp')[toplot].median(numeric_only=False).dropna(axis=1).T,
    cmap='vlag',
    center=0,
    annot=True,
    # row_colors=row_colors,
)

# handles = [Patch(facecolor=lut[name]) for name in lut]
# plt.legend(handles, lut, title='Treatment',
#            bbox_to_anchor=(1, 1), bbox_transform=plt.gcf().transFigure, loc='upper right')

In [None]:
# qualitative heat map

lut = dict(zip(normalized.exp.unique(), sns.color_palette()))
row_colors = normalized.exp.map(lut)
# dat = np.log2(normalized.groupby('exp')[toplot].median(numeric_only=False)).replace(-np.inf, np.nan).dropna(axis=1).T.copy()
# cutoffs = [np.log2(1.25), np.log2(2), np.log2(4)]
medians = normalized.groupby('exp')[toplot].median(numeric_only=False).dropna(axis=1).T
cutoffs = [1, 2, 4]

# Signed level per cell: 0 when |x| < cutoffs[0], k when
# cutoffs[k-1] <= |x| < cutoffs[k], and len(cutoffs) at or above the last
# cutoff. Binning in one pass fixes the case where a value exactly equal to the
# last cutoff stayed unbinned and indexed the wrong label.
values = medians.to_numpy(dtype=float)
levels = np.sign(values) * np.searchsorted(cutoffs, np.abs(values), side='right')
dat = pd.DataFrame(levels, index=medians.index, columns=medians.columns)

# Labels indexed by level; negative levels use Python's negative indexing, so
# the '-' labels sit at the end of the list in reverse order.
dat_map = ['0']
dat_map += [f"${'+'*(i+1)}$" for i in range(len(cutoffs))]
dat_map += [f"${'-'*(i)}$" for i in range(len(cutoffs), 0, -1)]

sns.clustermap(
    dat,
    cmap='vlag',
    center = 0,
    annot=np.vectorize(lambda x: dat_map[x])(dat.astype(int)),
    fmt="",
)

##  plot the correlation metric for EU vs GC or FISH vs GC

In [None]:
# one example row per experiment; seeded so a re-run shows the same rows
full_data.groupby('exp').sample(1, random_state=0)

In [None]:
# How to draw the GC/probe correlation for each time-course experiment:
# x axis and the variable that separates the lines.
exp_plot_map = {
    'CX': dict(x='time', hue='probe'),
    'FVP_nodfc': dict(x='time', hue='treatment'),
    'FVP_FISH': dict(x='time', hue='probe'),
    'Fib': dict(x='time', hue='treatment'),
    'RPL5': dict(x='time', hue='treatment'),
    'U3U8_EU': dict(x='time', hue='treatment'),
}
for exp, params in exp_plot_map.items():
    fig, ax = plt.subplots()
    dat = full_data[full_data['exp'] == exp]
    sns.lineplot(data=dat, y='Correlation_GC_Probe', ax=ax, **params)
    ax.set_title(exp)

# RPL5 FISH has no time axis: bars per probe, control first
fig, ax = plt.subplots()
dat = full_data[full_data['exp'] == 'RPL5_FISH']
sns.barplot(data=dat, x='probe', y='Correlation_GC_Probe', hue='treatment',
            hue_order=['SCR', 'RPL5KD'], ax=ax)
plt.xticks(rotation=70)

# FVP FISH again as bars per probe, coloured by time point
fig, ax = plt.subplots()
dat = full_data[full_data['exp'] == 'FVP_FISH']
sns.barplot(data=dat, x='probe', y='Correlation_GC_Probe', hue='time', ax=ax)
plt.xticks(rotation=70)

In [None]:
# EU time courses pooled; every row is labelled control or treated.
dat = full_data[full_data['exp'].isin(('FVP_nodfc', 'Fib', 'RPL5', 'U3U8_EU'))].copy()
dat['treated'] = np.where(
    dat['treatment'].isin(('ctl', 'DMSO', 'SCR', 'ctlASO')), 'control', 'treated'
)
sns.lineplot(data=dat, x='time', y='Correlation_GC_Probe', hue='exp', style='treated')


In [None]:
# cells per experiment, control/treated label and time point
dat.groupby(by=['exp', 'treated', 'time'])['Count_FC'].count()

## U3U8_EU, filter on dfc rim enrichment

In [None]:
# U3U8_EU, filter on rim enrichment
# NOTE(review): the heading says "dfc rim enrichment", but the filter and the
# threshold line below both use fc_rim_enrichment; confirm which is intended.
dat = full_data[full_data['exp'] == 'U3U8_EU'].copy()

# ECDFs of both rim-enrichment scores before filtering
fig, ax = plt.subplots(1, 2, figsize=(12,6))
sns.ecdfplot(data=dat, x='fc_rim_enrichment', hue='treatment', ax=ax[0])
sns.ecdfplot(data=dat, x='dfc_rim_enrichment', hue='treatment', ax=ax[1])

# densities before filtering, with the threshold marked on the FC panel
fig, ax = plt.subplots(1, 2, figsize=(12,6))
sns.kdeplot(data=dat, x='fc_rim_enrichment', hue='treatment', ax=ax[0])
ax[0].axvline(0.9)
sns.kdeplot(data=dat, x='dfc_rim_enrichment', hue='treatment', ax=ax[1])
fig.suptitle('Pre Filter')

# drop ASO-treated cells whose FC rim enrichment is below the threshold
dat_filt = dat[
    ~(dat['treatment'].isin(('U3ASO', 'U8ASO')) & (dat['fc_rim_enrichment'] < 0.9))
]

# densities after filtering
fig, ax = plt.subplots(1, 2, figsize=(12,6))
sns.kdeplot(data=dat_filt, x='fc_rim_enrichment', hue='treatment', ax=ax[0])
sns.kdeplot(data=dat_filt, x='dfc_rim_enrichment', hue='treatment', ax=ax[1])
fig.suptitle('Post Filter')

# GC/probe correlation over time, before and after filtering
fig, ax = plt.subplots(1, 2, figsize=(12,6))
sns.lineplot(data=dat, x='time', y='Correlation_GC_Probe', hue='treatment', ax=ax[0])
ax[0].set_title('Pre Filter')
sns.lineplot(data=dat_filt, x='time', y='Correlation_GC_Probe', hue='treatment', ax=ax[1])
ax[1].set_title('Post Filter')

## Random Forest

In [None]:
# Count cells per treatment among rows at time 0 or 90 that are flagged
# `treated` and belong to experiment '3 color'.
# NOTE(review): the `normalized` frame built by the visible cells has no
# `treated` column and no exp value '3 color' (its exp values are set by the
# normalize/assign calls above). Presumably this section expects a frame from
# another notebook or an older cell; confirm before running.
normalized[
    ((normalized.time == 0) | (normalized.time == 90)) &
    normalized.treated &
    (normalized.exp == '3 color')
].groupby('treatment').Count_GC.count()

In [None]:
# Build the classifier inputs: same row filter as the count cell, then a
# shuffle within each treatment and at most 200 rows per treatment.
# NOTE(review): `normalized.treated` and exp '3 color' are not created by the
# visible cells; confirm the source frame.
# NOTE(review): `sample(frac=1)` has no random_state, so the subsample differs
# between runs even though the split and the forest are seeded.
sub_data = normalized[
    ((normalized.time == 0) | (normalized.time == 90)) &
    normalized.treated &
    (normalized.exp == '3 color')
].groupby('treatment').sample(frac=1).groupby('treatment').head(200)
# features are the normalized columns in `toplot`; missing values become 0
features = sub_data[toplot].copy()
features = features.fillna(0)
# the class label is the treatment
labels = sub_data['treatment']

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hold out a quarter of the subsample for testing; the fixed seeds make the
# split and the forest repeatable.
split = train_test_split(features, labels, test_size=0.25, random_state=42)
train_features, test_features, train_labels, test_labels = split

# first pass: a shallow forest (depth 4)
clf = RandomForestClassifier(max_depth=4, random_state=0)
clf.fit(train_features, train_labels)

In [None]:
# Score the fitted forest on the held-out rows.
predictions = clf.predict(test_features)
lbls = test_labels.unique()
mat = sklearn.metrics.confusion_matrix(
    test_labels,
    predictions,
    labels=lbls,
    normalize='true',  # each row (true label) is scaled to sum to 1
)
# summary score: diagonal total divided by the total of the matrix
print(np.diag(mat).sum() / mat.sum())
disp = sklearn.metrics.ConfusionMatrixDisplay(mat, display_labels=lbls)
disp.plot()

In [None]:
# Refit the forest for every max_depth in 2..19 and record the mean of the
# row-normalised confusion-matrix diagonal on the test split.
acc = {}
for depth in range(2, 20):
    clf = RandomForestClassifier(max_depth=depth, random_state=0).fit(train_features, train_labels)
    predictions = clf.predict(test_features)
    mat = sklearn.metrics.confusion_matrix(
        test_labels, predictions, labels=lbls, normalize='true',
    )
    acc[depth] = np.diag(mat).mean()

In [None]:
# Test score against tree depth, with guide lines at the depth used below (7).
depths = list(acc.keys())
scores = list(acc.values())
ax = sns.lineplot(x=depths, y=scores)
ax.set_xlabel('Max Tree Depth')
ax.set_ylabel('Test Accuracy')
ax.axvline(7)
ax.axhline(acc[7])

In [None]:
# Final model: refit at depth 7 and repeat the held-out evaluation.
clf = RandomForestClassifier(max_depth=7, random_state=0).fit(train_features, train_labels)

predictions = clf.predict(test_features)
lbls = test_labels.unique()
mat = sklearn.metrics.confusion_matrix(
    test_labels,
    predictions,
    labels=lbls,
    normalize='true',  # each row (true label) is scaled to sum to 1
)
# summary score: diagonal total divided by the total of the matrix
print(np.diag(mat).sum() / mat.sum())
disp = sklearn.metrics.ConfusionMatrixDisplay(mat, display_labels=lbls)
disp.plot()

In [None]:
# Pair every feature column with its importance from the fitted forest
# (rounded to 3 decimals) and rank them from most to least important.
feature_importances = sorted(
    ((feature, round(importance, 3))
     for feature, importance in zip(features.columns, clf.feature_importances_)),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print the ranking with a plain loop rather than a list comprehension that
# only exists for its print side effect.
for feature, importance in feature_importances:
    print('Variable: {:20} Importance: {:0.3f}'.format(feature, importance))

# Same ranking as a bar chart.
ax = sns.barplot(x=[x[0] for x in feature_importances], y=[x[1] for x in feature_importances])
plt.xticks(rotation=50, ha='right')
ax.set_ylabel('Gini Importance')

## PCA

In [None]:
# Project the classifier features onto their first two principal components.
pca = sklearn.decomposition.PCA(n_components=2)
principalComponents = pca.fit_transform(features)

# component scores re-indexed like `features` so the treatment labels line up
scores = pd.DataFrame(principalComponents, columns=['PC1', 'PC2'], index=features.index)
pcs = pd.concat((scores, labels), axis=1)

fig, ax = plt.subplots(1, 2, figsize=(12, 6))

# left panel: one point per treatment at its mean (PC1, PC2)
sns.scatterplot(x='PC1', y='PC2', hue='treatment', data=pcs.groupby('treatment').mean(), ax=ax[0])

# right panel: one arrow per feature showing its weight on PC1 / PC2
ax[1].set_xlim(-1, 1)
ax[1].set_ylim(-1, 1)
for lbl, (pc1_weight, pc2_weight) in zip(features.columns, pca.components_.T):
    ax[1].arrow(0, 0, pc1_weight, pc2_weight, color='k', alpha=0.5)
    ax[1].text(pc1_weight * 1.15, pc2_weight * 1.15, lbl, color='k', ha='center', va='center')

In [None]:
# Length of each feature's (PC1, PC2) weight vector, plotted largest first.
norms = np.linalg.norm(pca.components_, axis=0)
idx = np.argsort(-norms)
ranked_features = features.columns[idx]
ranked_norms = norms[idx]
sns.barplot(x=ranked_features, y=ranked_norms)
plt.xticks(rotation=50, ha='right') ;

# morph plate 7/3

3 color:
- D2-D5  FVP 0min, 30min, 60min, 90min (U3 FISH)
- D6-D9 FVP 0min, 30min, 60min, 90min (U8 FISH)
- F2-F5  CX 0min, 30min, 60min, 90min (U3 FISH)
- F6-F9 CX 0min, 30min, 60min, 90min (U8 FISH)
- E2-E9: U3ASO (U3FISH), U3SCR(U3FISH), U8ASO(U8FISH), U8SCR(U8FISH); FibsiRNA(U3FISH); Fibctl(U3FISH); RPL5shRNA(U8FISH); RPL5ctl (U8FISH)

no color:
- C2-C5  FVP 0min, 30min, 60min, 90min (U3 FISH)
- C6-C9 FVP 0min, 30min, 60min, 90min (U8 FISH)
- E2-E5  CX 0min, 30min, 60min, 90min (U3 FISH)
- E6-E9 CX 0min, 30min, 60min, 90min (U8 FISH)
  
in the morph_nocolor subfolder:
- C2-C9: C2- U3ASO (U3FISH), C3-U3SCR(U3FISH), C4-U8ASO(U8FISH), C5-U8SCR(U8FISH); C6-FibsiRNA(U3FISH); C7-Fibctl(U3FISH); C8-RPL5shRNA(U8FISH); C9-RPL5ctl (U8FISH)

In [None]:
# Load the '3 color' plate; subdir and well are parsed from each file path.
data = read_data(
    'morphology/morph_3color/outputs',
    r'.*/(?P<subdir>[^/]+)/Well(?P<well>[A-G]\d+)_.*nd2',
    bins=4,
    dfc_intens=True,
).assign(exp='3 color')

# well -> (treatment, time, probe)
# Rows D (FVP) and F (CX): columns 02-05 are the U3 FISH time course and
# columns 06-09 the U8 FISH time course, each at 0/30/60/90.
well_map = {}
for plate_row, drug in (('D', 'FVP'), ('F', 'CX')):
    for first_col, probe in ((2, 'U3Fish'), (6, 'U8Fish')):
        for offset, minutes in enumerate((0, 30, 60, 90)):
            well_map[f'{plate_row}{first_col + offset:02d}'] = (drug, minutes, probe)

# Row E: knockdowns and their controls, all recorded at time 0.
well_map.update({
    'E02': ('U3ASO', 0, 'U3Fish'),
    'E03': ('U3SCR', 0, 'U3Fish'),
    'E04': ('U8ASO', 0, 'U8Fish'),
    'E05': ('U8SCR', 0, 'U8Fish'),
    'E06': ('FibsiRNA', 0, 'U3Fish'),
    'E07': ('Fibctl', 0, 'U3Fish'),
    'E08': ('RPL5shRNA', 0, 'U8Fish'),
    'E09': ('RPL5ctl', 0, 'U8Fish'),
})

# expand the per-row tuples into three columns and attach them by index
annotations = pd.DataFrame(
    data.well.map(well_map).tolist(),
    columns=['treatment', 'time', 'probe'],
)
color_data = data.join(annotations)
color_data = color_data[color_data.well != 'E09']  # bad sample

In [None]:
# Load the 'no color' plate, annotate every row with (treatment, time, probe)
# from its well, append the '3 color' table and the redo acquisition, then
# derive a boolean `treated` flag and strip the treatment-name suffixes.
data = read_data('morphology_altchan/morph_nocolor/outputs', r'.*/(?P<subdir>[^/]+)/Well(?P<well>[A-G]\d+)_.*nd2', bins=4, dfc_intens=True).assign(exp='no color')

# well -> (treatment, time, probe) for the time-course wells
well_map = {
    'C02': ('FVP',  0, 'U3Fish'),
    'C03': ('FVP', 30, 'U3Fish'),
    'C04': ('FVP', 60, 'U3Fish'),
    'C05': ('FVP', 90, 'U3Fish'),
    'C06': ('FVP',  0, 'U8Fish'),
    'C07': ('FVP', 30, 'U8Fish'),
    'C08': ('FVP', 60, 'U8Fish'),
    'C09': ('FVP', 90, 'U8Fish'),
    
    'E02': ('CX',  0, 'U3Fish'),
    'E03': ('CX', 30, 'U3Fish'),
    'E04': ('CX', 60, 'U3Fish'),
    'E05': ('CX', 90, 'U3Fish'),
    'E06': ('CX',  0, 'U8Fish'),
    'E07': ('CX', 30, 'U8Fish'),
    'E08': ('CX', 60, 'U8Fish'),
    'E09': ('CX', 90, 'U8Fish'),
}

# The new frame has a default 0..n-1 index and join aligns on index, so this
# assumes `data` also carries a default index -- TODO confirm against read_data.
data = data.join(
    pd.DataFrame([v for v in data.well.map(well_map)], columns=['treatment', 'time', 'probe']),
)

# morph nocolor subdir
# Rows from this subdir reuse wells C02-C09 with a different layout, so the
# annotations joined above are overwritten for them further down.
nocolor = data[data.subdir == 'morph_nocolor']
well_map = {
    'C02': ('U3ASO', 0, 'U3Fish'),
    'C03': ('U3SCR', 0, 'U3Fish'),
    'C04': ('U8ASO', 0, 'U8Fish'),
    'C05': ('U8SCR', 0, 'U8Fish'),
    'C06': ('FibsiRNA',  0, 'U3Fish'),
    'C07': ('Fibctl', 0, 'U3Fish'),
    'C08': ('RPL5shRNA', 0, 'U8Fish'),
    'C09': ('RPL5ctl', 0, 'U8Fish'),
}
    
# `nocolor` is rebuilt in the same row order as the masked rows of `data`;
# assigning the raw array writes the values back positionally.
nocolor = pd.DataFrame([v for v in nocolor.well.map(well_map)], columns=['treatment', 'time', 'probe'])
data.loc[
    data.subdir == 'morph_nocolor',
    ['treatment', 'time', 'probe'],
] = nocolor.to_numpy()

# add in newer U3/U8 data
# Here the treatment is parsed from the file name by the regex and time is fixed to 0.
no_color_redo = read_data('morphology_altchan/morph_nocolor_redo/outputs', r'.*/(?P<subdir>[^/]+)/(?P<well>[A-G]\d+)_(?P<treatment>[^0]+).*nd2',
                          bins=2, dfc_intens=True).assign(exp='no color redo', time=0)

data = pd.concat([data, color_data, no_color_redo], ignore_index=True)

# split out control vs treatment and treatment name to simplify facets
# A row counts as treated when its time is non-zero or when it is one of the
# knockdown treatments listed below.
data['treated'] = data.time != 0  # initialize
data.loc[
    data.treatment.isin(['FibsiRNA', 'U3ASO', 'U8ASO', 'RPL5shRNA']),
    'treated'] = True
# NOTE(review): in this pattern `$` binds only to the last alternative (SCR);
# ctl / siRNA / shRNA / ASO are removed wherever they occur in the name.
data['treatment'] = data.treatment.str.replace(r'ctl|siRNA|shRNA|ASO|SCR$', '', regex=True)

In [None]:
# remove U3/U8 no color data, replace with repeat
# Drop the original 'no color' U3/U8 rows, then relabel the redo acquisition
# so it takes their place.
# The explicit .copy() makes the filtered frame independent of the frame it
# was sliced from, so the .loc assignment below cannot trigger pandas'
# SettingWithCopy warning or write through to the parent.
data = data[~((data.exp == 'no color') & (data.treatment.isin(('U3', 'U8'))))].copy()
data.loc[data.exp == 'no color redo', 'exp'] = 'no color'

# rows with a non-null Count_GC per experiment / treatment / treated flag
data.groupby(['exp', 'treatment', 'treated']).Count_GC.count()

In [None]:
# Write the combined table first, then end the cell with the group sizes so
# they are actually displayed (an expression that is not the last line of a
# cell is computed and thrown away).
data.to_csv('morph_plate_U3U8_redo.csv', index=False)
data.groupby(['exp', 'treatment', 'treated']).Count_GC.count()

In [None]:
# List the column names of the combined table.
data.columns

### KD efficiency

In [None]:
# Probe intensity ECDFs for the U3 / U8 rows, treated vs untreated,
# one row of panels per treatment and one column per experiment.
is_u3_or_u8 = data.treatment.isin(('U3', 'U8'))
to_plot = data.loc[is_u3_or_u8].copy()

sns.displot(
    data=to_plot,
    kind='ecdf',
    x='mean_mean_Probe_intens',
    hue='treated',
    row='treatment',
    col='exp',
    facet_kws={'sharex': 'row'},  # x axis shared within a treatment row only
)

In [None]:
# DFC intensity ECDFs for the Fib rows, treated vs untreated,
# one column of panels per experiment.
is_fib = data.treatment.isin(('Fib',))
to_plot = data.loc[is_fib].copy()

sns.displot(
    data=to_plot,
    kind='ecdf',
    x='mean_mean_DFC_intens',
    hue='treated',
    row='treatment',
    col='exp',
    facet_kws={'sharex': 'row'},  # x axis shared within a treatment row only
)

In [None]:
# Save the combined table to CSV without writing the row index.
data.to_csv('morph_plate.csv', index=False)

In [None]:
# List the column names of the combined table.
data.columns

- Maybe the heat map for the key parameters across perturbations
- Also to see if all controls are clustered together

In [None]:
# Feature columns to plot: everything from the 7th column onward except the
# annotation columns, anything mentioning 'Probe' or 'DFC_intens', and the
# columns whose name starts with 'total'.
annotation_cols = {
    'exp',
    'treatment',
    'time',
    'probe',
    'treated',
    'center_x',
    'center_y',
}
toplot = []
for c in data.columns[6:]:
    if 'Probe' in c or 'DFC_intens' in c:
        continue
    if c in annotation_cols or c.startswith('total'):
        continue
    toplot.append(c)
toplot

In [None]:
# Keep untreated cells as they are; keep treated U3 / U8 / Fib cells only when
# the relevant intensity is below the per-condition cutoff. Every other
# treatment passes through unchanged.
untreated_rows = ~data.treated
u3_keep = (data.treatment == 'U3') & ((data.mean_mean_Probe_intens < 0.002) | untreated_rows)
u8_3color_keep = (
    (data.treatment == 'U8') & (data.exp == '3 color')
    & ((data.mean_mean_Probe_intens < 0.00157) | untreated_rows)
)
u8_nocolor_keep = (
    (data.treatment == 'U8') & (data.exp == 'no color')
    & ((data.mean_mean_Probe_intens < 0.0018) | untreated_rows)
)
fib_keep = (data.treatment == 'Fib') & ((data.mean_mean_DFC_intens < 0.003) | untreated_rows)
other_treatments = ~data.treatment.isin(('U3', 'U8', 'Fib'))
dat = data[u3_keep | u8_3color_keep | u8_nocolor_keep | fib_keep | other_treatments].copy()

df = dat[dat.exp == 'no color'].copy()
print(df.groupby(['treatment', 'treated']).CellNumber.count())

# one panel per feature, four panels per row
n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))
figsize = (5*n_cols+5, 5*n_rows)
fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)

# only the untreated cells are drawn (dashed), coloured by treatment
order = df.treatment.unique()
untreated_cells = df[df.treated == False]
for col, ax in zip(toplot, axes.flatten()):
    sns.ecdfplot(data=untreated_cells, x=col, hue='treatment', linestyle='--', ax=ax, hue_order=order)

plt.savefig('noColorMorphRedo.pdf')

In [None]:
# Same knockdown filter as the ECDF grid, except that CX rows are not passed
# through here (CX is in the exclusion list and matches no other clause).
untreated_rows = ~data.treated
u3_keep = (data.treatment == 'U3') & ((data.mean_mean_Probe_intens < 0.002) | untreated_rows)
u8_3color_keep = (
    (data.treatment == 'U8') & (data.exp == '3 color')
    & ((data.mean_mean_Probe_intens < 0.00157) | untreated_rows)
)
u8_nocolor_keep = (
    (data.treatment == 'U8') & (data.exp == 'no color')
    & ((data.mean_mean_Probe_intens < 0.0018) | untreated_rows)
)
fib_keep = (data.treatment == 'Fib') & ((data.mean_mean_DFC_intens < 0.003) | untreated_rows)
other_treatments = ~data.treatment.isin(('U3', 'U8', 'Fib', 'CX'))
dat = data[u3_keep | u8_3color_keep | u8_nocolor_keep | fib_keep | other_treatments].copy()


def _scale_to_untreated(group):
    """Centre and scale every column by the mean / std of the group's untreated rows."""
    baseline = group[group.treated == False]
    return (group - baseline.mean()) / baseline.std()


# Normalise against all untreated cells of the same experiment (the
# alternative of grouping by treatment and experiment is not used here).
normalized = dat.groupby(['exp'], group_keys=False)[toplot + ['treated']].apply(_scale_to_untreated)
normalized = normalized.drop(columns='treated')

# put the un-normalised annotation columns back
normalized = normalized.join(dat[['treatment', 'time', 'exp', 'treated']])
normalized.groupby(['treatment', 'time', 'treated', 'exp']).Count_GC.count()
# normalized.to_csv('morph_plate_normalized_redo.csv', index=False)

In [None]:
# FVP time course from the un-normalised table, with time cast to int.
df = data[data.treatment == 'FVP'].assign(time=lambda frame: frame['time'].astype(int))
print(df.groupby(['exp', 'treatment', 'time']).Count_GC.count())

# one panel per feature, four panels per row
n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))
figsize = (5*n_cols+5, 5*n_rows)
fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)

# both experiments share each panel: solid = 'no color', dashed = '3 color'
line_by_exp = {'no color': '-', '3 color': '--'}
for col, ax in zip(toplot, axes.flatten()):
    for exp, line in line_by_exp.items():
        sns.ecdfplot(data=df[df.exp==exp], x=col, hue='time', ax = ax, linestyle=line)

In [None]:
# Treated '3 color' cells at time 0 or 90, summarised per treatment.
at_endpoints = (normalized.time == 0) | (normalized.time == 90)
selected = normalized[at_endpoints & normalized.treated & (normalized.exp == '3 color')].copy()
print(selected.groupby(['treatment', 'time', 'treated', 'exp']).Count_GC.count())

# count-type features are averaged; every other feature uses the median
mean_features = ['Count_GC', 'Count_FC', 'Count_Nucleoplasmic_FC']
median_features = [
    'mean_GC_circularity',
    'GC_area',
    'mean_mean_GC_intens',
    'fc_rim_enrichment',
    'dfc_rim_enrichment',
    'Correlation_DFC_FC',
    'Correlation_DFC_GC',
    'Correlation_FC_GC',
    'Overlap_DFC_FC',
    'Overlap_DFC_GC',
    'Overlap_FC_GC',
    'mean_GC_eccentricity',
    'FC_area',
    'FC_density',
    'combined_area',
]
reducers = {
    **dict.fromkeys(mean_features, 'mean'),
    **dict.fromkeys(median_features, 'median'),
}

# features as rows, treatments as columns
dat = selected.groupby('treatment')[toplot].agg(reducers).T

sns.clustermap(dat, cmap='vlag', center=0, annot=True)
dat.to_csv('3_color_heatmap_redo_noCX.csv')
plt.savefig('3_color_heatmap_redo_noCX.pdf')

In [None]:
# PCA on the treated 'no color' cells at time 0 or 90.
at_endpoints = (normalized.time == 0) | (normalized.time == 90)
sub_data = normalized[at_endpoints & normalized.treated & (normalized.exp == 'no color')].copy()

# missing values are left as they are here (no fillna step is applied)
features = sub_data[toplot].copy()
labels = sub_data['treatment']

pca = sklearn.decomposition.PCA(n_components=2)
principalComponents = pca.fit_transform(features)

# component scores re-indexed like `features` so the treatment labels line up
scores = pd.DataFrame(principalComponents, columns=['PC1', 'PC2'], index=features.index)
pcs = pd.concat((scores, labels), axis=1)

fig, ax = plt.subplots(1, 2, figsize=(12, 6))

# left panel: one point per treatment at its mean (PC1, PC2)
sns.scatterplot(x='PC1', y='PC2', hue='treatment', data=pcs.groupby('treatment').mean(), ax=ax[0])

# right panel: one arrow per feature showing its weight on PC1 / PC2
ax[1].set_xlim(-1, 1)
ax[1].set_ylim(-1, 1)
for lbl, (pc1_weight, pc2_weight) in zip(features.columns, pca.components_.T):
    ax[1].arrow(0, 0, pc1_weight, pc2_weight, color='k', alpha=0.5)
    ax[1].text(pc1_weight * 1.15, pc2_weight * 1.15, lbl, color='k', ha='center', va='center')

# plt.savefig('no_color_pca_redo.pdf')

# CP RDF data

In [None]:
def add_rdf(result, directory, common):
    """Load the per-object RDF measurements stored in ``InitialGC.csv``.

    The wide ``RDF_Intensity_C<channel>_R<radius>`` and ``RDF_Counts_R<radius>``
    columns are reshaped into one row per object / channel / radius.

    Parameters
    ----------
    result : object
        Returned unchanged as the first element of the output.
    directory : pathlib.Path
        Folder that contains ``InitialGC.csv``.
    common : list
        Not used by this function; kept so the signature matches the other
        ``add_*`` steps called from ``read_data``.

    Returns
    -------
    tuple
        ``(result, rdf)`` where ``rdf`` has the columns ImageNumber,
        ObjectNumber, Parent_DilatedGC, intensity, channel, radius and counts.
    """
    csv_path = directory / 'InitialGC.csv'
    id_cols = ['ImageNumber', 'ObjectNumber', 'Parent_DilatedGC']

    # Peek at the header line to find the RDF_* columns. The context manager
    # closes the file handle, which the bare open(...).readline() never did.
    with open(csv_path, 'r') as handle:
        cols = handle.readline().split(',')
    # strip() removes the newline that stays attached to the last column name
    rdf_cols = [c.strip() for c in cols if c.startswith('RDF_')]

    nucl = pd.read_csv(csv_path, usecols=id_cols + rdf_cols)

    # long format: one row per object and RDF column
    nucl = nucl.melt(id_vars=id_cols)

    # intensity columns carry both the channel and the radius in their name
    rdf = nucl[nucl.variable.str.startswith('RDF_Intensity')].reset_index(drop=True)
    extract = rdf.variable.str.extract(r'RDF_Intensity_C(\d)_R([-0-9]+)')
    rdf = rdf.assign(
        channel=extract[0].astype(int),
        radius=extract[1].astype(int),
    ).rename(columns={'value': 'intensity'}).drop(columns='variable')

    # count columns carry only the radius
    counts = nucl[nucl.variable.str.startswith('RDF_Count')].reset_index(drop=True)
    extract = counts.variable.str.extract(r'RDF_Counts_R([-0-9]+)')
    counts = counts.assign(
        radius=extract[0].astype(int)
    ).rename(columns={'value': 'counts'}).drop(columns='variable')

    # attach the count for the matching object and radius to every intensity row
    rdf = rdf.merge(counts, on=['ImageNumber', 'ObjectNumber', 'Parent_DilatedGC', 'radius'])

    return result, rdf

def read_data(directory, regex=None, dfc=True, bins=4, dfc_intens=False):
    """Assemble the per-cell table and the RDF table for one output folder.

    Each ``add_*`` step receives the running table, the folder and the shared
    identifier columns, and returns the extended table. The last step,
    ``add_rdf``, additionally returns the RDF table.

    Parameters
    ----------
    directory : str or pathlib.Path
        Folder holding the CSV outputs.
    regex : str, optional
        Forwarded to ``build_initial_data``.
    dfc, bins :
        Forwarded to ``add_rim`` (which is always called with ``total=20``).
    dfc_intens :
        Forwarded to ``add_initial_gc``.

    Returns
    -------
    tuple
        ``(cells, rdf)``: the assembled table with ``Parent_DilatedGC``
        dropped, and the RDF table.
    """
    root = Path(directory)
    # Image and object number are unique identifiers. Area is used a lot, and
    # the parent (dilated GC) number should correspond to a single cell.
    id_cols = ['ImageNumber', 'ObjectNumber', 'AreaShape_Area', 'Parent_DilatedGC']

    cells = build_initial_data(root, id_cols, regex)
    cells = add_initial_gc(cells, root, id_cols, dfc_intens)
    cells = add_initial_fc(cells, root, id_cols)
    cells = add_rim(cells, root, id_cols, dfc, bins, total=20)
    cells = add_correlation(cells, root, id_cols)
    cells, rdf = add_rdf(cells, root, id_cols)

    return cells.drop(columns='Parent_DilatedGC'), rdf

# RPL5 RDF run: treatment and time are parsed from each file name.
rpl5_filename_regex = r'/[A-G]\d+_(?P<treatment>SCR|RPL5KD)_15p(?P<time>\d+)c.*nd2'
data, rdf = read_data('morphology_rdf/RPL5/outputs/', rpl5_filename_regex)
data

In [None]:
# Display the long-format RDF table returned by read_data.
rdf

In [None]:
# need to average GCs from each parent
# Collapse the per-GC rows to one row per image / parent / channel / radius.
# Intensities are averaged with each row's `counts` as the weight; products
# that come out as NaN contribute 0 to the numerator.
groups = ['ImageNumber', 'Parent_DilatedGC', 'channel', 'radius']
rdf_avg = []
for key, grp in rdf.groupby(groups):
    total_counts = grp['counts'].sum()
    weighted_sum = (grp['intensity'] * grp['counts']).fillna(0).sum()
    record = dict(zip(groups, key))
    record['intensity'] = weighted_sum / total_counts
    record['counts'] = total_counts
    rdf_avg.append(record)
rdf_avg = pd.DataFrame(rdf_avg)
rdf_avg

In [None]:

# NOTE(review): the cell above already ends by converting rdf_avg to a
# DataFrame, so this re-wrap looks redundant -- confirm and remove.
rdf_avg = pd.DataFrame(rdf_avg)
rdf_avg

In [None]:
# get cell information
merged = rdf_avg.merge(data[['ImageNumber', 'CellNumber', 'time', 'treatment']], 
                   left_on=['ImageNumber', 'Parent_DilatedGC'], 
                   right_on=['ImageNumber', 'CellNumber'])
# average raw values based on target and ssu
groups = ['time', 'treatment', 'channel', 'radius']
channels = ['', 'EU', 'DFC', 'FC', 'GC']
rdf_data = []
for name, dat in merged.groupby(groups):
    rdf_data.append(dict(
        zip(groups, name),
        intensity=((dat['intensity'] * dat['counts']).fillna(0).sum()) / dat['counts'].sum(),
        channel=channels[name[2]]
    ))
rdf_data = pd.DataFrame(rdf_data)  
rdf_data['time'] = rdf_data.time.astype(int)

In [None]:
# Raw intensity against radius, one panel per channel (independent y axes).
sns.relplot(
    data=rdf_data,
    kind='line',
    x='radius',
    y='intensity',
    col='channel',
    hue='time',
    style='treatment',
    facet_kws={'sharex': True, 'sharey': False},
)

In [None]:
def _min_max(values):
    """Rescale a curve so that its minimum maps to 0 and its maximum to 1."""
    return (values - values.min()) / (values.max() - values.min())


# Rescale every channel / treatment / time curve separately, then plot the
# rescaled curves with one panel per channel.
normalized = rdf_data.copy()
curves = normalized.groupby(['channel', 'treatment', 'time']).intensity
normalized['normalized_intensity'] = curves.transform(_min_max)
sns.relplot(
    data=normalized,
    kind='line',
    x='radius',
    y='normalized_intensity',
    col='channel',
    hue='time',
    style='treatment',
    facet_kws={'sharex': True, 'sharey': False},
)

In [None]:
# Same rescaled curves, now one panel per time point (three panels per row)
# with the channels overlaid.
sns.relplot(
    data=normalized,
    kind='line',
    x='radius',
    y='normalized_intensity',
    col='time',
    col_wrap=3,
    hue='channel',
    style='treatment',
    facet_kws={'sharex': True, 'sharey': False},
)

In [None]:
# Save the RDF table (raw and rescaled intensities) to CSV; the row index is written too.
normalized.to_csv('rpl5_rdf_cp.csv')

# More Fish data

In [None]:
# Load the 240824 FISH run; the probe name is parsed from each file name.
# read_data as last defined in this notebook returns (cells, rdf). Keep only
# the per-cell table so the attribute access below works; a plain table from
# an older single-return read_data is accepted as well.
loaded = read_data('morphology/240824_FISH/outputs', r'/.*_10A_(?P<probe>[^_0]+)(?:\d{3}\d?)?.nd2', bins=4)
data = loaded[0] if isinstance(loaded, tuple) else loaded
# data.loc[data.isna().any(axis=1), 'Metadata_FileLocation'].unique()
data.probe.unique()

In [None]:
# ECDF of every Correlation_* column, coloured by probe, three panels per row.
toplot = list(filter(lambda c: c.startswith('Correlation'), data.columns))

n_cols = 3
n_rows = int(np.ceil(len(toplot) / n_cols))
figsize = (5*n_cols+5, 5*n_rows)
fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)

order = data.probe.unique()
for col, ax in zip(toplot, axes.flatten()):
    # the legend is drawn only on the panel whose column name ends with 'GC_Probe'
    show_legend = col.endswith('GC_Probe')
    sns.ecdfplot(data=data, x=col, hue='probe', ax=ax, hue_order=order, legend=show_legend)

In [None]:
# Save the FISH table to CSV; the row index is written too.
data.to_csv('240824_FISH.csv')

In [None]:
# Add a `filename` column: the part of Metadata_FileLocation between the last '/' and the trailing 'nd2' suffix.
# NOTE(review): the '.' before 'nd2' is unescaped in the pattern, so it matches any character -- confirm that is acceptable.
data['filename'] = data.Metadata_FileLocation.str.extract(r'.*/(.+).nd2')[0]

In [None]:
# The ten probes with the most cells whose DFC/FC correlation is below 0.2.
low_corr_cells = data[data.Correlation_DFC_FC < 0.2]
bimodal_probes = (
    low_corr_cells
    .groupby('probe')
    .Correlation_DFC_FC
    .count()
    .sort_values(ascending=False)
    .head(10)
    .index
)

In [None]:
# The 25 files with the most cells whose DFC/FC correlation is below 0.2.
(
    data[data.Correlation_DFC_FC < 0.2]
    .groupby('filename')
    .Correlation_DFC_FC
    .count()
    .sort_values(ascending=False)
    .head(25)
)

In [None]:
# DFC/FC correlation ECDFs for the selected probes: one panel per probe
# (four per row) and one curve per file, without a legend.
sns.displot(
    data=data[data.probe.isin(bimodal_probes)],
    kind='ecdf',
    x='Correlation_DFC_FC',
    col='probe',
    col_wrap=4,
    hue='filename',
    legend=False,
)

In [None]:
# Among the selected probes, the 20 files with the lowest mean DFC/FC correlation.
(
    data[data.probe.isin(bimodal_probes)]
    .groupby('filename')
    .Correlation_DFC_FC
    .mean()
    .sort_values(ascending=True)
    .head(20)
)