In [None]:
import functools
import re
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.spatial
import seaborn as sns

import utils

%load_ext autoreload
%autoreload 2

In [None]:
def read_data(directory, regex=None, debug_regex=False):
    """Build the per-nucleolus table and the RDF table for one output directory.

    Chains ``utils.analyze`` over ``Image.csv``, ``InitialNucleoli.csv``
    (twice), ``NucleoplasmRim.csv`` and ``DilatedNucleoli.csv`` inside
    ``directory``, passing each result to the next call as
    ``previous_result``.

    Parameters
    ----------
    directory : str or pathlib.Path
        Folder containing the CSV files listed above.
    regex : str, optional
        Pattern handed to ``utils.ImageParser`` to parse information from
        the file name.
    debug_regex : bool, optional
        Forwarded unchanged to ``utils.ImageParser``.  This used to be read
        from an undefined global, which raised ``NameError`` on a fresh
        kernel.  NOTE(review): ``False`` is assumed to be the parser's
        "off" value -- confirm against ``utils.ImageParser``.

    Returns
    -------
    tuple
        ``(result, rdf)`` where ``result`` is the value returned by the last
        ``utils.analyze`` call and ``rdf`` is the first element of the
        second value returned by that same call.
    """
    directory = Path(directory)
    # some variables reused below
    extra_columns = ['ImageNumber', 'Parent_DilatedNucleoli']

    extras = {
        'extra_columns': extra_columns,
        'reduce': False,
        'merge_fcn': functools.partial(utils.merge_result, merge_on=extra_columns)
    }

    # Parse information from filename using the provided regex
    result, _ = utils.analyze(directory / 'Image.csv',
                              parsers=[
                                  utils.ImageParser(regex, debug_regex=debug_regex),
                              ])
    # Bring in Parent_DilatedNucleoli so later measurements can be merged on it
    result, _ = utils.analyze(directory / 'InitialNucleoli.csv',
                              previous_result=result,
                              parsers=[utils.BlankParser(['Parent_DilatedNucleoli'])],
                              extra_columns=['ImageNumber', ],
                              merge_fcn=functools.partial(utils.merge_result, merge_on=['ImageNumber'], how='left'),
                             )

    # Measure features from GC objects
    result, _ = utils.analyze(directory / 'InitialNucleoli.csv',
                              previous_result=result,
                              parsers=[
                                  utils.ShapeParser(),
                                  utils.IntensityParser(images=['NOP56', 'NPM1', 'starPlasmid', 'PF']),
                              ],
                              region='GC',
                              **extras
                             )

    # Measure features from nucleoplasm rim
    result, _ = utils.analyze(directory / 'NucleoplasmRim.csv',
                              previous_result=result,
                              parsers=[
                                  utils.IntensityParser(images=['NOP56', 'NPM1', 'starPlasmid', 'PF']),
                              ],
                              region='Rim',
                              **extras
                             )

    # Measure rdf; the identity merge_fcn leaves the accumulated result as is
    result, extra = utils.analyze(directory / 'DilatedNucleoli.csv',
                              previous_result=result,
                              parsers=[
                                  utils.RDFParser(id_vars=['ImageNumber', 'ObjectNumber']),
                              ],
                              merge_fcn=lambda x: x
                             )

    return result, extra[0]
    
# The file-name pattern captures a `treatment` and a `target` group.
rim_rdf_regex = r'/[A-G]\d+_+(?P<treatment>[^_]+).*_(?P<target>RPS6|surf6|RRP1|DDX21|nucleolin|EXOSC10|NAT1).*nd2'
data, rdf = read_data(
    '/scratch/gpfs/tcomi/cp_paper_redo/rim_rdf/testing/outputs',
    regex=rim_rdf_regex,
)
# Transposed so every measurement is a row
data.T

## RDF curves

In [None]:
to_keep = data.copy()
# Attach treatment / target to each RDF row by matching the RDF object
# number against Parent_DilatedNucleoli within the same image.
label_columns = ['ImageNumber', 'Parent_DilatedNucleoli', 'treatment', 'target']
de_novo = rdf.merge(
    to_keep[label_columns],
    left_on=['ImageNumber', 'ObjectNumber'],
    right_on=['ImageNumber', 'Parent_DilatedNucleoli'],
)

In [None]:
# Count-weighted mean intensity for every treatment / target / channel / radius group
groups = ['treatment', 'target', 'channel', 'radius',]
channels = ['', 'target', 'NOP56', '18S', 'NPM1']  # looked up by the numeric channel value
rdf_data = pd.DataFrame([
    {
        **dict(zip(groups, key)),
        'intensity': (grp['intensity'] * grp['counts']).fillna(0).sum() / grp['counts'].sum(),
        # replace the numeric channel with its name
        'channel': channels[key[2]],
    }
    for key, grp in de_novo.groupby(groups)
])

In [None]:
# One panel per (target, channel); line style separates treatments.
# The y axis is left independent per panel.
facet_options = {'sharex': True, 'sharey': False}
sns.relplot(
    data=rdf_data,
    kind='line',
    x='radius',
    y='intensity',
    row='target',
    col='channel',
    style='treatment',
    facet_kws=facet_options,
)

In [None]:
import aicsimageio
import urllib.parse
import warnings

radius = 100  # half-width of each square crop
# Upper bound applied to both centre coordinates.
# NOTE(review): presumably the image edge length in pixels -- confirm.
IMAGE_SIZE = 2254

# Keep nucleoli with GC_area > 400 whose centre is more than `radius` away
# from every bound, ordered by decreasing mean_Intensity_18Sstar_GC.
to_show = (
    data[
        (data.GC_area > 400) &
        (data.Center_X > radius) &
        (data.Center_Y > radius) &
        (data.Center_X < IMAGE_SIZE - radius) &
        (data.Center_Y < IMAGE_SIZE - radius)
    ]
    .copy()
    .sort_values(by='mean_Intensity_18Sstar_GC', ascending=False)
    .reset_index(drop=True)
)

to_show.to_csv('manual_samples.csv')
selected = pd.read_csv('cropped_images/Lifei_SSUdenovo.csv')

# Contact sheets: every 89th row of `to_show` (starting at row 88) is drawn
# as a two-colour crop on 11 x 8 pages, one PDF per page.
n_rows, n_cols = 11, 8
per_page = n_rows * n_cols
fig, axes = plt.subplots(n_rows, n_cols, figsize=(24, 33))
df = to_show.iloc[88::89]
rows = df.iterrows()
imgs = {}  # per-channel min-max normalised stacks, keyed by file path
# Ceiling division: exactly as many pages as needed, so a row count that is
# a multiple of `per_page` no longer produces a duplicate trailing page.
n_pages = -(-len(df) // per_page)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # tight layout warning
    for page in range(1, n_pages + 1):
        # Blank every panel first so a partly filled last page does not keep
        # crops that were drawn for the previous page.
        for ax in axes.flatten():
            ax.clear()
            ax.set_axis_off()
        # zip() polls the axes before `rows`, so when the page is full no
        # row is pulled from the shared iterator and lost.
        for ax, (ind, row) in zip(axes.flatten(), rows):
            img_name = urllib.parse.unquote(row.Metadata_FileLocation)
            if img_name not in imgs:
                im = aicsimageio.imread(img_name).squeeze()
                # normalize each channel (axis 0) to [0, 1]
                lowest = im.min(axis=(1, 2), keepdims=True)
                highest = im.max(axis=(1, 2), keepdims=True)
                imgs[img_name] = (im - lowest) / (highest - lowest)
            img = imgs[img_name]
            sub_img = np.zeros((radius*2, radius*2, 3))

            # channels = ['', 'target', 'Nop56', '18S', 'NPM1']
            # First image axis after the channel is indexed by Center_Y,
            # the second by Center_X.
            x, y = int(row.Center_Y), int(row.Center_X)
            sub_img[:, :, 0] = img[1, x-radius:x+radius, y-radius:y+radius]  # red is nop56
            sub_img[:, :, 1] = img[2, x-radius:x+radius, y-radius:y+radius]  # green is 18S
            ax.imshow(sub_img)
            ax.set_title(f'{ind} - {x} - {y} - {row.ImageNumber}')
            ax.set_axis_off()

        plt.tight_layout()
        plt.savefig(f'cropped_images/unseen_{page:02}.pdf')


In [None]:
# Take the listed row numbers, remove the excluded ones, then append every
# row of `to_show` from position 1958 onward and save the combination.
selected = pd.read_csv('cropped_images/Lifei_SSUdenovo.csv')
exclude = pd.read_csv('cropped_images/subset_lifei_SSU_sofi_to_exclude.txt', header=None)
is_excluded = selected['row number'].isin(exclude[0])
selected = selected[~is_excluded]
listed_rows = to_show.iloc[selected['row number']]
trailing_rows = to_show.iloc[1958:]
selected = pd.concat((listed_rows, trailing_rows))
selected.to_csv('cropped_images/selected_nucleoli.csv')

In [None]:
manual = pd.read_csv('cropped_images/selected_nucleoli.csv', index_col=0)
manual['nucleoli_type'] = 'de novo'
# Rows from index label 1958 onward are the endogenous ones
manual.loc[1958:, 'nucleoli_type'] = 'endogenous'

def found_in_manual(row, max_dist=1):
    """Return True when `row` has a counterpart in `manual`.

    A counterpart is a `manual` row with the same Metadata_FileLocation whose
    (Center_X, Center_Y) lies less than `max_dist` from the centre of `row`.

    Parameters
    ----------
    row : pandas.Series
        One row of `data`; must carry Metadata_FileLocation, Center_X and
        Center_Y.
    max_dist : float, optional
        Exclusive distance threshold, in the units of the centre columns.

    Returns
    -------
    bool
    """
    in_img = manual[manual.Metadata_FileLocation == row.Metadata_FileLocation]
    if in_img.empty:
        return False
    centre = row[['Center_X', 'Center_Y']].to_numpy(float)[np.newaxis, :]
    dists = scipy.spatial.distance.cdist(centre, in_img[['Center_X', 'Center_Y']].to_numpy(float))
    return bool(dists.min() < max_dist)

# .copy() makes the filtered frame independent, so adding columns to it
# later does not raise SettingWithCopyWarning.
data = data[data.apply(found_in_manual, axis=1)].copy()
len(data)

In [None]:
# add in class from manual    
def manual_class(row):
    in_img = manual[manual.Metadata_FileLocation == row.Metadata_FileLocation]
    dists = scipy.spatial.distance.cdist(np.expand_dims(row[['Center_X', 'Center_Y']].to_numpy(float), axis=0), in_img[['Center_X', 'Center_Y']].to_numpy())
    return in_img.iloc[dists.argmin()]['nucleoli_type']

data['nucleoli_type'] = data.apply(manual_class, axis=1)

In [None]:
# Write the current `data` frame (index included) to CSV.
data.to_csv('cropped_images/bkg_corr_nucleoli.csv')

In [None]:
# The pattern captures image_class (denoised|endogenous), SSU and target.
crop_regex = r'SSU_IF/cropped/(?P<image_class>denoised|endogenous).*/[A-G]\d+_+(?P<SSU>WTSSU|mutSSU).*_(?P<target>RPS6|surf6|RRP1|DDX21|nucleolin|EXOSC10).*nd2'
crop_data, crop_rdf = read_data(
    '/home/tcomi/Desktop/cp_outputs/SSU_test_raw',
    regex=crop_regex,
)
# Relabel the 'denoised' class as 'de novo'
is_denoised = crop_data.image_class == 'denoised'
crop_data.loc[is_denoised, 'image_class'] = 'de novo'

In [None]:
# Non-null Count_GC entries per target / image class / SSU.  Shown with
# display() because only the last expression of a cell is rendered.
display(crop_data.groupby(['target', 'image_class', 'SSU']).Count_GC.count())
# Number of rows whose image class is not 'de novo'
(crop_data.image_class != 'de novo').sum()
# crop_data.to_csv('cropped_ssu.csv')

In [None]:
# Use size distribution to estimate a high pass threshold;
# only nucleoli with GC_area below 1000 are drawn.
small_nucleoli = crop_data[crop_data.GC_area < 1000]
sns.histplot(
    data=small_nucleoli,
    x='GC_area',
    hue='image_class',
)

In [None]:
# ax = sns.scatterplot(data=crop_data, x='partitioning_18Sstar', y='GC_area', hue='image_class')
# GC_area against mean_Intensity_18Sstar_GC, coloured by image class.
ax = sns.scatterplot(data=crop_data, x='mean_Intensity_18Sstar_GC', y='GC_area', hue='image_class')

# Reference lines at fixed x (intensity) and y (area) values.
# NOTE(review): presumably candidate gating thresholds chosen by eye -- confirm.
ax.axvline(0.0025)
ax.axvline(0.004)
ax.axhline(3000)
ax.axhline(300)

In [None]:
# Count-weighted mean intensity per SSU / target / channel / radius / image class
label_columns = ['ImageNumber', 'NucleolusNumber', 'SSU', 'target', 'image_class']
merged_dat = crop_rdf.merge(
    crop_data[label_columns],
    left_on=['ImageNumber', 'ObjectNumber'],
    right_on=['ImageNumber', 'NucleolusNumber'],
)
groups = ['SSU', 'target', 'channel', 'radius', 'image_class']
channels = ['', 'target', 'Nop56', '18S', 'NPM1']  # looked up by the numeric channel value
rdf_data = pd.DataFrame([
    {
        **dict(zip(groups, key)),
        'intensity': (grp['intensity'] * grp['counts']).fillna(0).sum() / grp['counts'].sum(),
        # replace the numeric channel with its name
        'channel': channels[key[2]],
    }
    for key, grp in merged_dat.groupby(groups)
])
rdf_data

In [None]:
def _wt_endogenous_profile(channel):
    """Rows of `rdf_data` for WTSSU / endogenous / `channel`, plus a per-target min-max normalised intensity."""
    keep = (
        (rdf_data.SSU == "WTSSU") &
        (rdf_data.image_class == "endogenous") &
        (rdf_data.channel == channel)
    )
    profile = rdf_data[keep].copy()
    profile['normalized_intensity'] = profile.groupby('target').intensity.transform(
        lambda x: (x - x.min()) / (x.max() - x.min())
    )
    return profile

target_dat = _wt_endogenous_profile("target")
# NOTE(review): the variable is called npm_dat but selects the "Nop56" channel -- confirm which is intended.
npm_dat = _wt_endogenous_profile("Nop56")

In [None]:
# Normalised radial profiles: one coloured line per target from `target_dat`,
# with the `npm_dat` profile overlaid as a dashed black line on the same axes.
shared_axes = dict(x='radius', y='normalized_intensity')
ax = sns.lineplot(data=target_dat, hue='target', **shared_axes)
sns.lineplot(data=npm_dat, linestyle='--', c='k', ax=ax, **shared_axes)
             

In [None]:
# List the available column names of crop_data
crop_data.columns

In [None]:
# Empirical CDF of one column, split by SSU (panels) and image class (colour);
# the file name embeds the column name.
lbl = 'GC_area'
sns.displot(
    data=crop_data,
    x=lbl,
    kind='ecdf',
    col='SSU',
    hue='image_class',
    facet_kws={'sharex': False},
)
plt.savefig(f"crop_SSU_{lbl}_dist.pdf")

In [None]:
# (output directory, file-name regex, nucleoli type) for each data set
sources = [
    (
        'SSU_IF_more_targets/SSU_more_targets/outputs/',
        r'/[A-G]\d+_+(?P<SSU>[^_]+).*_(?P<target>RPS6|surf6|RRP1|DDX21|nucleolin|EXOSC10|NAT10|ESF1|Nat10).*nd2',
        'de novo',
    ),
    (
        'SSU_IF_more_targets/SSU_more_targets_endogenous/outputs/',
        r'/(?:best_)?[A-G]\d+_+(?P<SSU>[^_]+).*_(?P<target>RPS6|surf6|RRP1|DDX21|nucleolin|EXOSC10|NAT10|ESF1|Nat10).*nd2',
        'endogenous',
    ),
]
data = []
rdf = []
for directory, regex, nucleoli_type in sources:
    d, r = read_data(directory, regex=regex)
    data.append(d.assign(nucleoli_type=nucleoli_type))
    rdf.append(r)
data = pd.concat(data, ignore_index=True)
rdf = pd.concat(rdf, ignore_index=True)
# fix typo, make consistent with other
data['target'] = data.target.replace('Nat10', 'NAT10')
data['SSU'] = data.SSU.replace({'M437': 'SSUWT', 'M438': 'SSUMut'})
data.to_csv('cropped_images/more_targets_nucleoli_renamed.csv')

In [None]:
# Inspect the rows that come from a single crop.  The pattern is written
# without a capture group, which avoids the pandas match-group UserWarning
# and matches the same strings.
# Other crops looked at earlier:
#   F6_M438_NPM1647_Fib568_18S488_ESF1405_z001_crop3
#   E6_M437_NPM1647_Fib568_18S488_ESF1405_zstack_crop3
#   E6_M437_NPM1647_Fib568_18S488_ESF1405_zstack_crop1
crop_pattern = r'F6_M438_NPM1647_Fib568_18S488_ESF1405_z003_crop1%'
sub_dat = data[data.Metadata_FileLocation.str.contains(crop_pattern)]
# display() is used for everything but the last expression, which the
# notebook renders on its own.
display(sub_dat[[
    'SSU',
    'mean_Intensity_GC_GC',
    'mean_Intensity_target_GC',
]])
display(sub_dat.Metadata_FileLocation.unique())
sub_dat.filter(like='mean_Intensity').T

In [None]:
# Distinct file locations present in the selected subset
sub_dat['Metadata_FileLocation'].unique()

In [None]:
# Mean-intensity columns of the raw table for four image numbers, transposed
# so each measurement is a row.
raw = pd.read_csv('SSU_IF/SSU_more_targets/outputs/InitialNucleoli.csv')
mean_intensity_cols = [col for col in raw.columns if col.startswith('Intensity_MeanIntensity_')]
in_chosen_images = raw.ImageNumber.isin([76, 78, 104, 109])
raw.loc[in_chosen_images, mean_intensity_cols].T

In [None]:
# mean_Intensity_target_GC against mean_Intensity_GC_GC; colour = SSU, marker = nucleoli type
sns.scatterplot(data=data, x='mean_Intensity_GC_GC', y='mean_Intensity_target_GC', hue='SSU', style='nucleoli_type')

In [None]:
# Label every RDF row with SSU, target and nucleoli type.  Nothing is
# filtered by nucleoli type here; `to_keep` is a full copy of `data`.
to_keep = data.copy()
label_columns = ['ImageNumber', 'NucleolusNumber', 'SSU', 'target', 'nucleoli_type']
de_novo = rdf.merge(
    to_keep[label_columns],
    left_on=['ImageNumber', 'ObjectNumber'],
    right_on=['ImageNumber', 'NucleolusNumber'],
)

In [None]:
# Count-weighted mean intensity per SSU / target / channel / radius / nucleoli type
groups = ['SSU', 'target', 'channel', 'radius', 'nucleoli_type']
channels = ['', 'NPM1', 'Fib568', '18S', 'target']  # looked up by the numeric channel value
rdf_data = pd.DataFrame([
    {
        **dict(zip(groups, key)),
        'intensity': (grp['intensity'] * grp['counts']).fillna(0).sum() / grp['counts'].sum(),
        # replace the numeric channel with its name
        'channel': channels[key[2]],
    }
    for key, grp in de_novo.groupby(groups)
])

rdf_data.to_csv('cropped_images/more_targets_rdf_renamed.csv')

In [None]:
# The pattern captures `plasmid` and `star` (28S or 18S) from the file name;
# the second value returned by read_data is discarded.
data, _ = read_data('SSU_IF/cropped_intens/outputs/', regex=r'/[A-G]\d+_+(?P<plasmid>[^_]+).*_(?P<star>28S|18S)488.*nd2')
data

In [None]:
# mean_Intensity_DFC_GC against mean_Intensity_GC_GC; one panel per plasmid, colour = star
sns.relplot(data=data, x='mean_Intensity_GC_GC', y='mean_Intensity_DFC_GC', hue='star', col='plasmid')

In [None]:
# Save the full table.  NOTE(review): a later cell writes a column-reduced table to this same file name.
data.to_csv('GC_DFC_quantification.csv')

In [None]:
# Show the row labelled 0 as a column of values
data.T[0]

In [None]:
# Drop every column whose name contains 'target', strip '18S' from the names
# of the '18Sstar' columns, and write the result to CSV.
target_columns = [col for col in data.columns if 'target' in col]
star_renames = {col: col.replace('18S', '') for col in data.columns if '18Sstar' in col}
trimmed = data.drop(columns=target_columns).rename(columns=star_renames)
trimmed.to_csv('GC_DFC_quantification.csv')

In [None]:
def add_rim(result, directory, common, bins=1, total=10):
    """Add DFC and FC rim-enrichment columns computed from radial distributions.

    Reads the ``RadialDistribution_FracAtD*`` columns of
    ``InitialNucleoli.csv``, sums the outermost ``bins`` of ``total`` bins
    for the DFC, FC and InitialNucleoliObjectImage measurements, and divides
    the DFC and FC sums by the InitialNucleoliObjectImage sum.

    Parameters
    ----------
    result :
        Accumulated table, passed through to ``merge_result``.
    directory : str or pathlib.Path
        Folder containing ``InitialNucleoli.csv``.
    common : list of str
        Identifier columns read alongside the distribution columns.
    bins : int, optional
        Number of outermost bins to sum.
    total : int, optional
        Total number of bins, as used in the ``{bin}of{total}`` column names.

    Returns
    -------
    Whatever ``merge_result`` returns for ``(result, distributions, map_cols)``.
    """
    nucleoli_csv = Path(directory) / 'InitialNucleoli.csv'
    # Let pandas parse the header: nothing is left open, and the last column
    # name does not keep a trailing newline (which made usecols fail whenever
    # a RadialDistribution column came last).
    header = pd.read_csv(nucleoli_csv, nrows=0).columns
    cols = [c for c in header if c.startswith('RadialDistribution_FracAtD')]
    distributions = pd.read_csv(nucleoli_csv, usecols=list(common) + cols)

    # Outermost bins, counted inward from `total`
    outer_bins = list(range(total, total - bins, -1))

    def _outer_sum(measurement):
        """Row-wise sum of `measurement` over the outermost bins."""
        names = [f'RadialDistribution_FracAtD_{measurement}_{b}of{total}' for b in outer_bins]
        return distributions[names].sum(axis=1)

    relative_areas = _outer_sum('InitialNucleoliObjectImage')
    distributions['dfc_rim_enrichment'] = _outer_sum('DFC') / relative_areas
    distributions['fc_rim_enrichment'] = _outer_sum('FC') / relative_areas

    map_cols = {
        'dfc_rim_enrichment': 'dfc_rim_enrichment',
        'fc_rim_enrichment': 'fc_rim_enrichment',
    }
    # NOTE(review): merge_result is not defined or imported in the visible
    # notebook cells -- confirm where it comes from.
    result = merge_result(result, distributions, map_cols)

    return result

def read_rim_data(directory, regex=None):
    """Build the rim-enrichment table for one output directory.

    Calls ``build_initial_data``, ``add_npm_partitioning`` (with
    ``other_name='FC'``) and ``add_rim`` (outer 4 of 20 bins) in turn, then
    drops the ``Parent_DilatedNucleoli`` column from the result.
    """
    out_dir = Path(directory)
    # Identifier columns shared by every step.
    # NOTE(review): presumably image and object number identify a nucleolus
    # and Parent_DilatedNucleoli links it to its parent object -- confirm.
    id_columns = ['ImageNumber', 'ObjectNumber', 'Parent_DilatedNucleoli']

    table = build_initial_data(out_dir, id_columns, regex)
    table = add_npm_partitioning(table, out_dir, id_columns, other_name='FC')
    table = add_rim(table, out_dir, id_columns, bins=4, total=20)

    return table.drop(columns='Parent_DilatedNucleoli')

# The pattern captures SSU (SSUWT|SSUmut) from the folder name.
data = read_rim_data('/scratch/gpfs/tcomi/cp_morphology_240617/SSU_rim/ssu_240801/outputs',
                     regex=r'/SSU_rim/(?P<SSU>SSUWT|SSUmut)/.*nd2')
# Files with at least one missing value.  Shown with display() because only
# the last expression of a cell is rendered.
display(data[data.isna().any(axis=1)].Metadata_FileLocation.unique())
data

In [None]:
# dfc_rim_enrichment against fc_rim_enrichment, coloured by SSU
sns.scatterplot(data=data, hue='SSU', x='fc_rim_enrichment', y='dfc_rim_enrichment')

In [None]:
# Write the rim-enrichment table (index included) to CSV
data.to_csv('ssu_rim_240801.csv')