In [1]:
import functools
import re
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.spatial.distance
import seaborn as sns

import utils

%load_ext autoreload
%autoreload 2

In [None]:
def merge_result(result_df, df, map_cols, merge_on='Parent_DilatedNucleoli'):
    """Join selected, renamed columns of ``df`` onto ``result_df``.

    Args:
        result_df: table being built up; must contain ``ImageNumber`` and
            the ``merge_on`` column.
        df: incoming measurements; must contain ``ImageNumber``, ``merge_on``
            and every key of ``map_cols``.
        map_cols: mapping of column name in ``df`` -> name in the output.
        merge_on: second join key used alongside ``ImageNumber``.

    Returns:
        The (default, inner) merge of ``result_df`` with the renamed subset.
    """
    join_keys = ['ImageNumber', merge_on]
    wanted = join_keys + list(map_cols.keys())
    renamed = df[wanted].rename(columns=map_cols)
    return result_df.merge(renamed, on=join_keys)

def build_initial_data(directory, common, regex):
    """Create the base table with one row per (image, parent nucleolus).

    Args:
        directory: ``Path`` holding ``InitialNucleoli.csv`` and ``Image.csv``.
        common: identifier columns to read from ``InitialNucleoli.csv``.
        regex: optional pattern with named groups applied to
            ``Metadata_FileLocation``; each group becomes an image-level column.

    Returns:
        DataFrame with the image metadata, ``NucleolusNumber`` (the renamed
        parent id), the original ``Parent_DilatedNucleoli`` key (kept for the
        later merges) and ``Count_GC``, the number of objects per parent.
    """
    parent_keys = ['ImageNumber', 'Parent_DilatedNucleoli']

    nucleoli = pd.read_csv(
        directory / 'InitialNucleoli.csv',
        usecols=common + ['AreaShape_Area'],
    )

    # image-level metadata, mainly the file location
    image_info = pd.read_csv(
        directory / 'Image.csv',
        usecols=['Metadata_FileLocation', 'ImageNumber', 'Metadata_Series'],
    )
    if regex:
        # named groups of the regex (time, treatment, ...) become columns
        parsed = image_info['Metadata_FileLocation'].str.extract(regex)
        image_info = image_info.join(parsed)

    # one row per parent object, exposed to users as NucleolusNumber
    parents = nucleoli[parent_keys].drop_duplicates().rename(
        columns={'Parent_DilatedNucleoli': 'NucleolusNumber'}
    )
    annotated = parents.merge(image_info, on='ImageNumber', how="left")

    # number of objects per parent; joined on the renamed key, which also
    # brings the Parent_DilatedNucleoli column back into the table
    objects_per_parent = (
        nucleoli.groupby(parent_keys)['AreaShape_Area']
        .count()
        .rename('Count_GC')
        .reset_index()
    )
    return annotated.merge(
        objects_per_parent,
        left_on=['ImageNumber', 'NucleolusNumber'],
        right_on=['ImageNumber', 'Parent_DilatedNucleoli'],
    )

def add_npm_partitioning(result, directory, common, other_name='IF'):
    """Add per-object intensity/shape measurements and partitioning ratios.

    Reads object measurements from ``InitialNucleoli.csv`` and rim
    measurements from ``NucleoplasmRim.csv``, merges both onto ``result``
    via ``merge_result`` and adds ``partitioning_<marker>`` columns, each the
    ratio of the object mean intensity to the nucleoplasm mean intensity.

    Args:
        result: table from ``build_initial_data``.
        directory: ``Path`` containing the two CSV files.
        common: identifier columns read from both files.
        other_name: suffix of the ``Intensity_MeanIntensity_*`` column that
            is reported as the ``target`` channel.

    Returns:
        ``result`` with the measurement and partitioning columns added.
    """
    target_col = f'Intensity_MeanIntensity_{other_name}'

    # ---- object measurements ----
    object_cols = [
        'Intensity_MeanIntensity_GC', target_col,
        'Intensity_MeanIntensity_DFC',
        'Intensity_MeanIntensity_star18S', 'AreaShape_Area',
        'AreaShape_Eccentricity', 'AreaShape_Perimeter',
        'AreaShape_Center_X', 'AreaShape_Center_Y',
    ]
    nucleoli = pd.read_csv(
        directory / 'InitialNucleoli.csv',
        usecols=common + object_cols,
    )

    # circularity = 4 * pi * area / perimeter ** 2
    nucleoli['AreaShape_Cicularity'] = (
        4 * np.pi * nucleoli['AreaShape_Area'] / nucleoli['AreaShape_Perimeter'] ** 2
    )

    object_names = {
        'AreaShape_Eccentricity': 'mean_GC_eccentricity',
        'AreaShape_Cicularity': 'mean_GC_circularity',
        'Intensity_MeanIntensity_GC': 'mean_Intensity_GC_GC',
        'Intensity_MeanIntensity_DFC': 'mean_Intensity_DFC_GC',
        target_col: 'mean_Intensity_target_GC',
        'Intensity_MeanIntensity_star18S': 'mean_Intensity_18Sstar_GC',
        'AreaShape_Area': 'GC_area',
        'AreaShape_Center_X': 'Center_X',
        'AreaShape_Center_Y': 'Center_Y',
    }
    result = merge_result(result, nucleoli, object_names)

    # ---- nucleoplasm rim measurements ----
    rim_cols = [
        'Intensity_MeanIntensity_GC', target_col,
        'Intensity_MeanIntensity_star18S',
    ]
    rim = pd.read_csv(
        directory / 'NucleoplasmRim.csv',
        usecols=common + rim_cols,
    )
    rim_names = {
        'Intensity_MeanIntensity_GC': 'mean_Intensity_GC_nucleoplasm',
        target_col: 'mean_Intensity_target_nucleoplasm',
        'Intensity_MeanIntensity_star18S': 'mean_Intensity_18Sstar_nucleoplasm',
    }
    result = merge_result(result, rim, rim_names)

    # ---- partitioning = object mean / nucleoplasm mean ----
    # additive smoothing is currently switched off (0.0)
    smoothing = 0.00
    for marker in ('GC', 'target', '18Sstar'):
        inside = result[f'mean_Intensity_{marker}_GC'] + smoothing
        outside = result[f'mean_Intensity_{marker}_nucleoplasm'] + smoothing
        result[f'partitioning_{marker}'] = inside / outside
    return result

def add_rdf(result, directory, common):
    """Load the radial distribution function (RDF) columns in long format.

    Every ``RDF_*`` column of ``DilatedNucleoli.csv`` is read, melted to one
    row per (image, object, column) and split into per-channel intensities
    (``RDF_Intensity_C<channel>_R<radius>``) and counts
    (``RDF_Counts_R<radius>``), which are then joined on image, object and
    radius.

    Args:
        result: per-nucleolus table; returned unchanged.
        directory: ``Path`` containing ``DilatedNucleoli.csv``.
        common: unused; kept so the signature matches the other ``add_*``
            helpers.

    Returns:
        ``(result, rdf)`` where ``rdf`` has the columns ``ImageNumber``,
        ``ObjectNumber``, ``intensity``, ``channel``, ``radius`` and
        ``counts``.
    """
    csv_path = directory / 'DilatedNucleoli.csv'

    # Only the header line is needed to discover the RDF columns; the context
    # manager closes the handle, which was previously left open.
    with open(csv_path, 'r') as handle:
        header = handle.readline()
    # strip() removes the newline attached to the last header field
    rdf_cols = [c.strip() for c in header.split(',') if c.startswith('RDF_')]

    nucl = pd.read_csv(
        csv_path,
        usecols=['ImageNumber', 'ObjectNumber'] + rdf_cols,
    )
    nucl = nucl.melt(id_vars=['ImageNumber', 'ObjectNumber'])

    # intensity: channel and radius are encoded in the column name
    rdf = nucl[nucl.variable.str.startswith('RDF_Intensity')].reset_index(drop=True)
    extract = rdf.variable.str.extract(r'RDF_Intensity_C(\d)_R([-0-9]+)')
    rdf = rdf.assign(
        channel=extract[0].astype(int),
        radius=extract[1].astype(int),
    ).rename(columns={'value': 'intensity'}).drop(columns='variable')

    # counts: only the radius is encoded in the column name
    counts = nucl[nucl.variable.str.startswith('RDF_Count')].reset_index(drop=True)
    extract = counts.variable.str.extract(r'RDF_Counts_R([-0-9]+)')
    counts = counts.assign(
        radius=extract[0].astype(int)
    ).rename(columns={'value': 'counts'}).drop(columns='variable')

    rdf = rdf.merge(counts, on=['ImageNumber', 'ObjectNumber', 'radius'])

    return result, rdf
    
def read_data(directory, regex=None):
    """Read one output directory into a per-nucleolus table plus RDF profiles.

    Args:
        directory: path (str or ``Path``) of the output folder.
        regex: optional pattern with named groups, applied to each image's
            file location by ``build_initial_data``.

    Returns:
        ``(table, rdf)``: the measurement table without the internal
        ``Parent_DilatedNucleoli`` key, and the long-format RDF table.
    """
    directory = Path(directory)
    # Image and object number are the unique identifiers; the parent column
    # links each object to the dilated nucleolus it belongs to.
    id_cols = ['ImageNumber', 'ObjectNumber', 'Parent_DilatedNucleoli']

    table = build_initial_data(directory, id_cols, regex)
    table = add_npm_partitioning(table, directory, id_cols)
    table, rdf = add_rdf(table, directory, id_cols)

    table = table.drop(columns='Parent_DilatedNucleoli')
    return table, rdf
    
# Load the per-nucleolus table and the RDF profiles. The regex's named groups
# (`SSU`, `target`) are extracted from each image's file location.
data, rdf = read_data('SSU_IF/SSU_IF/outputs/', regex=r'/[A-G]\d+_+(?P<SSU>WTSSU|mutSSU).*_(?P<target>RPS6|surf6|RRP1|DDX21|nucleolin|EXOSC10).*nd2')
# data = data[~data.isna().any(axis=1)].reset_index(drop=True)
# Show the files that have at least one missing value in any column.
data[data.isna().any(axis=1)].Metadata_FileLocation.unique()
# data.groupby(['SSU', 'target']).ImageNumber.count()
# dat.Metadata_FileLocation.unique()
# merged.to_csv('SSU_IF.csv')

## Initial analyses

In [None]:
# Area vs. 18S* intensity, one panel per target, coloured by SSU construct.
grid = sns.relplot(data=data, x='mean_Intensity_18Sstar_GC', y='GC_area', col='target', col_wrap=2, hue='SSU', alpha=0.2)
# Draw the same reference lines on every panel.
# NOTE(review): the intensity/area values below are hard-coded; presumably
# hand-picked candidate gates -- confirm and consider naming them.
for ax in grid.axes.flat:
    ax.axvline(0.00075)
    ax.axvline(0.002)
    ax.axhline(3000)
    ax.axhline(300)


In [None]:
# Rule-based classification from the partitioning ratios:
#   de novo    -> partitioning_GC < 1.5 and partitioning_18Sstar > 1.5
#   endogenous -> partitioning_18Sstar < 1.1
#   other      -> everything else (the default)
# This mutates `data` in place, so later cells depend on this cell having run.
data['nucleoli_type'] = 'other'
data.loc[(data['partitioning_GC'] < 1.5) & (data['partitioning_18Sstar'] > 1.5), 'nucleoli_type'] = 'de novo'
# data.loc[(data['partitioning_18Sstar'] > 1.5), 'nucleoli_type'] = 'de novo'
data.loc[(data['partitioning_18Sstar'] < 1.1), 'nucleoli_type'] = 'endogenous'

In [None]:
# Inspect which columns are available after loading and classification.
data.columns

In [None]:
# Area vs. 18S* intensity split by assigned class (rows) and SSU (columns).
grid = sns.relplot(data=data, x='mean_Intensity_18Sstar_GC', y='GC_area', hue='nucleoli_type', row='nucleoli_type', col='SSU', alpha=0.2)

# Same hard-coded reference lines on every panel.
# NOTE(review): the vertical lines differ from the earlier per-target plot
# (0.0025/0.004 here vs. the commented 0.00075/0.002) -- confirm which apply.
for ax in grid.axes.flat:
    
    ax.axvline(0.0025)
    ax.axvline(0.004)
    # ax.axvline(0.00075)
    # ax.axvline(0.002)
    ax.axhline(3000)
    ax.axhline(300)

In [None]:
# ECDFs of the GC-channel mean intensity: first compare SSU constructs within
# each class, then compare classes within each SSU construct.
sns.displot(data=data, hue='SSU', x='mean_Intensity_GC_GC', kind='ecdf', col='nucleoli_type', facet_kws=dict(sharex='row', sharey='row'))
sns.displot(data=data, hue='nucleoli_type', x='mean_Intensity_GC_GC', kind='ecdf', col='SSU', facet_kws=dict(sharex='row', sharey='row'))
# sns.displot(data=data, hue='SSU', x='mean_GC_circularity', kind='ecdf', col='nucleoli_type', facet_kws=dict(sharex='row', sharey='row'))
# sns.displot(data=data, hue='SSU', x='mean_GC_eccentricity', kind='ecdf', col='nucleoli_type', facet_kws=dict(sharex='row', sharey='row'))
# sns.violinplot(data=data, x='nucleoli_type', y='GC_area', hue='SSU') 

In [None]:
# Visual check of the classification in the partitioning plane it was defined on.
sns.scatterplot(data=data, x='partitioning_18Sstar', y='partitioning_GC', hue='nucleoli_type')

In [None]:
# ECDF of the target-channel mean intensity per target (rows) and SSU (columns), coloured by class.
sns.displot(data=data, row='target', col='SSU', x='mean_Intensity_target_GC', kind='ecdf', hue='nucleoli_type', facet_kws=dict(sharex='row', sharey='row'))

In [None]:
# Target partitioning in de novo nucleoli only: WT vs. mutant SSU, one panel per target.
sns.displot(data=data[data.nucleoli_type == 'de novo'], col='target', col_wrap=2, x='partitioning_target', kind='ecdf', hue='SSU', facet_kws=dict(sharex=False, sharey=False))

In [None]:
# Same comparison as the de novo plot, restricted to endogenous nucleoli.
sns.displot(data=data[data.nucleoli_type == 'endogenous'], col='target', col_wrap=2, x='partitioning_target', kind='ecdf', hue='SSU', facet_kws=dict(sharex=False, sharey=False))

In [None]:
# Persist the classified per-nucleolus table (without the index column).
data.to_csv('SSU_IF.csv', index=False)

In [None]:

# Target intensity inside the object vs. in the nucleoplasm rim; these two columns form the partitioning_target ratio.
sns.relplot(data=data, x='mean_Intensity_target_GC', y='mean_Intensity_target_nucleoplasm', row='target', col='nucleoli_type', hue='SSU', facet_kws=dict(sharex='row', sharey='row'))

## RDF curves

In [None]:
# Annotate every RDF row with SSU, target and class of its nucleolus.
# NOTE: despite the names, no filtering happens here -- the de novo-only
# selection is commented out, so `de_novo` contains all nucleoli types.
to_keep = data.copy()#[data.nucleoli_type == 'de novo'].copy()
# Inner merge: RDF rows without a matching (ImageNumber, NucleolusNumber) in `to_keep` are dropped.
de_novo = rdf.merge(to_keep[['ImageNumber', 'NucleolusNumber', 'SSU', 'target', 'nucleoli_type']], left_on=['ImageNumber', 'ObjectNumber'], right_on=['ImageNumber', 'NucleolusNumber'])

In [None]:
# Pool the RDF over nucleoli: for each (SSU, target, channel, radius, class)
# compute the count-weighted mean intensity, sum(intensity * counts) / sum(counts).
# Missing intensity*counts products contribute 0 to the numerator.
groups = ['SSU', 'target', 'channel', 'radius', 'nucleoli_type']
# Label for each channel index (index 0 is an empty placeholder).
# NOTE(review): channel order is hard-coded -- confirm against the acquisition settings.
channels = ['', 'target', 'Nop56', '18S', 'NPM1']
rdf_data = []
for name, dat in de_novo.groupby(groups):
    rdf_data.append(dict(
        zip(groups, name),
        intensity=((dat['intensity'] * dat['counts']).fillna(0).sum()) / dat['counts'].sum(),
        # keyword overrides the numeric 'channel' from the zip with its label
        channel=channels[name[2]]
    ))
rdf_data = pd.DataFrame(rdf_data)         

In [None]:
# Save the pooled RDF curves (the index is written as the first column).
rdf_data.to_csv('cropped_images/rdf_data.csv')

In [None]:
# Overview of the pooled RDF curves: channel per column, target per row; line style = SSU, colour = class.
sns.relplot(data=rdf_data, x='radius', y='intensity', col='channel', row='target', kind='line', style='SSU', hue='nucleoli_type', facet_kws=dict(sharex=True, sharey=False))

In [None]:
# Same curves with the channels overlaid in each panel: class per column,
# target per row; colour = channel, line style = SSU.
sns.relplot(
    data=rdf_data,
    x='radius',
    y='intensity',
    style='SSU',
    hue='channel',
    col='nucleoli_type',
    kind='line',
    row='target',
    facet_kws=dict(sharex=True, sharey=False),
)

In [None]:
# Selection used by the following plotting cells.
ssu = 'mutSSU'
nuc = 'endogenous'
# Target-channel curves for three targets, min-max normalised to [0, 1] per target.
target_dat = rdf_data[
    (rdf_data.SSU == ssu) &
    (rdf_data.nucleoli_type == nuc) &
    (rdf_data.target.isin(('DDX21', 'EXOSC10', 'RPS6'))) &
    (rdf_data.channel == "target")
    ].copy()
target_dat['normalized_intensity'] = target_dat.groupby('target').intensity.transform(lambda x: (x - x.min()) / (x.max() - x.min()), )

# Reference channels (Nop56 and 18S) for the same selection, all targets,
# min-max normalised per (target, channel).
other_dat = rdf_data[
        (rdf_data.SSU == ssu) &
        (rdf_data.nucleoli_type == nuc) &
        (rdf_data.channel.isin(("Nop56", "18S")))
    ].copy()
other_dat['normalized_intensity'] = other_dat.groupby(['target', 'channel']).intensity.transform(lambda x: (x - x.min()) / (x.max() - x.min()), )

In [None]:
# Normalised target curves (solid) with the reference channels from
# `other_dat` (dashed) for the `ssu` / `nuc` selection of the previous cell.
ax = sns.lineplot(
    data=target_dat,
    x='radius',
    y='normalized_intensity',
    hue='target',
)
# other_dat holds several targets per channel, so each dashed line aggregates over them.
sns.lineplot(
    data=other_dat,
    x='radius',
    y='normalized_intensity',
    hue='channel',
    linestyle='--',
    ax=ax
)
# File name encodes the selection; spaces in the class name become underscores.
plt.savefig(f'{ssu}_{nuc.replace(" ", "_")}.pdf')             

In [None]:
# Normalised target curves, one panel per target, for the current
# `ssu` / `nuc` selection.
g = sns.relplot(
    data=target_dat,
    x='radius',
    y='normalized_intensity',
    hue='target',
    kind='line',
    col='target',
    col_wrap=2,
)

# Overlay a dashed black reference curve on every panel.
# NOTE(review): `npm_dat` is not defined anywhere above this cell; it is only
# created in a later cell of the cropped-data section, so this cell fails
# with NameError on Restart & Run All and otherwise silently uses data from
# a different selection. Confirm which reference curve is intended here.
for ax in g.axes.flat:
    sns.lineplot(
        data=npm_dat,
        x='radius',
        y='normalized_intensity',
        linestyle='--',
        c='k',
        ax=ax,
)
             

In [None]:
# 18S channel of de novo nucleoli for the DDX21 and EXOSC10 experiments.
# This rebinds `other_dat` from the earlier cell.
other_dat = rdf_data[
        (rdf_data.target.isin(('DDX21', 'EXOSC10'))) &
        (rdf_data.nucleoli_type == 'de novo') &
        (rdf_data.channel.isin(("18S",)))
    ].copy()
# Min-max normalise per (SSU, class); both targets share one normalisation range.
other_dat['normalized_intensity'] = other_dat.groupby(['SSU', 'nucleoli_type']).intensity.transform(lambda x: (x - x.min()) / (x.max() - x.min()), )

In [None]:
# Normalised 18S profile, WT vs. mutant SSU; the line aggregates over the targets kept in `other_dat`.
sns.relplot(
    data=other_dat,
    x='radius',
    y='normalized_intensity',
    hue='SSU',
    kind='line',
    row='nucleoli_type',
)

## Sample images

In [None]:
import aicsimageio
import urllib.parse

In [None]:
# Area distribution per class, zoomed to small objects (x axis limited to 0-1000).
ax = sns.ecdfplot(data=data, x='GC_area', hue='nucleoli_type')
ax.set_xlim(left=0, right=1000)

In [None]:
# For a range of area cutoffs, tabulate the fraction of endogenous and of
# de novo nucleoli whose GC area lies below the cutoff.
is_endogenous = data.nucleoli_type == 'endogenous'
is_de_novo = data.nucleoli_type == 'de novo'

thresh_dat = []
for cutoff in range(200, 1001, 50):
    below_cutoff = data.GC_area < cutoff
    thresh_dat.append(dict(
        cutoff=cutoff,
        f_endogenous=len(data[below_cutoff & is_endogenous]) / len(data[is_endogenous]),
        f_denovo=len(data[below_cutoff & is_de_novo]) / len(data[is_de_novo]),
    ))
pd.DataFrame(thresh_dat)

In [None]:
# Pick example nucleoli to display: de novo / endogenous objects with an area
# between 400 and 600 whose centre is at least 50 px from the lower image
# edges and below 2204, taking the first 16 per (SSU, class) group.
# NOTE(review): 2204 presumably equals image size (2254) minus the 50 px crop
# radius used in the next cell -- confirm the image dimensions.
to_show = data[
  # (data.target == 'EXOSC10') &
  (data.GC_area < 600) &
  (data.GC_area > 400) &
  (data.Center_X > 50) &
  (data.Center_Y > 50) &
  (data.Center_X < 2204) &
  (data.Center_Y < 2204) &
  (data.nucleoli_type.isin(('de novo', 'endogenous')))
].copy().groupby(['SSU', 'nucleoli_type']).head(16)
# Peek at the first selected row.
to_show.iloc[0]

In [None]:
# Draw a 100x100 px crop around each selected nucleolus on an 8x8 grid and
# save the sheet as a PDF. Panels beyond len(to_show) are left untouched.
fig, axes = plt.subplots(8, 8, figsize=(24, 24))
radius = 50
import warnings
# NOTE(review): the `action=` keyword of catch_warnings needs Python 3.11+.
with warnings.catch_warnings(action="ignore"):  # ignore division warning in normalization
    for (ind, row), ax in zip(to_show.iterrows(), axes.flatten()):
        # File locations are URL-encoded; decode before reading the image.
        img = aicsimageio.imread(urllib.parse.unquote(row.Metadata_FileLocation)).squeeze()
        sub_img = np.zeros((radius*2, radius*2, 3))
        
        # channels = ['', 'target', 'Nop56', '18S', 'NPM1']
        # Center_Y indexes the first image axis and Center_X the second, so
        # `x` below is the row index and `y` the column index.
        x, y = int(row.Center_Y), int(row.Center_X)
        sub_img[:, :, 0] = img[1, x-radius:x+radius, y-radius:y+radius]  # red is nop56
        sub_img[:, :, 1] = img[0, x-radius:x+radius, y-radius:y+radius]  # green is target
        # sub_img[:, :, 2] = img[2, x-radius:x+radius, y-radius:y+radius]  # green is target
        # Per-channel min-max normalisation. The blue channel is never filled,
        # so it evaluates 0/0 -- the source of the suppressed warning.
        sub_img = (sub_img - sub_img.min(axis=(0, 1), keepdims=True)) / (sub_img.max(axis=(0, 1), keepdims=True) - sub_img.min(axis=(0, 1), keepdims=True))
        ax.imshow(sub_img)
        ax.set_title(f"{row.SSU} - {row.nucleoli_type}")
        ax.set_axis_off()
        
    plt.tight_layout()
plt.savefig('sample_images.pdf')

## Show all images, excluding nucleoli with GC area ≤ 400, ordered by 18S intensity

In [None]:
import aicsimageio
import urllib.parse
import warnings

# Half-width (px) of the square crop drawn around each nucleolus.
radius = 100
# Keep nucleoli larger than 400 px whose full crop window lies inside the
# image, brightest 18S* first. The reset index makes labels equal positions.
# NOTE(review): 2254 is presumably the image width/height in pixels -- confirm.
to_show = data[
  (data.GC_area > 400) &
  (data.Center_X > radius) &
  (data.Center_Y > radius) &
  (data.Center_X < 2254 - radius) &
  (data.Center_Y < 2254 - radius)
].copy().sort_values(by='mean_Intensity_18Sstar_GC', ascending=False).reset_index(drop=True)

to_show.to_csv('manual_samples.csv')
# Not used in this cell; kept so the kernel variable `selected` is unchanged.
selected = pd.read_csv('cropped_images/Lifei_SSUdenovo.csv')

fig, axes = plt.subplots(11, 8, figsize=(24, 33))
panels = axes.flatten()
# Every 89th row starting at position 88 (a sparse subsample of `to_show`).
df = to_show.iloc[88::89]
rows = df.iterrows()
imgs = {}  # cache of normalised images, keyed by decoded file name

# Ceiling division so an exact multiple of the panel count does not emit an
# extra page; at least one page is always written, as before.
n_pages = max(1, -(-len(df) // len(panels)))

with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # tight layout warning
    for page in range(1, n_pages + 1):
        # Blank every panel first so a partially filled last page does not
        # keep crops from the previous page.
        for ax in panels:
            ax.clear()
            ax.set_axis_off()

        # `panels` must be the first zip argument: zip then stops on the
        # exhausted panels without consuming (and losing) a row from `rows`.
        for ax, (ind, row) in zip(panels, rows):
            img_name = urllib.parse.unquote(row.Metadata_FileLocation)
            if img_name not in imgs:
                im = aicsimageio.imread(img_name).squeeze()
                # min-max normalise each channel over the whole image
                imgs[img_name] = ((im - im.min(axis=(1, 2), keepdims=True)) /
                    (im.max(axis=(1, 2), keepdims=True) - im.min(axis=(1, 2), keepdims=True))).copy()
            img = imgs[img_name]
            sub_img = np.zeros((radius*2, radius*2, 3))

            # channels = ['', 'target', 'Nop56', '18S', 'NPM1']
            # Center_Y indexes the first image axis, Center_X the second.
            x, y = int(row.Center_Y), int(row.Center_X)
            sub_img[:, :, 0] = img[1, x-radius:x+radius, y-radius:y+radius]  # red is nop56
            sub_img[:, :, 1] = img[2, x-radius:x+radius, y-radius:y+radius]  # green is 18S
            ax.imshow(sub_img)
            ax.set_title(f'{ind} - {x} - {y} - {row.ImageNumber}')
            ax.set_axis_off()

        plt.tight_layout()
        plt.savefig(f'cropped_images/unseen_{page:02}.pdf')


In [None]:
# Build the manually curated set: the hand-picked rows minus the excluded
# ones, followed by every row of `to_show` from position 1958 onwards.
# Depends on `to_show` from the previous cell; 'row number' is used as a
# positional index into it.
# NOTE(review): 1958 is a hard-coded boundary -- presumably where the
# endogenous nucleoli start in the sorted table; confirm.
selected = pd.read_csv('cropped_images/Lifei_SSUdenovo.csv')
exclude = pd.read_csv('cropped_images/subset_lifei_SSU_sofi_to_exclude.txt', header=None)
selected = selected[~selected['row number'].isin(exclude[0])]
selected = pd.concat((to_show.iloc[selected['row number']], to_show.iloc[1958:]))
selected.to_csv('cropped_images/selected_nucleoli.csv')

## Look at manually classified nucleoli

In [None]:
# Rebinds `data` to the background-corrected table; from here on `data` no
# longer refers to the table loaded at the top of the notebook.
data = pd.read_csv('cropped_images/bkg_corr_nucleoli.csv', index_col=0)
# Number of nucleoli per SSU / target / class.
data.groupby(['SSU', 'target', 'nucleoli_type']).GC_area.count()

In [None]:
# Classes in the partitioning plane, one panel per class (rows) and SSU (columns).
grid = sns.relplot(data=data, x='partitioning_18Sstar', y='partitioning_GC', hue='nucleoli_type', row='nucleoli_type', col='SSU', alpha=0.2)

# Lines at 1.5 / 1.5 / 1.1: the thresholds of the rule-based classification
# kept (commented out) at the bottom of this cell.
for ax in grid.axes.flat:
    
    ax.axvline(1.5)
    ax.axhline(1.5)
    ax.axvline(1.1)

# data['nucleoli_type'] = 'other'
# data.loc[(data['partitioning_GC'] < 1.5) & (data['partitioning_18Sstar'] > 1.5), 'nucleoli_type'] = 'de novo'
# data.loc[(data['partitioning_18Sstar'] < 1.1), 'nucleoli_type'] = 'endogenous'

In [None]:
# Display the curated selection; requires the cell that builds `selected` to have run in this kernel.
selected

## Apply manual classes from non-background corrected data to the background corrected

In [None]:
# Reload the curated selection and label it: 'de novo' by default, with index
# labels from 1958 onwards set to 'endogenous'.
# NOTE(review): `.loc[1958:]` slices by index *label*, and the saved index is
# not guaranteed to be sorted or unique -- confirm that label 1958 marks the
# intended boundary.
manual = pd.read_csv('cropped_images/selected_nucleoli.csv', index_col=0)
manual['nucleoli_type'] = 'de novo'
manual.loc[1958:, 'nucleoli_type'] = 'endogenous'

def found_in_manual(row):
    """Tell whether ``row`` matches a manually selected nucleolus.

    A match is an entry of the module-level ``manual`` table with the same
    ``Metadata_FileLocation`` whose centre lies less than 1 px (Euclidean
    distance) from the row's ``(Center_X, Center_Y)``.

    Requires ``scipy.spatial.distance`` to be imported by the notebook.

    Args:
        row: a record with ``Metadata_FileLocation``, ``Center_X`` and
            ``Center_Y`` fields (a DataFrame row as passed by ``apply``).

    Returns:
        A plain ``bool``: True when such an entry exists, False otherwise,
        including when the image has no manual entries at all.
    """
    in_img = manual[manual.Metadata_FileLocation == row.Metadata_FileLocation]
    if in_img.empty:
        # no manual selections in this image: nothing to compare against
        return False
    point = np.expand_dims(row[['Center_X', 'Center_Y']].to_numpy(float), axis=0)
    dists = scipy.spatial.distance.cdist(point, in_img[['Center_X', 'Center_Y']].to_numpy(float))
    return bool(dists.min() < 1)
    
# Keep only rows that coincide with a manually selected nucleolus. This
# rebinds `data`, so re-running the cell filters the already-filtered table.
data = data[data.apply(found_in_manual, axis=1)]
# Number of rows that survived the filter.
len(data)

In [None]:
def manual_class(row):
    """Return the manual class of the curated nucleolus closest to ``row``.

    Looks up the entries of the module-level ``manual`` table that share the
    row's ``Metadata_FileLocation`` and returns the ``nucleoli_type`` of the
    one whose centre is nearest (Euclidean distance) to the row's
    ``(Center_X, Center_Y)``. No distance limit is applied here.

    Requires ``scipy.spatial.distance`` to be imported by the notebook.

    Args:
        row: a record with ``Metadata_FileLocation``, ``Center_X`` and
            ``Center_Y`` fields (a DataFrame row as passed by ``apply``).

    Returns:
        The ``nucleoli_type`` value of the nearest manual entry.

    Raises:
        ValueError: if ``manual`` has no entry for the row's image. This
            used to surface as an opaque ``argmin`` error.
    """
    in_img = manual[manual.Metadata_FileLocation == row.Metadata_FileLocation]
    if in_img.empty:
        raise ValueError(
            f'no manually selected nucleoli for image {row.Metadata_FileLocation!r}; '
            'filter the table with found_in_manual first'
        )
    point = np.expand_dims(row[['Center_X', 'Center_Y']].to_numpy(float), axis=0)
    dists = scipy.spatial.distance.cdist(point, in_img[['Center_X', 'Center_Y']].to_numpy(float))
    return in_img.iloc[dists.argmin()]['nucleoli_type']

# Overwrite the class of every remaining row with the manual class of its nearest curated nucleolus.
data['nucleoli_type'] = data.apply(manual_class, axis=1)

In [None]:
# NOTE: this overwrites the same file that `data` was read from above, so the
# unfiltered background-corrected table is no longer on disk afterwards.
data.to_csv('cropped_images/bkg_corr_nucleoli.csv')

## Include cropped data

In [None]:
# Load the cropped-image outputs; the regex also captures `image_class`
# ('denoised' or 'endogenous') from the folder name.
# NOTE(review): absolute, user-specific path -- not portable to other machines.
crop_data, crop_rdf = read_data('/home/tcomi/Desktop/cp_outputs/SSU_test_raw', regex=r'SSU_IF/cropped/(?P<image_class>denoised|endogenous).*/[A-G]\d+_+(?P<SSU>WTSSU|mutSSU).*_(?P<target>RPS6|surf6|RRP1|DDX21|nucleolin|EXOSC10).*nd2')
# Rename the 'denoised' folder class to 'de novo'.
crop_data.loc[crop_data.image_class == 'denoised', 'image_class'] = 'de novo'

In [None]:
# Rows per target / class / SSU. Not displayed: only the last expression of a cell is shown.
crop_data.groupby(['target', 'image_class', 'SSU']).Count_GC.count()
# Number of rows whose class is not 'de novo' (this is the displayed value).
(crop_data.image_class != 'de novo').sum()
# crop_data.to_csv('cropped_ssu.csv')

In [None]:
# Use the size distribution to estimate a high-pass area threshold.
# Only objects with GC_area < 1000 are shown.

sns.histplot(x='GC_area', data=crop_data[crop_data.GC_area < 1000], hue='image_class')

In [None]:
# Area vs. 18S* intensity for the cropped data, coloured by image class.
# ax = sns.scatterplot(data=crop_data, x='partitioning_18Sstar', y='GC_area', hue='image_class')
ax = sns.scatterplot(data=crop_data, x='mean_Intensity_18Sstar_GC', y='GC_area', hue='image_class')

# Same hard-coded reference lines as in the classified full-data plot.
ax.axvline(0.0025)
ax.axvline(0.004)
ax.axhline(3000)
ax.axhline(300)

In [None]:
# Count-weighted mean RDF per (SSU, target, channel, radius, image class) for
# the cropped data. This rebinds `groups`, `channels` and `rdf_data` from the
# earlier RDF section.
merged_dat = crop_rdf.merge(crop_data[['ImageNumber', 'NucleolusNumber', 'SSU', 'target', 'image_class']], left_on=['ImageNumber', 'ObjectNumber'], right_on=['ImageNumber', 'NucleolusNumber'])
groups = ['SSU', 'target', 'channel', 'radius', 'image_class']
# Label for each channel index (index 0 is an empty placeholder).
channels = ['', 'target', 'Nop56', '18S', 'NPM1']
rdf_data = []
for name, dat in merged_dat.groupby(groups):
    rdf_data.append(dict(
        zip(groups, name),
        # sum(intensity * counts) / sum(counts); missing products count as 0
        intensity=((dat['intensity'] * dat['counts']).fillna(0).sum()) / dat['counts'].sum(),
        # keyword overrides the numeric 'channel' from the zip with its label
        channel=channels[name[2]]
    ))
rdf_data = pd.DataFrame(rdf_data)         
rdf_data

In [None]:
# Target-channel curves of WT-SSU endogenous crops, min-max normalised per target.
target_dat = rdf_data[
    (rdf_data.SSU == "WTSSU") &
    (rdf_data.image_class == "endogenous") &
    (rdf_data.channel == "target")
    ].copy()
target_dat['normalized_intensity'] = target_dat.groupby('target').intensity.transform(lambda x: (x - x.min()) / (x.max() - x.min()), )

# Reference curves for the same selection.
# NOTE(review): the variable is called `npm_dat` but selects the "Nop56"
# channel, not "NPM1" -- confirm which channel is intended.
npm_dat = rdf_data[
        (rdf_data.SSU == "WTSSU") &
        (rdf_data.image_class == "endogenous") &
        (rdf_data.channel == "Nop56")
    ].copy()
npm_dat['normalized_intensity'] = npm_dat.groupby('target').intensity.transform(lambda x: (x - x.min()) / (x.max() - x.min()), )

In [None]:
# All targets, WT SSU, endogenous crops, normalised: target curves (solid,
# coloured) with the reference curve from `npm_dat` (dashed black).
ax = sns.lineplot(
    data=target_dat,
    x='radius',
    y='normalized_intensity',
    hue='target',
)
# npm_dat holds one curve per target, so the dashed line aggregates over targets.
sns.lineplot(
    data=npm_dat,
    x='radius',
    y='normalized_intensity',
    linestyle='--',
    c='k',
    ax=ax
)
             

In [None]:
# Inspect the columns of the cropped-data table.
crop_data.columns

In [None]:
# ECDF of one measurement (`lbl`) per image class, one panel per SSU; change
# `lbl` to plot a different column -- the output file name follows it.
lbl = 'GC_area'
sns.displot(data=crop_data, hue='image_class', col='SSU', kind='ecdf', x=lbl, facet_kws=dict(sharex=False))
plt.savefig(f"crop_SSU_{lbl}_dist.pdf")

## Additional targets

In [None]:
# Load the additional-target experiment from two output folders; the class is
# assigned per folder instead of being computed. `data` and `rdf` are first
# lists of frames and then rebound to the concatenated tables.
data = []
rdf = []
d, r = read_data('SSU_IF_more_targets/SSU_more_targets/outputs/', regex=r'/[A-G]\d+_+(?P<SSU>[^_]+).*_(?P<target>RPS6|surf6|RRP1|DDX21|nucleolin|EXOSC10|NAT10|ESF1|Nat10).*nd2')
data.append(d.assign(nucleoli_type='de novo'))
rdf.append(r)
# The endogenous file names may carry an optional 'best_' prefix.
d, r = read_data('SSU_IF_more_targets/SSU_more_targets_endogenous/outputs/', regex=r'/(?:best_)?[A-G]\d+_+(?P<SSU>[^_]+).*_(?P<target>RPS6|surf6|RRP1|DDX21|nucleolin|EXOSC10|NAT10|ESF1|Nat10).*nd2')
data.append(d.assign(nucleoli_type='endogenous'))
rdf.append(r)
# NOTE(review): ImageNumber/ObjectNumber restart in each folder, so after
# concatenation they may no longer be unique across the two sources -- check
# before merging `rdf` with `data` on those keys.
data = pd.concat(data, ignore_index=True)
rdf = pd.concat(rdf, ignore_index=True)
# fix typo, make consistent with other
data['target'] = data.target.replace('Nat10', 'NAT10')
# Map plasmid ids to construct names.
data['SSU'] = data.SSU.replace('M437', 'SSUWT')
data['SSU'] = data.SSU.replace('M438', 'SSUMut')
data.to_csv('cropped_images/more_targets_nucleoli_renamed.csv')

In [None]:
# Debugging aid: pull the rows of one specific image (matched by file name)
# and inspect their intensity columns.
# NOTE(review): the pattern contains a capture group, for which pandas'
# str.contains emits a UserWarning; the group is not needed for matching.
sub_dat = data[data.Metadata_FileLocation.str.contains(
    # r'(F6_M438_NPM1647_Fib568_18S488_ESF1405_z001_crop3)|'
    r'(F6_M438_NPM1647_Fib568_18S488_ESF1405_z003_crop1%)'
    # r'(E6_M437_NPM1647_Fib568_18S488_ESF1405_zstack_crop3)|'
    # r'(E6_M437_NPM1647_Fib568_18S488_ESF1405_zstack_crop1)'
)]
print(
sub_dat[[
    'SSU',
    'mean_Intensity_GC_GC',
    'mean_Intensity_target_GC',
    
]])
# Not displayed (only the last expression of a cell is shown).
sub_dat.Metadata_FileLocation.unique()
# Displayed: all mean-intensity columns, transposed for readability.
sub_dat.filter(like='mean_Intensity').T

In [None]:
# Which files matched the pattern in the previous cell.
sub_dat['Metadata_FileLocation'].unique()

In [None]:
# Cross-check against the raw CellProfiler table: mean-intensity columns of a
# few hand-picked image numbers.
# NOTE(review): this path starts with 'SSU_IF/' whereas the data above was
# read from 'SSU_IF_more_targets/' -- confirm it points at the same outputs.
raw = pd.read_csv('SSU_IF/SSU_more_targets/outputs/InitialNucleoli.csv')
raw.loc[raw.ImageNumber.isin([76, 78, 104, 109]), [c for c in raw.columns if c.startswith('Intensity_MeanIntensity_')]].T

In [None]:
# GC-channel vs. target-channel mean intensity; colour = SSU, marker = class.
sns.scatterplot(data=data, x='mean_Intensity_GC_GC', y='mean_Intensity_target_GC', hue='SSU', style='nucleoli_type')

In [None]:
# Annotate every RDF row with SSU, target and class of its nucleolus.
# NOTE: despite the name, `de_novo` is not filtered -- it contains both the
# de novo and the endogenous nucleoli loaded above.
to_keep = data.copy()
de_novo = rdf.merge(to_keep[['ImageNumber', 'NucleolusNumber', 'SSU', 'target', 'nucleoli_type']], left_on=['ImageNumber', 'ObjectNumber'], right_on=['ImageNumber', 'NucleolusNumber'])

In [None]:
# Count-weighted mean RDF per (SSU, target, channel, radius, class) for the
# additional-target experiment.
groups = ['SSU', 'target', 'channel', 'radius', 'nucleoli_type']
# Channel labels differ from the first experiment: here index 1 is NPM1 and
# index 4 is the target.
# NOTE(review): hard-coded order -- confirm against the acquisition settings.
channels = ['', 'NPM1', 'Fib568', '18S', 'target']
rdf_data = []
for name, dat in de_novo.groupby(groups):
    rdf_data.append(dict(
        zip(groups, name),
        # sum(intensity * counts) / sum(counts); missing products count as 0
        intensity=((dat['intensity'] * dat['counts']).fillna(0).sum()) / dat['counts'].sum(),
        # keyword overrides the numeric 'channel' from the zip with its label
        channel=channels[name[2]]
    ))
rdf_data = pd.DataFrame(rdf_data)         

rdf_data.to_csv('cropped_images/more_targets_rdf_renamed.csv')

## Cropped Intensity

In [None]:
# Load the cropped-intensity outputs; the regex captures the plasmid id and
# which rRNA probe (28S or 18S) was imaged at 488. The RDF table is discarded.
data, _ = read_data('SSU_IF/cropped_intens/outputs/', regex=r'/[A-G]\d+_+(?P<plasmid>[^_]+).*_(?P<star>28S|18S)488.*nd2')
data

In [None]:
# GC-channel vs. DFC-channel mean intensity, one panel per plasmid, coloured by probe.
sns.relplot(data=data, x='mean_Intensity_GC_GC', y='mean_Intensity_DFC_GC', hue='star', col='plasmid')

In [None]:
# Save the full table. A later cell writes a trimmed, renamed version to this
# same file name and overwrites it.
data.to_csv('GC_DFC_quantification.csv')

In [None]:
# Show the row with index label 0 as a column, to view all fields of one record.
data.T[0]

In [None]:
# Export without the 'target' columns, and strip '18S' from the '18Sstar'
# column names (e.g. 'partitioning_18Sstar' -> 'partitioning_star'), since
# the probe here is either 28S or 18S. Overwrites the earlier export of the
# same file name.
data.drop(
    columns=[c for c in data.columns if 'target' in c]).rename(
    columns={c: c.replace('18S', '') for c in data.columns if '18Sstar' in c}).to_csv('GC_DFC_quantification.csv')

## Rim score

In [None]:
def add_rim(result, directory, common, bins=1, total=10):
    """Add DFC / FC rim-enrichment scores computed from radial distributions.

    For the outermost ``bins`` of ``total`` radial bins, the summed
    ``FracAtD`` of a channel is divided by the summed ``FracAtD`` of the
    object mask image (``InitialNucleoliObjectImage``), so the score is the
    fraction of signal in the rim relative to the fraction of area there.

    Args:
        result: per-nucleolus table to extend.
        directory: ``Path`` containing ``InitialNucleoli.csv``.
        common: identifier columns to read alongside the radial columns.
        bins: number of outermost bins that make up the rim.
        total: total number of radial bins in the column names.

    Returns:
        ``result`` merged (via ``merge_result``) with the columns
        ``dfc_rim_enrichment`` and ``fc_rim_enrichment``.
    """
    csv_path = directory / 'InitialNucleoli.csv'

    # Read only the header to find the radial-distribution columns; the
    # context manager closes the handle, which was previously left open.
    with open(csv_path) as handle:
        header = handle.readline()
    # strip() removes the newline on the last header field; without it a
    # radial column in last position would not match in `usecols`.
    radial_cols = [
        c.strip() for c in header.split(',')
        if c.startswith('RadialDistribution_FracAtD')
    ]
    distributions = pd.read_csv(csv_path, usecols=common + radial_cols)

    # outermost bins first: total, total - 1, ..., total - bins + 1
    rim_bins = list(range(total, total - bins, -1))

    def _rim_fraction(channel):
        # summed FracAtD of `channel` over the rim bins, per object
        names = [f'RadialDistribution_FracAtD_{channel}_{b}of{total}' for b in rim_bins]
        return distributions[names].sum(axis=1)

    relative_areas = _rim_fraction('InitialNucleoliObjectImage')
    distributions['dfc_rim_enrichment'] = _rim_fraction('DFC') / relative_areas
    distributions['fc_rim_enrichment'] = _rim_fraction('FC') / relative_areas

    map_cols = {
        'dfc_rim_enrichment': 'dfc_rim_enrichment',
        'fc_rim_enrichment': 'fc_rim_enrichment',
    }
    result = merge_result(result, distributions, map_cols)

    return result

def read_rim_data(directory, regex=None):
    """Read one output directory into a per-nucleolus table with rim scores.

    Args:
        directory: path (str or ``Path``) of the output folder.
        regex: optional pattern with named groups, applied to each image's
            file location by ``build_initial_data``.

    Returns:
        The measurement table, with ``FC`` reported as the target channel
        and rim enrichment computed over the outer 4 of 20 radial bins,
        without the internal ``Parent_DilatedNucleoli`` key.
    """
    directory = Path(directory)
    # Image and object number are the unique identifiers; the parent column
    # links each object to the dilated nucleolus it belongs to.
    id_cols = ['ImageNumber', 'ObjectNumber', 'Parent_DilatedNucleoli']

    table = build_initial_data(directory, id_cols, regex)
    table = add_npm_partitioning(table, directory, id_cols, other_name='FC')
    table = add_rim(table, directory, id_cols, bins=4, total=20)

    table = table.drop(columns='Parent_DilatedNucleoli')
    return table

# Load the rim experiment; the SSU construct is taken from the folder name.
# NOTE(review): absolute, cluster-specific path -- not portable.
data = read_rim_data('/scratch/gpfs/tcomi/cp_morphology_240617/SSU_rim/ssu_240801/outputs', 
                     regex=r'/SSU_rim/(?P<SSU>SSUWT|SSUmut)/.*nd2')
# Files with any missing value (not displayed; only the last expression is shown).
data[data.isna().any(axis=1)].Metadata_FileLocation.unique()
data

In [None]:
# FC vs. DFC rim enrichment per nucleolus, coloured by SSU construct.
sns.scatterplot(data=data, hue='SSU', x='fc_rim_enrichment', y='dfc_rim_enrichment')

In [None]:
# Save the rim-score table (the index is written as the first column).
data.to_csv('ssu_rim_240801.csv')