In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import sklearn.preprocessing
import sklearn.decomposition
import scipy

from pathlib import Path
import re

In [None]:
def merge_result(result_df, df, map_cols, merge_on='ObjectNumber'):
    """Attach selected, renamed columns of ``df`` to ``result_df``.

    Parameters
    ----------
    result_df : DataFrame
        Table being built up; must contain ``ImageNumber`` and ``merge_on``.
    df : DataFrame
        Source table; must contain ``ImageNumber``, ``merge_on`` and every
        key of ``map_cols``.
    map_cols : dict
        ``{source column name: output column name}``.
    merge_on : str
        Second join key used alongside ``ImageNumber``.

    Returns
    -------
    DataFrame
        ``result_df`` merged with the renamed columns. No ``how`` is passed,
        so pandas' default join type applies.
    """
    join_keys = ['ImageNumber', merge_on]
    wanted = join_keys + [*map_cols.keys()]
    incoming = df[wanted].rename(columns=map_cols)
    return result_df.merge(incoming, on=join_keys)

def build_initial_data(directory, common, regex):
    """Create the base table: one row per (ImageNumber, ObjectNumber) in GC.csv.

    Parameters
    ----------
    directory : Path
        Folder containing ``GC.csv`` and ``Image.csv``.
    common : list of str
        Columns to load from ``GC.csv``; must include ``ImageNumber`` and
        ``ObjectNumber``.
    regex : str or None
        Optional pattern applied to ``Metadata_FileLocation`` with
        ``Series.str.extract``; its capture groups become extra columns.

    Returns
    -------
    DataFrame
        Unique object identifiers left-joined with the per-image metadata
        (file location, series and any regex-derived columns).
    """
    # Unique object identifiers from the GC table.
    object_keys = (
        pd.read_csv(directory / 'GC.csv', usecols=common)
        [['ImageNumber', 'ObjectNumber']]
        .drop_duplicates()
    )

    # Per-image metadata, mainly the source file location.
    image_meta = pd.read_csv(
        directory / 'Image.csv',
        usecols=['Metadata_FileLocation', 'ImageNumber', 'Metadata_Series'],
    )

    # Optionally parse fields (treatment, probe, ...) out of the file name.
    if regex:
        parsed = image_meta['Metadata_FileLocation'].str.extract(regex)
        image_meta = image_meta.join(parsed)

    # Left join so every object row is kept.
    return object_keys.merge(image_meta, how="left", on='ImageNumber')

def add_initial_gc(result, directory, common):
    """Add per-object intensity statistics and area from GC.csv to ``result``.

    Parameters
    ----------
    result : DataFrame
        Table keyed by ``ImageNumber`` / ``ObjectNumber``.
    directory : Path
        Folder containing ``GC.csv``.
    common : list of str
        Key columns loaded alongside the measurements.

    Returns
    -------
    DataFrame
        ``result`` with the mean / max / upper-quartile intensity columns for
        the ``FiveETS`` and ``plasmidStar`` channels and ``Area_GC`` attached
        via ``merge_result``.
    """
    # CellProfiler column name -> short name used in the analysis.
    map_cols = {
        'Intensity_MeanIntensity_FiveETS': 'mean_5ETS_intensity_GC',
        'Intensity_MeanIntensity_plasmidStar': 'mean_plasmidStar_intensity_GC',
        'Intensity_MaxIntensity_FiveETS': 'max_5ETS_intensity_GC',
        'Intensity_MaxIntensity_plasmidStar': 'max_plasmidStar_intensity_GC',
        'Intensity_UpperQuartileIntensity_FiveETS': 'upperQuartile_5ETS_intensity_GC',
        'Intensity_UpperQuartileIntensity_plasmidStar': 'upperQuartile_plasmidStar_intensity_GC',
        'AreaShape_Area': 'Area_GC',
    }

    # Load only the keys plus the columns that are about to be renamed.
    measurements = pd.read_csv(
        directory / 'GC.csv',
        usecols=common + list(map_cols),
    )
    return merge_result(result, measurements, map_cols)


def add_rim(result, directory, common, bins=1, total=10):
    """Add DFC rim-enrichment ratios computed from radial-distribution columns.

    For the ``bins`` highest-numbered radial bins (``total``, ``total - 1``,
    ...) the ``RadialDistribution_FracAtD`` values are summed per object for
    the ``DFC`` and ``DFC_raw`` measurements and divided by the same sum for
    ``GCObjectImage``.
    NOTE(review): the highest-numbered bins are presumably the outer rim of
    the object -- confirm against the CellProfiler pipeline settings.

    Parameters
    ----------
    result : DataFrame
        Table keyed by ``ImageNumber`` / ``ObjectNumber``.
    directory : Path
        Folder containing ``GC.csv``.
    common : list of str
        Key columns loaded alongside the radial-distribution columns.
    bins : int
        Number of highest-numbered bins to sum.
    total : int
        Total number of radial bins encoded in the column names (``..of{total}``).

    Returns
    -------
    DataFrame
        ``result`` with ``dfc_rim_enrichment`` and ``dfc_rim_enrichment_raw``
        attached via ``merge_result``.

    Raises
    ------
    KeyError
        If an expected ``..._{bin}of{total}`` column is absent from GC.csv.
    """
    csv_path = directory / 'GC.csv'

    # Read only the header line. The context manager closes the handle, and
    # stripping the line ending keeps the last column name clean (otherwise a
    # matching final column would carry a '\n' and break `usecols`).
    with open(csv_path) as handle:
        header = handle.readline().rstrip('\r\n').split(',')
    cols = [c for c in header if c.startswith('RadialDistribution_FracAtD')]

    distributions = pd.read_csv(csv_path, usecols=common + cols)

    # Highest-numbered bins first: total, total-1, ...
    rim_bins = list(range(total, total - bins, -1))

    def _rim_sum(measurement):
        """Sum the FracAtD columns of ``measurement`` over the selected bins."""
        names = [
            f'RadialDistribution_FracAtD_{measurement}_{b}of{total}'
            for b in rim_bins
        ]
        return distributions[names].sum(axis=1)

    # Fraction of the object mask in the selected bins; used to normalise.
    relative_areas = _rim_sum('GCObjectImage')
    distributions['dfc_rim_enrichment'] = _rim_sum('DFC') / relative_areas
    distributions['dfc_rim_enrichment_raw'] = _rim_sum('DFC_raw') / relative_areas

    map_cols = {
        'dfc_rim_enrichment': 'dfc_rim_enrichment',
        'dfc_rim_enrichment_raw': 'dfc_rim_enrichment_raw',
    }
    return merge_result(result, distributions, map_cols)

def add_correlation(result, directory, common):
    """Add correlation / overlap measurements from DilatedGC.csv to ``result``.

    Every column whose name starts with ``Correlation_Correlation`` or
    ``Correlation_Overlap`` is loaded and attached with the leading
    ``Correlation_`` removed from its name (e.g. ``Correlation_Overlap_X``
    becomes ``Overlap_X``).

    Parameters
    ----------
    result : DataFrame
        Table keyed by ``ImageNumber`` / ``ObjectNumber``.
    directory : Path
        Folder containing ``DilatedGC.csv``.
    common : list of str
        Key columns loaded alongside the correlation columns.

    Returns
    -------
    DataFrame
        ``result`` with the renamed correlation columns attached via
        ``merge_result``.
    """
    csv_path = directory / 'DilatedGC.csv'

    # Read only the header line. The context manager closes the handle, and
    # stripping the line ending keeps the last column name clean (otherwise a
    # matching final column would carry a '\n' and break `usecols`).
    with open(csv_path) as handle:
        header = handle.readline().rstrip('\r\n').split(',')

    # Correlations over combined regions.
    wanted_prefixes = ('Correlation_Correlation', 'Correlation_Overlap')
    cols = [c for c in header if c.startswith(wanted_prefixes)]

    corr = pd.read_csv(csv_path, usecols=common + cols)

    # Drop the leading module prefix from each selected column name.
    strip_len = len('Correlation_')
    map_cols = {c: c[strip_len:] for c in cols}
    return merge_result(result, corr, map_cols)
    
def read_data(directory, regex=None, bins=4):
    """Assemble the per-object analysis table for one CellProfiler output folder.

    Parameters
    ----------
    directory : str or Path
        Output folder holding the CSV files read by the helper functions.
    regex : str or None
        Optional pattern used by ``build_initial_data`` to parse metadata
        out of each image's file location.
    bins : int
        Number of radial bins passed to ``add_rim`` (which is always called
        with ``total=20`` here).

    Returns
    -------
    DataFrame
        Base table with GC intensities, rim enrichment and correlation
        columns added, in that order.
    """
    root = Path(directory)
    # Image number + object number together are the unique identifier of an object.
    key_cols = ['ImageNumber', 'ObjectNumber']

    table = build_initial_data(root, key_cols, regex)
    table = add_initial_gc(table, root, key_cols)
    table = add_rim(table, root, key_cols, bins, total=20)
    return add_correlation(table, root, key_cols)
    
# Quick check of the file-name regex on one experiment: load it, then list the
# source files of every row that has a missing value in any column.
data = read_data(
    'SSU_cropped_overexpressed/SSU_240816/outputs/',
    r'/[A-G]\d+_(?P<treatment>[^_]+).*_(?P<star>18S|28S)488.*nd2$',
)
data.loc[data.isnull().any(axis='columns'), 'Metadata_FileLocation'].unique()


In [None]:
# Load every cropped experiment and stack them into one frame.
# Each entry is (output folder, file-name regex, constant columns to add).
# Experiments whose regex has no `nucleoli_type` group get it as a constant.
# NOTE(review): the 'SSU_cropped_IF/SSU_240816' entry uses the `488` pattern
# like the overexpressed experiments rather than `plasmid` -- confirm intended.
data = pd.concat(
    [
        read_data(folder, pattern).assign(**constants)
        for folder, pattern, constants in [
            (
                'SSU_cropped_IF/SSU_IF/outputs',
                r'/(?P<nucleoli_type>Denovo|Endogenous)/[A-G]\d+_(?P<treatment>[^_]+).*_(?P<star>18S|28S)plasmid.*nd2$',
                dict(exp='IF'),
            ),
            (
                'SSU_cropped_IF/SSU_240730/outputs',
                r'/(?P<nucleoli_type>Denovo|Endogenous)/[A-G]\d+_(?P<treatment>[^_]+).*_(?P<star>18S|28S)plasmid.*nd2$',
                dict(exp='IF_240730'),
            ),
            (
                'SSU_cropped_IF/SSU_240815/outputs/',
                r'/[A-G]\d+(?:zstack)?_(?P<treatment>[^_]+).*_(?P<star>18S|28S)plasmid.*nd2$',
                dict(exp='IF_240815', nucleoli_type='Denovo'),
            ),
            (
                'SSU_cropped_IF/SSU_240816/outputs/',
                r'/[A-G]\d+(?:zstack)?_(?P<treatment>[^_]+).*_(?P<star>18S|28S)488.*nd2$',
                dict(exp='IF_240816', nucleoli_type='Denovo'),
            ),
            (
                'SSU_cropped_overexpressed/SSU_overexpressed/outputs',
                r'/(?P<nucleoli_type>Denovo|Endogenous)/[A-G]\d+_(?P<treatment>[^_]+).*_(?P<star>18S|28S)488.*nd2$',
                dict(exp='overexpressed'),
            ),
            (
                'SSU_cropped_overexpressed/SSU_240730/outputs',
                r'/(?P<nucleoli_type>Denovo|Endogenous)/[A-G]\d+_(?P<treatment>[^_]+).*_(?P<star>18S|28S)488.*nd2$',
                dict(exp='overexpressed_240730'),
            ),
            (
                'SSU_cropped_overexpressed/SSU_240815/outputs/',
                r'/[A-G]\d+_(?P<treatment>[^_]+).*_(?P<star>18S|28S)488.*nd2$',
                dict(exp='overexpressed_240815', nucleoli_type='Denovo'),
            ),
        ]
    ],
    ignore_index=True,
)
len(data)

In [None]:
# Save the combined table, then count objects per experimental condition.
data.to_csv('rDNA_cropped_240816.csv')
data.groupby(['exp', 'treatment', 'nucleoli_type', 'star'])['ObjectNumber'].count()

In [None]:
# Single-experiment export in which every `5ETS` column name is relabelled
# `endo28S`.
# The result is bound to its own name instead of `data`: the plotting cells
# further down read `exp`, `nucleoli_type` and the `*_5ETS_*` columns from
# `data`, none of which exist in this frame, so overwriting `data` here breaks
# them when the notebook is run top to bottom.
data_endo28S = read_data('SSU_cropped_overexpressed/SSU_240816/outputs/', 
                 r'/[A-G]\d+_(?P<treatment>[^_]+).*_(?P<star>18S|28S)488.*nd2$', 
                ).rename(columns=lambda x: re.sub('5ETS', 'endo28S', x))
data_endo28S.to_csv('20240504_28Sendo_OE.csv')
data_endo28S.columns

In [None]:
# Inspect the frame currently bound to `data`.
data

In [None]:
# Mean plasmidStar vs. mean 5ETS intensity per object; one panel per
# experiment (rows) and probe (columns).
sns.relplot(
    data=data,
    x='mean_5ETS_intensity_GC',
    y='mean_plasmidStar_intensity_GC',
    row='exp',
    col='star',
    hue='nucleoli_type',
    style='treatment',
)

In [None]:
# Object area vs. mean plasmidStar intensity, excluding every experiment whose
# `exp` label starts with 'IF'.
sns.relplot(
    data=data.loc[~data['exp'].str.startswith('IF')],
    x='mean_plasmidStar_intensity_GC',
    y='Area_GC',
    col='star',
    hue='nucleoli_type',
    style='treatment',
)

In [None]:
# Object area vs. upper-quartile plasmidStar intensity, marker style by treatment.
sns.relplot(
    data=data,
    x='upperQuartile_plasmidStar_intensity_GC',
    y='Area_GC',
    style='treatment',
    facet_kws={'sharex': False},
)

In [None]:
# ECDF of the DFC rim enrichment for Denovo objects only, coloured by treatment.
sns.displot(
    data=data.loc[data['nucleoli_type'].eq('Denovo')],
    x='dfc_rim_enrichment',
    kind='ecdf',
    hue='treatment',
    row='nucleoli_type',
)

In [None]:
# ECDF of the raw-image DFC rim enrichment for Denovo objects only, coloured
# by treatment.
sns.displot(
    data=data.loc[data['nucleoli_type'].eq('Denovo')],
    x='dfc_rim_enrichment_raw',
    kind='ecdf',
    hue='treatment',
    row='nucleoli_type',
)

In [None]:
# Compare the two rim-enrichment measures object by object.
ax = sns.scatterplot(
    data=data,
    x='dfc_rim_enrichment',
    y='dfc_rim_enrichment_raw',
    hue='nucleoli_type',
)

# One shared [low, high] range spanning both axes as currently drawn.
axis_ranges = np.array([ax.get_xlim(), ax.get_ylim()])
lims = [axis_ranges.min(), axis_ranges.max()]

# Draw the y = x reference line behind the points, then force both axes to
# the shared range with equal scaling.
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.set_xlim(lims)
ax.set_ylim(lims)

# Uncropped images

In [None]:
# Load the two 240805 experiments and stack them into one frame.
data = pd.concat([
    read_data('SSU_cropped_IF/SSU_240805/outputs', 
                 r'/[A-G]\d+_(?P<treatment>[^_]+).*_(?P<star>18S|28S)plasmid.*nd2$', 
                 ).assign(exp='IF_240805'),
    read_data('SSU_cropped_overexpressed/SSU_240805/outputs', 
                 r'/[A-G]\d+_(?P<treatment>[^_]+).*_(?P<star>18S|28S)488.*nd2$', 
                 ).assign(exp='overexpressed_240805'),
]).reset_index(drop=True)

# Diagnostics: source files of rows with any missing value, and of rows whose
# file name the regex failed to parse. These are printed explicitly because
# only the last expression of a cell is echoed, so as bare expressions in the
# middle of the cell their results were silently discarded.
print(data.loc[data.isna().any(axis=1), 'Metadata_FileLocation'].unique())
print(data.loc[data[['treatment', 'star']].isna().any(axis=1), 'Metadata_FileLocation'].unique())

# remove zstacks (rows for which the regex extracted no treatment)
data = data[~data.treatment.isna()]
data.to_csv('SSU_uncropped_240805.csv')

In [None]:
# List the column names of the frame currently bound to `data`.
data.columns

In [None]:
# Mean plasmidStar vs. mean 5ETS intensity; one row per treatment and one
# column per experiment, with the y axis shared within each column.
sns.relplot(
    data=data,
    x='mean_5ETS_intensity_GC',
    y='mean_plasmidStar_intensity_GC',
    row='treatment',
    col='exp',
    hue='treatment',
    facet_kws={'sharey': 'col'},
)

In [None]:
# Same intensity comparison restricted to treatment M433 in the
# overexpressed_240805 experiment, one panel per source file.
sns.relplot(
    data=data.loc[data['exp'].eq('overexpressed_240805') & data['treatment'].eq('M433')],
    x='mean_5ETS_intensity_GC',
    y='mean_plasmidStar_intensity_GC',
    row='Metadata_FileLocation',
)