In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import sklearn.preprocessing
import sklearn.decomposition
import scipy

from pathlib import Path
import re

In [None]:
def merge_result(result_df, df, map_cols, merge_on='Parent_Cell'):
    """Merge selected, renamed columns of ``df`` into ``result_df``.

    Args:
        result_df: Table being accumulated; must contain ``ImageNumber`` and
            the ``merge_on`` column.
        df: Source table holding ``ImageNumber``, ``merge_on`` and every key
            of ``map_cols``.
        map_cols: Mapping of source column name -> name to use in the result.
        merge_on: Second join key, used together with ``ImageNumber``.

    Returns:
        The merge of ``result_df`` with the renamed subset of ``df`` (pandas'
        default inner join, so rows without a partner are dropped).
    """
    join_keys = ['ImageNumber', merge_on]
    # Keep only the join keys plus the measurement columns being carried over.
    subset = df[join_keys + [*map_cols.keys()]]
    renamed = subset.rename(columns=map_cols)
    return result_df.merge(renamed, on=join_keys)

def build_initial_data(directory, common, regex):
    """Build the base per-cell table from ``Cell.csv`` and ``Image.csv``.

    Args:
        directory: ``pathlib.Path`` of the folder containing the CSV files.
        common: Column names to load from ``Cell.csv``; must include
            ``ImageNumber`` and ``ObjectNumber``.
        regex: Optional pattern applied to ``Metadata_FileLocation`` with
            ``str.extract``; its capture groups become extra columns. Skipped
            when falsy.

    Returns:
        One row per unique (``ImageNumber``, ``ObjectNumber``) pair, with the
        image metadata left-joined on ``ImageNumber`` and a ``Parent_Cell``
        column that copies ``ObjectNumber``.
    """
    cells = pd.read_csv(directory / 'Cell.csv', usecols=common)

    # Image-level metadata; the file location is what the regex is run against.
    image_columns = ['Metadata_FileLocation', 'ImageNumber', 'Metadata_Series']
    image_info = pd.read_csv(directory / 'Image.csv', usecols=image_columns)

    if regex:
        # Capture groups parsed from the file location become new columns.
        parsed = image_info['Metadata_FileLocation'].str.extract(regex)
        image_info = image_info.join(parsed)

    # One row per cell, identified by image number and object number.
    cell_ids = cells[['ImageNumber', 'ObjectNumber']].drop_duplicates()
    combined = cell_ids.merge(image_info, how="left", on='ImageNumber')

    # Later tables are matched on Parent_Cell, which is the cell's ObjectNumber.
    return combined.assign(Parent_Cell=combined['ObjectNumber'])

def add_cytoplasm(result, directory, common):
    """Append cytoplasm mean-intensity columns read from ``Cytoplasm.csv``.

    Args:
        result: Per-cell table containing ``ImageNumber`` and ``Parent_Cell``.
        directory: ``pathlib.Path`` of the folder containing ``Cytoplasm.csv``.
        common: List of identifier columns loaded alongside the measurements.

    Returns:
        ``result`` merged (on ``ImageNumber`` and ``Parent_Cell``) with the
        columns ``mean_star_intensity_cyto`` and ``mean_28S_intensity_cyto``.
    """
    star_col = 'Intensity_MeanIntensity_plasmid_star'
    rna_28s_col = 'Intensity_MeanIntensity_plasmid_28S'

    wanted = common + [star_col, rna_28s_col, 'Parent_Cell']
    cytoplasm = pd.read_csv(directory / 'Cytoplasm.csv', usecols=wanted)

    renames = {
        star_col: 'mean_star_intensity_cyto',
        rna_28s_col: 'mean_28S_intensity_cyto',
    }
    return merge_result(result, cytoplasm, renames)

def add_nucleus(result, directory, common):
    """Append nucleus mean-intensity columns read from ``Nuclei.csv``.

    Args:
        result: Per-cell table containing ``ImageNumber`` and ``Parent_Cell``.
        directory: ``pathlib.Path`` of the folder containing ``Nuclei.csv``.
        common: List of identifier columns loaded alongside the measurements.

    Returns:
        ``result`` merged (on ``ImageNumber`` and ``Parent_Cell``) with the
        columns ``mean_star_intensity_nucleus`` and
        ``mean_28S_intensity_nucleus``.
    """
    star_col = 'Intensity_MeanIntensity_plasmid_star'
    rna_28s_col = 'Intensity_MeanIntensity_plasmid_28S'

    wanted = common + [star_col, rna_28s_col, 'Parent_Cell']
    nuclei = pd.read_csv(directory / 'Nuclei.csv', usecols=wanted)

    renames = {
        star_col: 'mean_star_intensity_nucleus',
        rna_28s_col: 'mean_28S_intensity_nucleus',
    }
    return merge_result(result, nuclei, renames)

def add_cell(result, directory, common):
    """Append whole-cell mean-intensity columns read from ``Cell.csv``.

    Args:
        result: Per-cell table containing ``ImageNumber`` and ``ObjectNumber``.
        directory: ``pathlib.Path`` of the folder containing ``Cell.csv``.
        common: List of identifier columns loaded alongside the measurements;
            must include ``ObjectNumber``, which is the join key here.

    Returns:
        ``result`` merged (on ``ImageNumber`` and ``ObjectNumber``) with the
        columns ``mean_star_intensity_cell`` and ``mean_28S_intensity_cell``.
    """
    star_col = 'Intensity_MeanIntensity_plasmid_star'
    rna_28s_col = 'Intensity_MeanIntensity_plasmid_28S'

    wanted = common + [star_col, rna_28s_col]
    cells = pd.read_csv(directory / 'Cell.csv', usecols=wanted)

    renames = {
        star_col: 'mean_star_intensity_cell',
        rna_28s_col: 'mean_28S_intensity_cell',
    }
    # Cells are matched on their own object number rather than Parent_Cell.
    return merge_result(result, cells, renames, merge_on='ObjectNumber')
    
def read_data(directory, regex=None, bins=4):
    """Assemble the per-cell intensity table from the CSVs in ``directory``.

    Args:
        directory: Folder (string or path) holding ``Cell.csv``, ``Image.csv``,
            ``Cytoplasm.csv`` and ``Nuclei.csv``.
        regex: Optional pattern whose capture groups are extracted from each
            image's ``Metadata_FileLocation`` into extra columns.
        bins: Not referenced by this function; kept so existing calls work.

    Returns:
        The base cell table with cytoplasm, nucleus and whole-cell mean
        intensity columns merged in, in that order.
    """
    root = Path(directory)
    # Image number and object number are the unique identifiers; every table
    # is loaded with both.
    common = ['ImageNumber', 'ObjectNumber']

    result = build_initial_data(root, common, regex)
    for attach in (add_cytoplasm, add_nucleus, add_cell):
        result = attach(result, root, common)
    return result

# Load the per-cell table. The regex captures ``star`` (18S or 28S) and
# ``treatment`` from each image's file location.
data = read_data(
    'plasmid_cyto/plasmid/outputs/',
    r'/cytoplas?mic(?P<star>18S|28S)/[A-G]\d+_(?P<treatment>[^_]+).*nd2$',
)

# Report the file locations of rows that contain any missing value (for
# instance, str.extract yields NaN where the pattern does not match).
# A notebook cell only displays its final expression, so this diagnostic has
# to be printed explicitly or its result is silently discarded.
print(data.loc[data.isna().any(axis=1), 'Metadata_FileLocation'].unique())
data

In [None]:
# Save the merged per-cell table to CSV (to_csv writes the index column by default).
data.to_csv(path_or_buf='cytoplasmic_star_240730.csv')

In [None]:
# Cytoplasm vs. whole-cell mean star intensity, one panel per
# (treatment row, star column) combination.
sns.relplot(
    data=data,
    x='mean_star_intensity_cyto',
    y='mean_star_intensity_cell',
    row='treatment',
    col='star',
)

In [None]:
# Empirical CDF of cytoplasm mean star intensity: one panel per ``star``
# value, one curve per treatment.
sns.displot(
    data=data,
    x='mean_star_intensity_cyto',
    hue='treatment',
    col='star',
    kind='ecdf',
)