In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
from sklearn import preprocessing
import os
from os import listdir
from os.path import isfile, join

# ``display`` and ``HTML`` are part of the public ``IPython.display`` API;
# importing them from ``IPython.core.display`` is deprecated.
from IPython.display import display, HTML
# Widen the notebook page container to 80 % of the browser window.
display(HTML("<style>.container { width:80% !important; }</style>"))



Assemble the master DataFrame from Spindle3D, ilastik and 3D intensity measurement data
for all katanin RNAi datasets

In [2]:
# Root folder of all analysis outputs; every other input path below is built from it.
# NOTE(review): absolute path to a mounted volume - the notebook only runs where it exists.
root_dir = "/Volumes/TOB_WD2/Image_Analysis/RNAi_Katanin/Analysis" + "/"

# parse Plugin Measurements
# One Spindle3D output folder per experiment date (the date is part of the path).

input_dir1 = root_dir + "Spindle3D_output/20210222/Spindle3D_out_077/Correct"
input_dir2 = root_dir + "Spindle3D_output/20210419/Spindle3D_out_077/Correct"
input_dir3 = root_dir + "Spindle3D_output/20210726/Spindle3D_out_080/Correct"

# Folders handed to concat_spindle3d, which walks each of them recursively.
spindle_input_folders = [input_dir1, input_dir2, input_dir3]

def concat_spindle3d(folderlist):
    """Collect every Spindle3D ``.txt`` table below the given folders into one frame.

    Parameters
    ----------
    folderlist : iterable of str
        Folders that are searched recursively with ``os.walk``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all tab-separated ``.txt`` tables found.
        The row labels of the individual files are kept, so index values
        repeat across files.

    Raises
    ------
    ValueError
        If no ``.txt`` table is found (``pd.concat`` rejects an empty list).
    """
    dataframes = []
    for folder in folderlist:
        for root, _dirs, files in os.walk(folder):
            for file in files:
                # Skip hidden files (e.g. macOS "._*" companions), as the
                # other parsers in this notebook do, and anything not ".txt".
                if file.startswith(".") or not file.endswith(".txt"):
                    continue
                filepath = os.path.join(root, file)
                # The tables are tab-separated. Only one separator argument
                # may be given: pandas rejects ``sep`` together with
                # ``delimiter``.
                dataframes.append(pd.read_csv(filepath, sep="\t"))
    return pd.concat(dataframes)

# Read the Spindle3D tables of all three experiments into a single frame.
spindle_df = concat_spindle3d(spindle_input_folders)

print("Finished parsing data.")

def get_nakedname(path):
    """Return the file name of ``path`` without its directory and last extension.

    ``path`` is converted with ``str`` first, so non-string values are accepted.
    Used to derive the cell id from the image path stored in the Spindle3D table.
    """
    filename = os.path.split(str(path))[1]
    stem, _extension = os.path.splitext(filename)
    return stem

# Derive the identifiers from the image path stored by Spindle3D.
spindle_df["Cell_ID"] = spindle_df["Path_InputImage"].apply(get_nakedname)
# NOTE(review): "Experiment" is not part of the sub-selection below, so this
# column is discarded again right away.
spindle_df["Experiment"] = spindle_df["Cell_ID"].str.split("_").str[0]

# Spindle3D measurements carried over into the master frame.
subselection_spindle = [
    "Cell_ID",
    "Chromatin_Volume_um3",
    "MetaphasePlate_Length_um",
    "MetaphasePlate_Width_um",
    "Spindle_Angle_Degrees",
    "Spindle_Aspect_Ratio",
    "Spindle_Length_um",
    "Spindle_Volume_um3",
    "Spindle_Width_Avg_um",
    "Tubulin_Spindle_Average_Intensity",
]
spindle_df = spindle_df[subselection_spindle]

spindle_df.head(1)

Finished parsing data.


Unnamed: 0,Cell_ID,Chromatin_Volume_um3,MetaphasePlate_Length_um,MetaphasePlate_Width_um,Spindle_Angle_Degrees,Spindle_Aspect_Ratio,Spindle_Length_um,Spindle_Volume_um3,Spindle_Width_Avg_um,Tubulin_Spindle_Average_Intensity
0,20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_001-1,402.265625,9.5,4.75,4.583528,1.279153,9.711462,273.421875,7.592105,1204.261158


In [4]:
# parse katanin and gamma Tub
# 3D Intensity measurements
# One folder of per-cell measurement CSVs per experiment date.

intensity_dir1 = root_dir + "3D_Intensity_measurements/20210222"
intensity_dir2 = root_dir + "3D_Intensity_measurements/20210419"
intensity_dir3 = root_dir + "3D_Intensity_measurements/20210726"

# File names in this folder define which cell ids count as quality-checked
# (see parse_3d_intensities).
correct_QC_dir = root_dir + "Cell_volumes/4Ch/Measurements_4Ch_all/Correct"

In [5]:
def _read_first_average(filepath, new_name):
    """Read ``Average``/``Cell_ID`` from a CSV, rename ``Average``, keep the first row."""
    frame = pd.read_csv(filepath)[["Average", "Cell_ID"]]
    return frame.rename(columns={"Average": new_name}).head(1)


def _read_pole_average(filepath, new_name, dropped_columns):
    """Average the ``Average`` values of the two largest objects of a pole CSV.

    Object size is ``IntegratedDensity / (Average * 64)`` (stored as
    ``gTub_Volume``). The mean over the two largest objects is written to
    ``new_name``; ``dropped_columns`` are removed and one row is returned.
    """
    # Explicit copies: new columns are added below, and assigning to a slice
    # of another frame triggers pandas' SettingWithCopyWarning.
    frame = pd.read_csv(filepath)[["Average", "Cell_ID", "IntegratedDensity"]].copy()
    cell_id = frame.loc[0, "Cell_ID"]
    frame["gTub_Volume"] = frame.IntegratedDensity / (frame.Average * 64)
    frame = frame.sort_values(by="gTub_Volume", ascending=False).head(2).copy()
    frame[new_name] = frame.Average.mean()
    frame["Cell_ID"] = cell_id
    return frame.drop(dropped_columns, axis=1).head(1)


def _read_whole_cell_average(filepath, new_name, qc_cell_ids):
    """Read a whole-cell CSV; return ``None`` (and report it) if the cell failed QC."""
    frame = pd.read_csv(filepath)[["Average", "Cell_ID"]]
    cell_id = frame.loc[0, "Cell_ID"]
    if cell_id not in qc_cell_ids:
        print(cell_id + " not in QC list.")
        return None
    return frame.rename(columns={"Average": new_name})


def parse_3d_intensities(folder, QC_folder):
    """Build one table of 3D intensity measurements per cell for one experiment.

    Every visible ``.csv`` file in ``folder`` is dispatched on its file-name
    suffix (``BG_gTub.csv``, ``BG_Katanin.csv``, ``Spindle_gTub.csv``,
    ``Spindle_Katanin.csv``, ``Pole_gTub.csv``, ``Pole_Katanin.csv``,
    ``total_cell_gTub.csv``, ``total_cell_katanin.csv``). Katanin columns get
    the suffix ``_p60`` when the sixth ``_``-separated token of the file name
    is ``Ka1568`` and ``_p80`` otherwise.

    Parameters
    ----------
    folder : str
        Folder with the per-cell measurement CSVs (not searched recursively).
    QC_folder : str
        Folder whose visible file names, with the last two ``_``-separated
        tokens removed, are the cell ids accepted for whole-cell measurements.

    Returns
    -------
    pandas.DataFrame
        Background, spindle and pole measurements inner-merged on ``Cell_ID``,
        with the whole-cell measurements outer-merged onto them. The helper
        columns ``IntegratedDensity`` (from the gTub pole file) and
        ``gTub_Volume`` (from the katanin pole file) are part of the result,
        as before. When no cell passes QC the whole-cell columns are absent.

    Raises
    ------
    IndexError
        If a ``.csv`` file name has fewer than six ``_``-separated tokens.
    ValueError
        If one of the background, spindle or pole file groups is missing
        (``pd.concat`` rejects an empty list).
    """
    # A set gives constant-time membership tests for the QC lookup.
    qc_cell_ids = {
        name.rsplit("_", 2)[0]
        for name in os.listdir(QC_folder)
        if not name.startswith('.')
    }
    intensity_filelist = sorted(
        name for name in os.listdir(folder) if not name.startswith('.')
    )

    required_keys = (
        "BG_gTub", "BG_Katanin",
        "Spindle_gTub", "Spindle_Katanin",
        "Pole_gTub", "Pole_Katanin",
    )
    optional_keys = ("WholeCell_gTub", "WholeCell_Katanin")
    frames = {key: [] for key in required_keys + optional_keys}

    for file in intensity_filelist:
        filepath = folder + os.sep + file
        file_naked, ext = os.path.splitext(file)
        if ext != ".csv":
            print("{} not suitable.".format(file))
            continue

        staining = file_naked.split("_")[5]
        isoform = "p60" if staining == "Ka1568" else "p80"

        if filepath.endswith("BG_gTub.csv"):
            frames["BG_gTub"].append(
                _read_first_average(filepath, "Average_BG_gTub"))
        elif filepath.endswith("BG_Katanin.csv"):
            frames["BG_Katanin"].append(
                _read_first_average(filepath, "Average_BG_Katanin_" + isoform))
        elif filepath.endswith("Spindle_gTub.csv"):
            frames["Spindle_gTub"].append(
                _read_first_average(filepath, "Average_Spindle_gTub"))
        elif filepath.endswith("Spindle_Katanin.csv"):
            frames["Spindle_Katanin"].append(
                _read_first_average(filepath, "Average_Spindle_Katanin_" + isoform))
        elif filepath.endswith("Pole_gTub.csv"):
            frames["Pole_gTub"].append(
                _read_pole_average(
                    filepath, "Average_Pole_gTub", ['Average', 'gTub_Volume']))
        elif filepath.endswith("Pole_Katanin.csv"):
            frames["Pole_Katanin"].append(
                _read_pole_average(
                    filepath, "Average_Pole_Katanin_" + isoform,
                    ['Average', 'IntegratedDensity']))
        elif filepath.endswith("total_cell_gTub.csv"):
            whole_cell = _read_whole_cell_average(
                filepath, "Average_Total_gTub", qc_cell_ids)
            if whole_cell is not None:
                frames["WholeCell_gTub"].append(whole_cell)
        elif filepath.endswith("total_cell_katanin.csv"):
            whole_cell = _read_whole_cell_average(
                filepath, "Average_Total_Katanin_" + isoform, qc_cell_ids)
            if whole_cell is not None:
                frames["WholeCell_Katanin"].append(whole_cell)

    print("Populated lists with single Dataframes")

    required_tables = [pd.concat(frames[key]) for key in required_keys]
    # Whole-cell tables are optional: with no QC-approved cell the list is
    # empty, and pd.concat would raise instead of returning the other data.
    optional_tables = [pd.concat(frames[key]) for key in optional_keys if frames[key]]
    print("Concatenated dataframes from lists")

    intensity_df = required_tables[0]
    for table in required_tables[1:]:
        intensity_df = intensity_df.merge(table, on='Cell_ID')
    for table in optional_tables:
        intensity_df = intensity_df.merge(table, on='Cell_ID', how="outer")
    print("Merged dataframes")
    return intensity_df

# Parse the three experiments one after the other against the same QC folder,
# then stack the per-experiment tables.
intensity_df_1, intensity_df_2, intensity_df_3 = (
    parse_3d_intensities(experiment_dir, correct_QC_dir)
    for experiment_dir in (intensity_dir1, intensity_dir2, intensity_dir3)
)

intensity_df = pd.concat([intensity_df_1, intensity_df_2, intensity_df_3])

20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_002-1 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_002-1 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_002-2 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_002-2 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_002-3 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_002-3 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_003-1 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_003-1 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_003-2 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_003-2 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_004-1 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_004-1 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_006-1 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_006-1 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_00

20210222_R1E309_RNAi_03_gTub647_KAT568_Katnb1_055-1 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katnb1_055-1 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katnb1_055-2 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katnb1_055-2 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katnb1_058-2 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katnb1_058-2 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katnb1_058-3 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katnb1_058-3 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katnb1_059-1 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Katnb1_059-1 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Scrmbl_009-1 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Scrmbl_009-1 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Scrmbl_010-1 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Scrmbl_010-1 not in QC list.
20210222_R1E309_RNAi_03_gTub647_KAT568_Scrmbl_01

20210419_R1E309_RNAi_04_gTub647_Kb1568_Katnb1_018-1 not in QC list.
20210419_R1E309_RNAi_04_gTub647_Kb1568_Katnb1_018-1 not in QC list.
20210419_R1E309_RNAi_04_gTub647_Kb1568_Katnb1_019-2 not in QC list.
20210419_R1E309_RNAi_04_gTub647_Kb1568_Katnb1_019-2 not in QC list.
20210419_R1E309_RNAi_04_gTub647_Kb1568_Katnb1_021-2 not in QC list.
20210419_R1E309_RNAi_04_gTub647_Kb1568_Katnb1_021-2 not in QC list.
20210419_R1E309_RNAi_04_gTub647_Kb1568_Katnb1_023-1 not in QC list.
20210419_R1E309_RNAi_04_gTub647_Kb1568_Katnb1_023-1 not in QC list.
20210419_R1E309_RNAi_04_gTub647_Kb1568_Katnb1_023-2 not in QC list.
20210419_R1E309_RNAi_04_gTub647_Kb1568_Katnb1_023-2 not in QC list.
20210419_R1E309_RNAi_04_gTub647_Kb1568_Katnb1_024-2 not in QC list.
20210419_R1E309_RNAi_04_gTub647_Kb1568_Katnb1_024-2 not in QC list.
20210419_R1E309_RNAi_04_gTub647_Kb1568_Katnb1_025-1 not in QC list.
20210419_R1E309_RNAi_04_gTub647_Kb1568_Katnb1_025-1 not in QC list.
20210419_R1E309_RNAi_04_gTub647_Kb1568_Katnb1_02

20210726_R1E309_RNAi_07_gTub647_Kb1568_Katnb1_070-1 not in QC list.
20210726_R1E309_RNAi_07_gTub647_Kb1568_Katnb1_070-1 not in QC list.
20210726_R1E309_RNAi_07_gTub647_Kb1568_Katnb1_070-2 not in QC list.
20210726_R1E309_RNAi_07_gTub647_Kb1568_Katnb1_070-2 not in QC list.
20210726_R1E309_RNAi_07_gTub647_Kb1568_Katnb1_071-1 not in QC list.
20210726_R1E309_RNAi_07_gTub647_Kb1568_Katnb1_071-1 not in QC list.
20210726_R1E309_RNAi_07_gTub647_Kb1568_Katnb1_073-1 not in QC list.
20210726_R1E309_RNAi_07_gTub647_Kb1568_Katnb1_073-1 not in QC list.
20210726_R1E309_RNAi_07_gTub647_Kb1568_Katnb1_073-3 not in QC list.
20210726_R1E309_RNAi_07_gTub647_Kb1568_Katnb1_073-3 not in QC list.
20210726_R1E309_RNAi_07_gTub647_Kb1568_Katnb1_073-4 not in QC list.
20210726_R1E309_RNAi_07_gTub647_Kb1568_Katnb1_073-4 not in QC list.
20210726_R1E309_RNAi_07_gTub647_Kb1568_Katnb1_074-1 not in QC list.
20210726_R1E309_RNAi_07_gTub647_Kb1568_Katnb1_074-1 not in QC list.
20210726_R1E309_RNAi_07_gTub647_Kb1568_Katnb1_07

In [6]:
# Preview the combined intensity table of all experiments.
intensity_df.head()

Unnamed: 0,Average_BG_gTub,Cell_ID,Average_BG_Katanin_p80,Average_Spindle_gTub,Average_Spindle_Katanin_p80,IntegratedDensity,Average_Pole_gTub,gTub_Volume,Average_Pole_Katanin_p80,Average_Total_gTub,Average_Total_Katanin_p80,Average_BG_Katanin_p60,Average_Spindle_Katanin_p60,Average_Pole_Katanin_p60
0,840.172,20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_001-1,693.265,1278.886,1069.654,2027103.0,3183.078,9.515625,2844.8185,934.624,835.425,,,
1,954.032,20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_002-1,822.261,1368.057,1148.076,1834756.0,3521.227,7.890626,2897.984,,,,,
2,954.365,20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_002-2,792.816,1495.804,1208.756,2035208.0,3535.039,9.156251,3681.7625,,,,,
3,951.253,20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_002-3,788.776,1414.025,1287.634,2164636.0,4037.6555,8.171874,4162.378,,,,,
4,809.665,20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_003-1,779.812,1279.435,1154.185,978519.0,2968.6655,5.109375,2434.6245,,,,,


In [7]:
# parse katanin_offsets
# Camera-offset folders, one per channel (katanin, gamma-tubulin) and experiment date.

kat_offset_dir1 = root_dir + "3D_Intensity_measurements/Katanin_Offsets_20210222"
kat_offset_dir2 = root_dir + "3D_Intensity_measurements/Katanin_Offsets_20210419"
kat_offset_dir3 = root_dir + "3D_Intensity_measurements/Katanin_Offsets_20210726"
gtub_offset_dir1 = root_dir + "3D_Intensity_measurements/gTub_Offsets_20210222"
gtub_offset_dir2 = root_dir + "3D_Intensity_measurements/gTub_Offsets_20210419"
gtub_offset_dir3 = root_dir + "3D_Intensity_measurements/gTub_Offsets_20210726"

def staining_offsets(folder):
    """Read the camera-offset CSVs of one folder into a single frame.

    Visible files are processed in sorted order. Files ending in
    ``Katanin.csv`` or ``gammaTub.csv`` are read and their
    ``CameraOffset_Average`` column is renamed to the channel-specific name;
    all other files are ignored. ``pd.concat`` raises ``ValueError`` when no
    file matches.
    """
    # File-name suffix -> channel-specific name of the offset column.
    offset_column_by_suffix = (
        ("Katanin.csv", "CameraOffset_Katanin_Average"),
        ("gammaTub.csv", "CameraOffset_gTub_Average"),
    )
    visible_files = sorted(
        name for name in os.listdir(folder) if not name.startswith('.')
    )
    tables = []
    for name in visible_files:
        full_path = folder + os.sep + name
        for suffix, channel_column in offset_column_by_suffix:
            if full_path.endswith(suffix):
                table = pd.read_csv(full_path)
                tables.append(
                    table.rename(columns={"CameraOffset_Average": channel_column})
                )
                break
    return pd.concat(tables)

# Katanin-channel offsets of the three experiments.
katanin_offsets_1, katanin_offsets_2, katanin_offsets_3 = (
    staining_offsets(offset_dir)
    for offset_dir in (kat_offset_dir1, kat_offset_dir2, kat_offset_dir3)
)
katanin_offsets = pd.concat([katanin_offsets_1, katanin_offsets_2, katanin_offsets_3])

# Gamma-tubulin-channel offsets of the three experiments.
gtub_offsets_1, gtub_offsets_2, gtub_offsets_3 = (
    staining_offsets(offset_dir)
    for offset_dir in (gtub_offset_dir1, gtub_offset_dir2, gtub_offset_dir3)
)
gtub_offsets = pd.concat([gtub_offsets_1, gtub_offsets_2, gtub_offsets_3])

# Put both channel offsets side by side per cell, then attach them to the
# intensity table. Both merges use the default inner join on "Cell_ID".
staining_offsets_df = pd.merge(katanin_offsets, gtub_offsets, on="Cell_ID")

intensity_df = pd.merge(intensity_df, staining_offsets_df, on="Cell_ID")

In [8]:
# Subtract the per-cell camera offset of the matching channel from the raw averages.
# NOTE(review): the columns are overwritten in place, so running this cell twice
# subtracts the offset twice. The "_p60" katanin columns are not corrected here -
# confirm that this is intended.
for katanin_column in (
    "Average_Spindle_Katanin_p80",
    "Average_BG_Katanin_p80",
    "Average_Pole_Katanin_p80",
    "Average_Total_Katanin_p80",
):
    intensity_df[katanin_column] = (
        intensity_df[katanin_column] - intensity_df["CameraOffset_Katanin_Average"]
    )

for gtub_column in (
    "Average_Spindle_gTub",
    "Average_BG_gTub",
    "Average_Pole_gTub",
    "Average_Total_gTub",
):
    intensity_df[gtub_column] = (
        intensity_df[gtub_column] - intensity_df["CameraOffset_gTub_Average"]
    )

# (new column, numerator, denominator): spindle and pole signal relative to the
# background, and pole signal relative to the whole-cell average.
for ratio_column, numerator, denominator in (
    ("3D_density_ratio_Katanin_p80", "Average_Spindle_Katanin_p80", "Average_BG_Katanin_p80"),
    ("3D_density_ratio_gamma_Tubulin", "Average_Spindle_gTub", "Average_BG_gTub"),
    ("3D_density_Poles_Katanin_p80", "Average_Pole_Katanin_p80", "Average_BG_Katanin_p80"),
    ("3D_density_Poles_gamma_Tubulin", "Average_Pole_gTub", "Average_BG_gTub"),
    ("Norm_Poles_Katanin_p80", "Average_Pole_Katanin_p80", "Average_Total_Katanin_p80"),
    ("Norm_Poles_gamma_Tubulin", "Average_Pole_gTub", "Average_Total_gTub"),
):
    intensity_df[ratio_column] = intensity_df[numerator] / intensity_df[denominator]

intensity_df.head(2)

Unnamed: 0,Average_BG_gTub,Cell_ID,Average_BG_Katanin_p80,Average_Spindle_gTub,Average_Spindle_Katanin_p80,IntegratedDensity,Average_Pole_gTub,gTub_Volume,Average_Pole_Katanin_p80,Average_Total_gTub,...,_x,CameraOffset_Katanin_Average,_y,CameraOffset_gTub_Average,3D_density_ratio_Katanin_p80,3D_density_ratio_gamma_Tubulin,3D_density_Poles_Katanin_p80,3D_density_Poles_gamma_Tubulin,Norm_Poles_Katanin_p80,Norm_Poles_gamma_Tubulin
0,234.724,20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_001-1,120.414,673.438,496.803,2027103.0,2577.63,9.515625,2271.9675,329.176,...,1,572.851,1,605.448,4.125791,2.869063,18.867968,10.981536,8.652675,7.830553
1,200.523,20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_002-1,127.269,614.548,453.084,1834756.0,2767.718,7.890626,2202.992,,...,1,694.992,1,753.509,3.56005,3.064726,17.30973,13.802496,,


In [9]:
# Combine spindle morphology with the intensity measurements. merge() defaults
# to an inner join, so only cells present in both tables are kept.
df = spindle_df.merge(intensity_df, on = "Cell_ID")
df.head(2)

Unnamed: 0,Cell_ID,Chromatin_Volume_um3,MetaphasePlate_Length_um,MetaphasePlate_Width_um,Spindle_Angle_Degrees,Spindle_Aspect_Ratio,Spindle_Length_um,Spindle_Volume_um3,Spindle_Width_Avg_um,Tubulin_Spindle_Average_Intensity,...,_x,CameraOffset_Katanin_Average,_y,CameraOffset_gTub_Average,3D_density_ratio_Katanin_p80,3D_density_ratio_gamma_Tubulin,3D_density_Poles_Katanin_p80,3D_density_Poles_gamma_Tubulin,Norm_Poles_Katanin_p80,Norm_Poles_gamma_Tubulin
0,20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_001-1,402.265625,9.5,4.75,4.583528,1.279153,9.711462,273.421875,7.592105,1204.261158,...,1,572.851,1,605.448,4.125791,2.869063,18.867968,10.981536,8.652675,7.830553
1,20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_002-1,261.796875,12.0,1.75,19.749706,1.144612,9.503289,342.0,8.302632,2452.983279,...,1,694.992,1,753.509,3.56005,3.064726,17.30973,13.802496,,


In [10]:
# Parsing Cell Volume (ilastik) data
# Same folder that serves as the QC reference for the intensity parsing (correct_QC_dir).

CV_dir_1 = root_dir + "Cell_volumes/4Ch/Measurements_4Ch_all/Correct"

def concat_cv(folder):
    """Read every ``.csv`` file below ``folder`` (recursively) into one frame.

    The tables are stacked row-wise in the order ``os.walk`` yields them.
    ``pd.concat`` raises ``ValueError`` when no ``.csv`` file is found.
    """
    csv_paths = [
        subdir + os.sep + file
        for subdir, _dirs, files in os.walk(folder)
        for file in files
        if file.endswith(".csv")
    ]
    return pd.concat([pd.read_csv(path) for path in csv_paths])

# Cell-volume table assembled from all CSVs below CV_dir_1.
CV_df = concat_cv(CV_dir_1)

In [11]:
# parse camera offsets of tubulin channel
# One camera-offset folder per experiment date.

offset_dir_1 = root_dir + "Cell_volumes/4Ch/Camera_offsets_20210222"
offset_dir_2 = root_dir + "Cell_volumes/4Ch/Camera_offsets_20210419"
offset_dir_3 = root_dir + "Cell_volumes/4Ch/Camera_offsets_20210726"

def concat_offset(folder):
    """Stack all camera-offset ``.csv`` tables found recursively below ``folder``.

    Files with other extensions are skipped. ``pd.concat`` raises
    ``ValueError`` when no ``.csv`` file is found.
    """
    tables = []
    for subdir, _dirs, files in os.walk(folder):
        tables.extend(
            pd.read_csv(subdir + os.sep + name)
            for name in files
            if (subdir + os.sep + name).endswith(".csv")
        )
    return pd.concat(tables)

# Tubulin-channel camera offsets of the three experiments, stacked into one table.
offsets_1, offsets_2, offsets_3 = (
    concat_offset(offset_dir)
    for offset_dir in (offset_dir_1, offset_dir_2, offset_dir_3)
)
offsets = pd.concat([offsets_1, offsets_2, offsets_3])

# No explicit key is given: pandas joins on all columns the two tables share.
CV_df = pd.merge(CV_df, offsets, how="inner")

In [12]:
# parse tubulin mass (total cell)
# Folder with the whole-cell tubulin measurement CSVs.
mass_dir_1 = root_dir + "Cell_volumes/4Ch/Measurements_total_tubulin_mass_all"

def concat_masses(folder):
    """Concatenate all tubulin-mass ``.csv`` tables found recursively below ``folder``.

    Non-``.csv`` files are ignored. ``pd.concat`` raises ``ValueError`` when
    no ``.csv`` file is found.
    """
    def _csv_tables():
        # Yield one parsed table per ".csv" file, in os.walk order.
        for subdir, _dirs, files in os.walk(folder):
            for file in files:
                candidate = subdir + os.sep + file
                if not candidate.endswith(".csv"):
                    continue
                yield pd.read_csv(candidate)

    return pd.concat(list(_csv_tables()))

mass_df = concat_masses(mass_dir_1)
# Keep the whole-cell tubulin statistics and give them unambiguous names.
mass_df = mass_df[
    ["Cell_ID", "Average", "Minimum", "StandardDeviation", "IntegratedDensity"]
].rename(
    columns={
        "Average": "Total_Tubulin_Average",
        "Minimum": "Total_Tubulin_Minimum",
        "StandardDeviation": "Total_Tubulin_StD",
        "IntegratedDensity": "Total_Tubulin_Sum_Intensity",
    }
)

# Only cells with both a volume and a tubulin measurement stay in CV_df. The
# outer join below keeps every cell of either table.
CV_df = CV_df.merge(mass_df, on="Cell_ID", how="inner")

df = pd.merge(df, CV_df, on="Cell_ID", how="outer")

# Spindle volume as a percentage of the cell volume.
df["SSR"] = df["Spindle_Volume_um3"] / df["Volume"] * 100

# The per-cell minimum of the tubulin channel is subtracted as the offset.
df["Total_Tubulin_Average_OffsetCorrected"] = (
    df["Total_Tubulin_Average"] - df["Total_Tubulin_Minimum"]
)
df["Tubulin_Spindle_Average_Intensity_OffsetCorrected"] = (
    df["Tubulin_Spindle_Average_Intensity"] - df["Total_Tubulin_Minimum"]
)

# Masses are mean intensity times volume; the "Norm_" variants are divided by
# the offset-corrected whole-cell average.
df["Spindle_Mass"] = (
    df["Tubulin_Spindle_Average_Intensity_OffsetCorrected"] * df["Spindle_Volume_um3"]
)
df["Norm_Spindle_Mass"] = df["Spindle_Mass"] / df["Total_Tubulin_Average_OffsetCorrected"]
df["Total_tubulin_mass"] = df["Total_Tubulin_Average_OffsetCorrected"] * df["Volume"]
df["Cytoplasm_mass"] = df["Total_tubulin_mass"] - df["Spindle_Mass"]
df["Norm_Cytoplasm_mass"] = df["Cytoplasm_mass"] / df["Total_Tubulin_Average_OffsetCorrected"]
df["Cytoplasm_volume_um3"] = df["Volume"] - df["Spindle_Volume_um3"]

# Share of the cell's tubulin mass that sits in the spindle, in percent.
df["Tubulin_Fraction_in_Spindle"] = df["Spindle_Mass"] / df["Total_tubulin_mass"] * 100

df["Tubulin_density_spindle_norm"] = df["Norm_Spindle_Mass"] / df["Spindle_Volume_um3"]
df["Tubulin_density_cyto_norm"] = df["Norm_Cytoplasm_mass"] / df["Cytoplasm_volume_um3"]

In [13]:
# Load phenotyping classification

def concat_phenotypes(folder):
    """Derive a Cell_ID -> Phenotype table from a folder tree.

    Every visible file found below ``folder`` contributes one row: the part
    of its name before the first ``.`` is the ``Cell_ID`` and the name of the
    directory that contains it is the ``Phenotype``.

    Parameters
    ----------
    folder : str
        Root of the tree, searched recursively with ``os.walk``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Cell_ID`` and ``Phenotype``, rows labelled ``0..n-1``.
        The frame is empty (but has both columns) when no file is found.
    """
    records = []
    for subdir, _dirs, files in os.walk(folder):
        # basename works with the platform's separator and also for paths
        # that contain no separator at all.
        phenotype = os.path.basename(subdir)
        for file in files:
            # Hidden files (".DS_Store", "._*") are not cells; they would
            # otherwise show up as rows with an empty Cell_ID.
            if file.startswith("."):
                continue
            records.append({"Cell_ID": file.split(".")[0], "Phenotype": phenotype})
    # One frame from all records instead of one single-row frame per file.
    return pd.DataFrame(records, columns=["Cell_ID", "Phenotype"])

# Attach the phenotype of each cell. The outer join also keeps cells that only
# have a phenotype entry; their measurement columns are NaN.
phenotypes = concat_phenotypes(root_dir + "Phenotyping")
df = df.merge(phenotypes, on = "Cell_ID", how = "outer")
df.tail()

Unnamed: 0,Cell_ID,Chromatin_Volume_um3,MetaphasePlate_Length_um,MetaphasePlate_Width_um,Spindle_Angle_Degrees,Spindle_Aspect_Ratio,Spindle_Length_um,Spindle_Volume_um3,Spindle_Width_Avg_um,Tubulin_Spindle_Average_Intensity,...,Spindle_Mass,Norm_Spindle_Mass,Total_tubulin_mass,Cytoplasm_mass,Norm_Cytoplasm_mass,Cytoplasm_volume_um3,Tubulin_Fraction_in_Spindle,Tubulin_density_spindle_norm,Tubulin_density_cyto_norm,Phenotype
1203,20210419_R1E309_RNAi_04_gTub647_Kb1568_KKab1_026-1,,,,,,,,,,...,,,,,,,,,,Severe
1204,20210222_R1E309_RNAi_03_gTub647_KAT568_Scrmbl_040-3,,,,,,,,,,...,,,,,,,,,,Mild
1205,20210419_R1E309_RNAi_04_gTub647_Ka1568_KKab1_019-3,,,,,,,,,,...,,,,,,,,,,Mild
1206,20210726_R1E309_RNAi_07_gTub647_Kb1568_Katnb1_051-1,,,,,,,,,,...,,,,,,,,,,Mild
1207,20210726_R1E309_RNAi_07_gTub647_Kb1568_Katnb1_073-1,,,,,,,,,,...,,,,,,,,,,Mild


In [14]:
def unify_staining(x):
    """Translate the staining token of a Cell_ID into an antibody label.

    Only ``"Ka1568"`` maps to ``"anti-Katna1"``; every other value, including
    ``"Kb1568"``, maps to ``"anti-Katnb1"``.
    """
    return "anti-Katna1" if x == "Ka1568" else "anti-Katnb1"

def pool_condition(x):
    """Pool RNAi condition labels: ``KKab1`` -> ``Katnb1``, ``Scrmbl`` -> ``Control``.

    Any other value is returned unchanged.
    """
    if x == "KKab1":
        return "Katnb1"
    if x == "Scrmbl":
        return "Control"
    return x

# The Cell_ID is "_"-separated: token 0 is the experiment date, token 5 the
# staining and token 6 the RNAi condition.
df["Experiment"] = df["Cell_ID"].str.split("_").str[0]
df["Condition"] = df["Cell_ID"].str.split("_").str[6]
df["Pool_condition"] = df["Condition"].apply(pool_condition)
df["Staining"] = df["Cell_ID"].str.split("_").str[5]
df["Staining"] = df["Staining"].apply(unify_staining)

In [15]:
# dataset-internal normalisation
# NOTE(review): preprocessing is already imported in the first cell; this repeat is redundant.
from sklearn import preprocessing

# Min-Max normalization
# Create a minimum and maximum processor object,
# which normalise_column below re-fits for every dataset.
min_max_scaler = preprocessing.MinMaxScaler()

# Min-max normalize column  and concat with
# original dataframe
def normalise_column(column, data, datasets=('20210222', '20210419', '20210726')):
    """Min-max scale ``column`` separately within each experiment.

    Parameters
    ----------
    column : str
        Name of the column to normalise.
    data : pandas.DataFrame
        Must contain ``column`` and an ``"Experiment"`` column.
    datasets : iterable of str, optional
        Values of ``"Experiment"`` to process. Rows belonging to any other
        experiment are not part of the result.

    Returns
    -------
    pandas.DataFrame
        The rows of the listed experiments, in the order of ``datasets``,
        with the extra column ``<column>_normalised``. Row labels of ``data``
        are preserved.

    Raises
    ------
    ValueError
        If none of the listed experiments has any row (``pd.concat`` rejects
        an empty list).
    """
    normalised_parts = []
    for dataset in datasets:
        subset = data[data["Experiment"] == dataset]
        if subset.empty:
            # Nothing to scale for this experiment.
            continue
        values = subset[[column]].values.astype(float).reshape(-1, 1)
        scaled = min_max_scaler.fit_transform(values)
        # Attach the scaled values by position. Wrapping them in a new frame
        # with a fresh 0..n-1 index and concatenating along axis=1 would align
        # on row labels, which misplaces the values and adds all-NaN rows.
        normalised_parts.append(
            subset.assign(**{column + "_normalised": scaled.ravel()})
        )
    return pd.concat(normalised_parts)

# Add the per-experiment min-max scaled pole katanin signal as "Norm_Poles_Katanin_p80_normalised".
df = normalise_column("Norm_Poles_Katanin_p80", data = df)
df.head()

Unnamed: 0,Cell_ID,Chromatin_Volume_um3,MetaphasePlate_Length_um,MetaphasePlate_Width_um,Spindle_Angle_Degrees,Spindle_Aspect_Ratio,Spindle_Length_um,Spindle_Volume_um3,Spindle_Width_Avg_um,Tubulin_Spindle_Average_Intensity,...,Cytoplasm_volume_um3,Tubulin_Fraction_in_Spindle,Tubulin_density_spindle_norm,Tubulin_density_cyto_norm,Phenotype,Experiment,Condition,Pool_condition,Staining,Norm_Poles_Katanin_p80_normalised
0,20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_001-1,402.265625,9.5,4.75,4.583528,1.279153,9.711462,273.421875,7.592105,1204.261158,...,2105.828125,39.307202,3.420416,0.685732,,20210222,Katna1,Katna1,anti-Katnb1,0.409961
1,20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_002-1,261.796875,12.0,1.75,19.749706,1.144612,9.503289,342.0,8.302632,2452.983279,...,,,,,,20210222,Katna1,Katna1,anti-Katnb1,
2,20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_002-2,546.375,14.0,3.0,34.486612,1.083255,10.362191,430.625,9.565789,1609.153628,...,,,,,,20210222,Katna1,Katna1,anti-Katnb1,
3,20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_002-3,379.09375,13.5,2.25,8.326762,1.054552,8.325413,269.21875,7.894737,2728.681776,...,,,,,,20210222,Katna1,Katna1,anti-Katnb1,
4,20210222_R1E309_RNAi_03_gTub647_KAT568_Katna1_003-1,327.34375,11.5,2.75,11.701245,1.652601,11.002841,198.234375,6.657895,3614.552613,...,,,,,,20210222,Katna1,Katna1,anti-Katnb1,


In [16]:
# Binning
# Cell volumes are grouped into 500-wide intervals between 2000 and 4000.
# Volumes outside that range (or missing) fall into no interval; after
# astype(str) they presumably appear as the string "nan" - verify before grouping.

interval_range_CV = pd.interval_range(start = 2000, freq = 500, end = 4000)
df['Cell_Volume_bin'] = pd.cut(df['Volume'], bins = interval_range_CV).astype(str)

In [17]:
# Write the master frame to disk. to_csv is called with its defaults, so the
# row index is written as the first, unnamed column.
# NOTE(review): absolute output path on a mounted volume.
destination = "/Volumes/TOB_WD2/Image_Analysis/RNAi_Katanin/Dataframes" + "/MasterDataFrame_RNAi_AllSets4Ch.csv"
df.to_csv(destination)
print("Successfully saved dataframe to {}".format(destination))

Successfully saved dataframe to /Volumes/TOB_WD2/Image_Analysis/RNAi_Katanin/Dataframes/MasterDataFrame_RNAi_AllSets4Ch.csv
