In [1]:
import os 
import pandas as pd
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))



Morphometric and photometric data integration and post-processing 

In [2]:
# User-specified input dataframes locations

root = "/Volumes/TOB_WD2/Data_Analysis/DataFrames" + "/"
Spindle3D_df_Path = root + "MasterDataFrame_Spindle3D.csv"
Ilastik_df_Path = root + "MasterDataFrame_IlastikCV.csv"
SpaceTime_df_Path = root + "MasterDataFrame_SpaceTime.csv"

df_Spindle3D = pd.read_csv(Spindle3D_df_Path)
df_IlastikCV = pd.read_csv(Ilastik_df_Path)
df_SpaceTime = pd.read_csv(SpaceTime_df_Path)

print("The row count of df_Spindle3D before merging: " + str(df_Spindle3D.shape[0]))
print("The row count of df_IlastikCV before merging: " + str(df_IlastikCV.shape[0]))
print("The row count of df_SpaceTime before merging: " + str(df_SpaceTime.shape[0]))

The row count of df_Spindle3D before merging: 5942
The row count of df_IlastikCV before merging: 7611
The row count of df_SpaceTime before merging: 24726



The Spindle and cell morphometric dataframes will be combined by an outer merge
and only then merged with the SpaceTime data by an inner merge to kick out all
cell IDs that did not pass quality control.


In [3]:
df = df_Spindle3D.merge(
    right = df_IlastikCV, 
    on = "Cell_ID", 
    how = "outer"
)
print("Outer-Merged dataframes containing morphometric data.")

df = df_SpaceTime.merge(
    right = df, 
    on = "Cell_ID", 
    how = "inner"
)
print("Inner-Merged dataframes containing morphometric data.")

# clean up
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
print("The shape of the final df is: " + str(df.shape))

Outer-Merged dataframes containing morphometric data.
Inner-Merged dataframes containing morphometric data.
The shape of the final df is: (8398, 35)


In [4]:
df.columns

Index(['Cell_ID', 'LowZoom_ID', 'Dataset', 'Experiment', 'Position',
       'Condition', 'Has_duplicate', 'X_px', 'Y_px', 'Timepoint', 'Time',
       'Differentiation_mins', 'Differentiation_bins', 'Experiment_Time_mins',
       'index', 'Chromatin_Dilation', 'Chromatin_Volume_um3',
       'DNA_Volume_Threshold', 'MetaphasePlate_Length_um',
       'MetaphasePlate_Width_um', 'Spindle_Angle_Degrees',
       'Spindle_Aspect_Ratio', 'Spindle_DNA_Volume_Ratio', 'Spindle_Length_um',
       'Spindle_Volume_um3', 'Spindle_Width_Avg_um',
       'Tubulin_Spindle_Average_Intensity', 'Version', 'Cell_Volume_um3',
       'SurfaceArea', 'Sphericity', 'Tubulin_Cell_Average',
       'Tubulin_Cell_Minimum', 'Tubulin_Cell_Maximum', 'Tubulin_Cell_IntDen'],
      dtype='object')



Tubulin quantification has to be normalised by the tubulin-GFP expression level. 
As a proxy for tubulin-GFP concentration, we quantify the average cellular tubulin 
fluorescence (including both free and polymer tubulin).

We don't need to consider camera offsets, because the Zeiss LSM-derived images
Have an average offset value close to zero. 

In [5]:
# Tubulin quantification and normalisation

df["Tubulin_mass_cell"] = df.Tubulin_Cell_Average * df.Cell_Volume_um3 
df["Tubulin_mass_cell_norm"] = df.Tubulin_mass_cell / df.Tubulin_Cell_Average
df["Tubulin_mass_spindle"] = df.Tubulin_Spindle_Average_Intensity * df.Spindle_Volume_um3
df["Tubulin_mass_spindle_norm"] = df.Tubulin_mass_spindle / df.Tubulin_Cell_Average

df["Tubulin_mass_cytoplasm"] = df.Tubulin_mass_cell - df.Tubulin_mass_spindle
df["Tubulin_mass_cytoplasm_norm"] = df.Tubulin_mass_cytoplasm / df.Tubulin_Cell_Average
df["Tubulin_Cytop_Average"] = df.Tubulin_mass_cytoplasm * 1/(df.Cell_Volume_um3 - df.Spindle_Volume_um3)
df["Tubulin_density_cytop_norm"] = df.Tubulin_mass_cytoplasm_norm * 1/(df.Cell_Volume_um3 - df.Spindle_Volume_um3)

df["Fraction_Tubulin_in_Spindle"] = df.Tubulin_mass_spindle / df.Tubulin_mass_cell * 100
df["Fraction_SpindleVol_in_Cell"] = df.Spindle_Volume_um3 / df.Cell_Volume_um3 * 100

df["Tubulin_Averages_Ratio"] = df.Tubulin_Spindle_Average_Intensity / df.Tubulin_Cell_Average
df["Tubulin_Averages_Cytop_Ratio"] = df.Tubulin_Spindle_Average_Intensity / df.Tubulin_Cell_Average
df["Tubulin_density_spindle_norm"] = df.Tubulin_mass_spindle_norm / df.Spindle_Volume_um3 

df["SpindleVolume_ChromatinVolume_Ratio"] = df.Spindle_Volume_um3 / df.Chromatin_Volume_um3
df["Chromatin_Occupancy"] = df.Chromatin_Volume_um3 / df.Cell_Volume_um3 * 100
df["CellSurfaceArea_CellVolume_Ratio"] = df.SurfaceArea / df.Cell_Volume_um3

In [6]:
# Create Bins
interval_range_CV = pd.interval_range(start = 1000, freq = 500, end = 4000)
df['Cell_Volume_bin'] = pd.cut(df['Cell_Volume_um3'], bins = interval_range_CV).astype(str)

interval_range_SACV = pd.interval_range(start = 0.3, freq = 0.05, end = 0.5)
df['CellSurfaceArea_CellVolume_bin'] = pd.cut(df['CellSurfaceArea_CellVolume_Ratio'], bins = interval_range_SACV).astype(str)

interval_range_SV = pd.interval_range(start = 100, freq = 50, end = 600)
df['Spindle_Volume_bin'] = pd.cut(df['Spindle_Volume_um3'], bins = interval_range_SV).astype(str)

interval_range_SW = pd.interval_range(start = 5, freq = 2, end = 12)
df['Spindle_Width_bin'] = pd.cut(df['Spindle_Width_Avg_um'], bins = interval_range_SW).astype(str)

#interval_range_TubSpindle = pd.interval_range(start = 0, freq = 0.5, end = 7)
#df['Tubulin_density_spindle_bin'] = pd.cut(df['Tubulin_density_spindle_norm'], bins = interval_range_TubSpindle).astype(str)

#interval_range_Fraction_SpindleVol_in_Cell = pd.interval_range(start = 0, freq = 3, end = 25)
#df['Fraction_SpindleVol_in_Cell_bin'] = pd.cut(df['Fraction_SpindleVol_in_Cell'], bins = interval_range_Fraction_SpindleVol_in_Cell).astype(str)

#interval_range_TIME = pd.interval_range(start = 0, freq = 2, end = 12) # ??????????????
#df['Spindle_Width_bin'] = pd.cut(df['Spindle_Width_Avg_um'], bins = interval_range_SW).astype(str)

In [7]:
# kick out duplicate acquisitions of metaphase cells
# based on temperospatial proximity (Notebook 05_Coordinates)

print("The shape of the df before the purge: " + str(df.shape))

df = df[df.Has_duplicate == False]

print("The shape of the df after the purge: " + str(df.shape))

The shape of the df before the purge: (8398, 55)
The shape of the df after the purge: (7004, 55)


In [12]:
# Exclude outliers 
print(df.shape)

df_big_SV = df["Spindle_Volume_um3"] > 699
df_big_CV = df["Cell_Volume_um3"] > 4499
df_big_DNAW = df["MetaphasePlate_Width_um"] > 5
df_large_SSR = df["Fraction_SpindleVol_in_Cell"] > 25

df = df[~df_big_SV] # otherwise you get rid of the rows where Spindle_Volume_um3 is NaN (i.e ilastik output without Spindle 3D output)
df = df[~df_big_CV]
df = df[~df_big_DNAW]
df = df[~df_large_SSR]
df = df[(df["Spindle_Volume_um3"] > 100) | (df['Spindle_Volume_um3'].isna())] # eliminates overlooked mis-segmented cases
df = df[(df["Cell_Volume_um3"] > 1000) | (df['Cell_Volume_um3'].isna())] # eliminates overlooked mis-segmented cases

print(df.shape)

(7004, 57)
(6951, 57)


  df = df[~df_big_CV]
  df = df[~df_big_DNAW]
  df = df[~df_large_SSR]


In [16]:
subselection = [
    "Cell_ID", 
    "Condition", 
    "Spindle_Aspect_Ratio", 
    "Spindle_Length_um", 
    "Spindle_Width_Avg_um", 
    "Spindle_Volume_um3", 
    "Cell_Volume_um3", 
    "SurfaceArea", 
    "CellSurfaceArea_CellVolume_Ratio", 
    "Tubulin_mass_spindle_norm", 
    "Fraction_Tubulin_in_Spindle", 
    "Fraction_SpindleVol_in_Cell", 
    "Tubulin_density_spindle_norm"
]

sub_df = df[subselection]
df_outPath2 = root + "SubselectionDataFrame_Population_Morphometry.txt"
sub_df.to_csv(df_outPath2, sep = "\t")
print("Saved clean subselection dataframe to {}".format(df_outPath2))

Saved clean subselection dataframe to /Volumes/TOB_WD2/Data_Analysis/DataFrames/SubselectionDataFrame_Population_Morphometry.txt


In [17]:
outputDestination = root + "MasterDataFrame_Population_Morphometry.csv"

df.to_csv(outputDestination)
print("The final dataframe was exported to: " + outputDestination)

The final dataframe was exported to: /Volumes/TOB_WD2/Data_Analysis/DataFrames/MasterDataFrame_Population_Morphometry.csv
