In [79]:
import os
import pandas as pd
import numpy as np
import itertools

# Display larger dataframes: show up to 500 columns and up to 100 characters
# per cell before pandas truncates the rendered table.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 100)

# Widen the notebook container to 80% of the browser window.
# `IPython.display` is the public import location for these helpers; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))



 Assemble a main coordinate dataframe that contains all XY coordinates for 
 ALL highzoom events (true positives and false positives)

In [80]:
# Parse coordinate dataframes of early experiments.
# Each early experiment ships one jsonTable.csv inside its own analysis folder;
# the three tables are read in the order the names on the left are listed.
coord_0227, coord_0618, coord_0827 = (
    pd.read_csv(json_table)
    for json_table in (
        "/Volumes/TOB_WD2/Microscopy/201902_EMBL/20190227-182447--analysis_OWN/jsonTable.csv",
        "/Volumes/TOB_WD2/Microscopy/201906_EMBL/auto--20190618/20190618-conc--analysis_OWN/jsonTable.csv",
        "/Volumes/TOB_WD2/Microscopy/201908_EMBL/auto--20190827/20190827-181922--analysis--OWN/jsonTable.csv",
    )
)

In [81]:
def clean_cell_id(x, date_string):
    """Return a cell ID that is prefixed with the experiment name.

    IDs shorter than 28 characters get the sub-position suffix ``"--0000"``
    appended, then ``date_string + "_"`` is prepended.

    The function is idempotent: an ID that already starts with
    ``date_string + "_"`` is returned unchanged, so re-running the notebook
    cell that applies it does not stack the prefix a second time.

    Parameters
    ----------
    x : str
        Raw or already cleaned cell ID.
    date_string : str
        Experiment name used as prefix (e.g. ``"20190227-182447"``).

    Returns
    -------
    str
        The prefixed cell ID.
    """
    prefix = date_string + "_"
    if x.startswith(prefix):
        # Already cleaned by an earlier run of the cell; leave untouched.
        return x

    # NOTE(review): clean_cell_id_2 pads IDs of exactly 28 characters, whereas
    # only IDs *shorter* than 28 are padded here - confirm this matches the
    # naming used in the early experiments.
    name_complete = x + "--0000" if len(x) < 28 else x
    return prefix + name_complete

# Give every early-experiment table an experiment-prefixed Cell_ID and rename
# its X / Y columns to X_px / Y_px. Each table is paired with the experiment
# name that is passed to clean_cell_id as `date_string`.
coord_0227, coord_0618, coord_0827 = (
    frame.assign(
        Cell_ID=frame.Cell_ID.apply(clean_cell_id, date_string=experiment)
    ).rename(columns={"X": "X_px", "Y": "Y_px"})
    for frame, experiment in (
        (coord_0227, "20190227-182447"),
        (coord_0618, "20190618-conc"),
        (coord_0827, "20190827-181922"),
    )
)
coord_0827.head()

Unnamed: 0,X_px,Y_px,Well,Position,Timepoint,Subposition,Cell_ID
0,960.95279,1164.755365,0,1,40,0,20190827-181922_HighZoom--W0000--P0001-T0040--0000
1,1091.001453,637.604651,0,1,46,0,20190827-181922_HighZoom--W0000--P0001-T0046--0000
2,1090.860666,636.489519,0,1,47,0,20190827-181922_HighZoom--W0000--P0001-T0047--0000
3,1108.178201,519.377163,0,1,48,0,20190827-181922_HighZoom--W0000--P0001-T0048--0000
4,990.462025,1151.027426,0,1,48,1,20190827-181922_HighZoom--W0000--P0001-T0048--0001


In [82]:
# Parse coordinate dataframes of new experiments
# Columns kept from every offset table: the high-zoom image file name and the
# two offset columns (named as X / Y offsets in pixels).
subselection = [
    "FileName_HighZoom_IMG",
    "Offset.X.Px_NUM",
    "Offset.Y.Px_NUM",
]

# Folder the finished dataframe is written to, and folder the offset tables
# are read from.
outputFolder = "/Volumes/TOB_WD2/Data_Analysis/DataFrames/"
inputFolder = "/Volumes/TOB_WD2/Data_Analysis/DataFrames/Coordinates/Pre/calcPositions-offsets"

def clean_cell_id_2(x):
    """Strip everything from the first "." on and pad short IDs.

    The text before the first dot is taken as the ID. When that ID is exactly
    28 characters long, "--0000" is appended so that all IDs end up with the
    same length; any other ID is returned as is.
    """
    stem, _, _ = x.partition(".")
    return stem + "--0000" if len(stem) == 28 else stem

def get_Timepoint(x):
    """Return characters 24-27 of the cell ID ``x`` converted to an int."""
    return int(x[24:28])

def get_Position(x):
    """Return the position number encoded in a cell ID.

    The ID is expected to look like ``HighZoom--W0000--P0002-T0002--0000``
    (not yet prefixed with the experiment name). The position is the
    four-digit field that follows ``P``, i.e. characters 18-21. All four
    digits are read, in the same way ``get_Timepoint`` reads its four-digit
    field, so positions of 10 and above are not cut down to their last digit.

    Parameters
    ----------
    x : str
        Cell ID without experiment prefix.

    Returns
    -------
    int
        The position number.
    """
    return int(x[18:22])

def get_Experiment(x):
    """Return the part of the cell ID ``x`` that precedes the first "_"."""
    experiment, _, _ = x.partition("_")
    return experiment

def parse_coordinates(folder):
    """Concatenate all "offset" coordinate tables found directly in ``folder``.

    Every file in the top level of ``folder`` whose name does not start with
    "." is read as a tab-separated table. Sub-directories are not visited.
    Each table is reduced to the columns listed in the module-level
    ``subselection`` and those columns are renamed to ``Cell_ID``, ``X_px``
    and ``Y_px``. ``Timepoint`` and ``Position`` are parsed from the cleaned
    cell ID, and finally the cell ID is prefixed with the part of the file
    name that precedes the first "_".

    Files are processed in sorted name order so the row order of the result
    does not depend on the order in which the file system lists them.

    Parameters
    ----------
    folder : str
        Directory that holds the offset tables.

    Returns
    -------
    pandas.DataFrame
        All tables stacked on top of each other; each table keeps its own
        row index.

    Raises
    ------
    FileNotFoundError
        If ``folder`` cannot be listed (missing, or not a directory).
    ValueError
        If ``folder`` contains no visible files.
    """
    # os.walk yields nothing at all when the folder cannot be listed; surface
    # that as an error instead of silently returning None.
    top_level = next(os.walk(folder), None)
    if top_level is None:
        raise FileNotFoundError(
            "Coordinate folder not found or not a directory: {!r}".format(folder)
        )

    root, _subdirs, filenames = top_level
    filenames = sorted(name for name in filenames if not name.startswith("."))
    if not filenames:
        raise ValueError("No coordinate tables found in {!r}".format(folder))

    dataframes = []
    for filename in filenames:
        # The experiment name is everything before the first underscore.
        date_string = filename.split("_", 1)[0]
        table = pd.read_csv(os.path.join(root, filename), delimiter="\t")
        table = table[subselection].rename(columns={
            "FileName_HighZoom_IMG": "Cell_ID",
            "Offset.X.Px_NUM": "X_px", "Offset.Y.Px_NUM": "Y_px"
        })
        table["Cell_ID"] = table.Cell_ID.apply(clean_cell_id_2)
        # Timepoint / Position use fixed character offsets, so they must be
        # parsed before the experiment prefix is added to the ID.
        table["Timepoint"] = table.Cell_ID.apply(get_Timepoint)
        table["Position"] = table.Cell_ID.apply(get_Position)
        table["Cell_ID"] = table.Cell_ID.apply(
            lambda cell_id, prefix=date_string: prefix + "_" + cell_id
        )
        dataframes.append(table)

    return pd.concat(dataframes)
    
# Load the offset tables of the new experiments from `inputFolder`.
# NOTE: `df` is a generic name that is rebound later in the notebook (to the
# result of identify_duplicates); at this point it holds the parsed offsets.
df = parse_coordinates(inputFolder)
# Preview the first rows of the parsed table.
df.head()

Unnamed: 0,Cell_ID,X_px,Y_px,Timepoint,Position
0,20200724-201611_HighZoom--W0000--P0002-T0002--0000,789.324602,876.134963,2,2
1,20200724-201611_HighZoom--W0000--P0002-T0002--0001,1022.868405,995.492108,2,2
2,20200724-201611_HighZoom--W0000--P0002-T0003--0000,1840.645159,724.591357,3,2
3,20200724-201611_HighZoom--W0000--P0002-T0004--0000,1839.087281,728.982667,4,2
4,20200724-201611_HighZoom--W0000--P0002-T0005--0000,1839.792434,730.979699,5,2


In [83]:
# parse remaining coordinates (0803, 0809, 0810)
# (these were generated separately using a custom script:
# "Parse_Coordinates_via_JSON")

# Read the three remaining coordinate tables in the order the names on the
# left are listed.
coord_0803, coord_0809, coord_0810 = (
    pd.read_csv(coordinates_csv)
    for coordinates_csv in (
        "/Volumes/TOB_WD2/Data_Analysis/DataFrames/Coordinates/Pre/20200803-181830_coordinates.csv",
        "/Volumes/TOB_WD2/Data_Analysis/DataFrames/Coordinates/Pre/20200809-114236_coordinates.csv",
        "/Volumes/TOB_WD2/Data_Analysis/DataFrames/Coordinates/Pre/20200810-225504_coordinates.csv",
    )
)

# Stack all coordinate tables (early experiments, parsed offset tables and the
# three tables read above) into one dataframe.
all_coordinate_tables = [
    coord_0227,
    coord_0618,
    coord_0827,
    df,
    coord_0803,
    coord_0809,
    coord_0810,
]
coord_df = pd.concat(all_coordinate_tables)

In [84]:
# Derive the experiment name from every Cell_ID, keep only the final set of
# columns (in this order) and sort the rows by Cell_ID.
final_subselection = ["Cell_ID", "Experiment", "Timepoint", "Position", "X_px", "Y_px"]
coord_df = (
    coord_df
    .assign(Experiment=coord_df.Cell_ID.apply(get_Experiment))
    [final_subselection]
    .sort_values(by="Cell_ID", ascending=True)
)
coord_df.shape

(25414, 6)

In [87]:
# Identify cells whose spindles were
# imaged multiple times 

def _find_repeat_ids(sub_df, time_tol, xy_tol):
    """Return the set of Cell_IDs in ``sub_df`` that repeat an earlier row."""
    cell_ids = sub_df["Cell_ID"].to_numpy()
    positions = sub_df["Position"].to_numpy()
    timepoints = sub_df["Timepoint"].to_numpy(dtype=float)
    x_px = sub_df["X_px"].to_numpy(dtype=float)
    y_px = sub_df["Y_px"].to_numpy(dtype=float)

    is_repeat = np.zeros(len(sub_df), dtype=bool)
    for i in range(len(sub_df) - 1):
        # Compare row i against all later rows at once. The earlier row is the
        # first argument of np.isclose, the later rows the second.
        same_position = np.asarray(positions[i + 1:] == positions[i], dtype=bool)
        matches = (
            same_position
            & np.isclose(timepoints[i], timepoints[i + 1:], atol=time_tol)
            & np.isclose(x_px[i], x_px[i + 1:], atol=xy_tol)
            & np.isclose(y_px[i], y_px[i + 1:], atol=xy_tol)
        )
        is_repeat[i + 1:] |= matches
    return set(cell_ids[is_repeat])


def identify_duplicates(df, time_tol=3.0, xy_tol=20.0):
    """Flag events that repeat an earlier event of the same experiment.

    Within each experiment, a row is flagged when an *earlier* row (in the
    row order of ``df``) has the same ``Position`` and is close to it in
    ``Timepoint``, ``X_px`` and ``Y_px`` according to ``np.isclose`` with the
    given absolute tolerances (numpy's default relative tolerance also
    applies). Only the later row of a matching pair is flagged; the first
    occurrence keeps ``Has_duplicate == False``.

    Columns are addressed by name, so the result does not depend on the
    column order of ``df``. The flag is attached with ``isin`` instead of a
    merge, so the number of rows never changes, even when a ``Cell_ID``
    occurs more than once.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Cell_ID``, ``Experiment``, ``Timepoint``,
        ``Position``, ``X_px`` and ``Y_px``.
    time_tol : float, optional
        Absolute tolerance on ``Timepoint`` (default 3.0).
    xy_tol : float, optional
        Absolute tolerance on ``X_px`` and ``Y_px``, in pixel coordinates
        (default 20.0).

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` grouped by experiment (in order of first
        appearance) with an added boolean ``Has_duplicate`` column. The row
        index restarts at 0 for every experiment.
    """
    # before matching, slice main dataframe into experiment sub-dataframes
    experiment_names = df.Experiment.unique()
    print(experiment_names)

    processed_dfs = []
    for experiment_name in experiment_names:
        print("Starting analysis of {}.".format(experiment_name))
        sub_df = df[df["Experiment"] == experiment_name].reset_index(drop=True)

        repeat_ids = _find_repeat_ids(sub_df, time_tol, xy_tol)
        processed_dfs.append(
            sub_df.assign(Has_duplicate=sub_df["Cell_ID"].isin(repeat_ids))
        )
        print("Successfully identified duplicate events in {}".format(experiment_name))

    if processed_dfs:
        final_df = pd.concat(processed_dfs)
    else:
        # Empty input: nothing to concatenate, return the empty frame with
        # the extra column so callers always see the same schema.
        final_df = df.assign(
            Has_duplicate=pd.Series(False, index=df.index, dtype=bool)
        )
    print("Finished analysis of all experiments")
    return final_df

# Flag repeated events per experiment. This rebinds `df` (which earlier held
# the parse_coordinates result) to the flagged copy of coord_df.
df = identify_duplicates(coord_df)
# Preview the first rows, including the new Has_duplicate column.
df.head()

['20190227-182447' '20190618-conc' '20190827-181922' '20200724-201611'
 '20200725-142832' '20200728-174144' '20200730-190931' '20200731-175845'
 '20200802-104739' '20200803-181830' '20200806-164803' '20200807-174159'
 '20200809-114236' '20200810-225504']
Starting analysis of 20190227-182447.
Successfully identified duplicate events in 20190227-182447
Starting analysis of 20190618-conc.
Successfully identified duplicate events in 20190618-conc
Starting analysis of 20190827-181922.
Successfully identified duplicate events in 20190827-181922
Starting analysis of 20200724-201611.
Successfully identified duplicate events in 20200724-201611
Starting analysis of 20200725-142832.
Successfully identified duplicate events in 20200725-142832
Starting analysis of 20200728-174144.
Successfully identified duplicate events in 20200728-174144
Starting analysis of 20200730-190931.
Successfully identified duplicate events in 20200730-190931
Starting analysis of 20200731-175845.
Successfully identified d

Unnamed: 0,Cell_ID,Experiment,Timepoint,Position,X_px,Y_px,Has_duplicate
0,20190227-182447_HighZoom--W0000--P0001-T0004--0000,20190227-182447,4,1,937.224719,634.629213,False
1,20190227-182447_HighZoom--W0000--P0001-T0004--0001,20190227-182447,4,1,173.120841,1222.898424,False
2,20190227-182447_HighZoom--W0000--P0001-T0006--0000,20190227-182447,6,1,641.701997,310.236559,False
3,20190227-182447_HighZoom--W0000--P0001-T0007--0000,20190227-182447,7,1,641.594142,308.518828,True
4,20190227-182447_HighZoom--W0000--P0001-T0008--0000,20190227-182447,8,1,638.776012,307.822254,True


In [88]:
# Write the flagged coordinate table to the output folder. os.path.join keeps
# the path valid whether or not `outputFolder` ends with a path separator.
df.to_csv(os.path.join(outputFolder, "MasterDataFrame_Coordinates.csv"))