In [11]:
import pandas as pd
import numpy as np
from collections import Counter

from IPython.core.display import display, HTML
# Inject a CSS rule so the notebook container uses 80% of the browser width.
display(HTML("<style>.container { width:80% !important; }</style>"))

In [12]:
# Base directory of the tracking data; every path below is built by string
# concatenation and therefore carries a trailing "/".
root = "/Volumes/TOB_WD2/Data_Analysis/DataFrames/Tracking" + "/"
# Folder the tracking CSV files are read from (passed as `datadir` below).
inputFolder = root + "Input_Tracking" + "/"
# Folder the per-position lineage tables are written to (default `outdir` below).
outputFolder = root + "Output_Lineages" + "/"

def lineages(x, links_dataframe):
    """Build the dotted lineage string for one link row.

    Starting at the row's source spot, the links table is walked backwards
    (from a spot to the link that has it as target) until no incoming link
    is found. Every splitting-event source met on the way is recorded.

    Parameters
    ----------
    x : row of the links table; must expose `Source_ID` and `Splitting_event`.
    links_dataframe : DataFrame with `Source_ID`, `Target_ID` and
        `Splitting_event` columns.

    Returns
    -------
    str or None
        Splitting-event spot IDs joined by ".", oldest ancestor first, or
        None when no splitting event lies on the path.
    """
    current_id = x.Source_ID
    # The row's own source opens the chain only if it is itself a split.
    ancestors = [str(current_id)] if x.Splitting_event else []

    # Link(s) whose target is the current spot, i.e. the step towards the root.
    incoming = links_dataframe.loc[links_dataframe['Target_ID'] == current_id, :]
    while not incoming.empty:
        parent_id = incoming.Source_ID.values[0]
        if incoming.Splitting_event.values[0]:
            ancestors.append(str(parent_id))
        current_id = parent_id
        incoming = links_dataframe.loc[links_dataframe['Target_ID'] == current_id, :]

    if not ancestors:
        return None
    # IDs were collected youngest-first; flip them so the root comes first.
    return ".".join(ancestors[::-1])

def get_mother(x):
    """Return the second-to-last entry of a dotted lineage string.

    `x` is a string such as "12.40.77"; the result is "40". A lineage with
    a single entry has no mother, in which case None is returned.
    """
    ancestors = x.split(".")
    return ancestors[-2] if len(ancestors) != 1 else None

def get_grandmother(x):
    """Return the third-to-last entry of a dotted lineage string.

    `x` is a string such as "12.40.77"; the result is "12". Lineages with
    fewer than three entries have no grandmother and yield None.
    """
    ancestors = x.split(".")
    return None if len(ancestors) < 3 else ancestors[-3]
    
def get_sister(x, dataframe):
    """Find the sister of row `x` inside `dataframe`.

    A sister is another row (different `Source_ID`) with the same
    `Mother_ID` and the same `Position`. The sister's ID is taken as the
    last entry of the first matching row's `Lineage` string.

    Returns the sister ID as a string, or None when `x` has no mother or
    no matching row exists.
    """
    if x.Mother_ID is None:
        return None

    same_mother = dataframe.Mother_ID == x.Mother_ID
    same_position = dataframe.Position == x.Position
    other_cell = dataframe.Source_ID != x.Source_ID
    candidates = dataframe.loc[same_mother & same_position & other_cell, :]

    if candidates.shape[0] == 0:
        return None
    # Only the first match is used, even if several rows qualify.
    return candidates.iloc[0]['Lineage'].split(".")[-1]

def seensister(x, already_seen_list):
    """Flag whether a cell from the same sister pair was already visited.

    Rows without a sister yield None. Otherwise the row's `Mother_ID` is
    looked up in `already_seen_list`: the first row of a pair registers the
    mother (the list is mutated in place) and gets False, any later row
    sharing that mother gets True.
    """
    if x.Sister_ID is None:
        return None

    seen_before = x.Mother_ID in already_seen_list
    if not seen_before:
        already_seen_list.append(x.Mother_ID)
    return seen_before

def get_aunt(x, dataframe):
    """Return the aunt of row `x`, i.e. the sister of its mother.

    The mother's row is located by matching `Source_ID` against
    `int(x.Mother_ID)` and its `Sister_ID` entry is returned.

    Parameters
    ----------
    x : row exposing `Mother_ID` (a string spot ID, or None).
    dataframe : DataFrame with `Source_ID` and `Sister_ID` columns.

    Returns
    -------
    The mother's `Sister_ID` value (may itself be None), or None when `x`
    has no mother or the mother has no row in `dataframe`.

    Raises
    ------
    ValueError
        If more than one row matches the mother's ID (raised by `.item()`).
    """
    if x.Mother_ID is None:
        return None

    aunt_ID = dataframe.loc[dataframe.Source_ID == int(x.Mother_ID), "Sister_ID"]
    # `.loc` always yields a Series, never None, so "mother not found" has to
    # be detected via emptiness; calling `.item()` on an empty Series raises.
    if aunt_ID.empty:
        return None
    return aunt_ID.item()

def get_cousin(x, dataframe):
    """Return the cousin(s) of row `x`.

    Cousins are rows that share `Grandmother_ID` with `x` but have a
    different `Mother_ID` (and a different `Source_ID`).

    Parameters
    ----------
    x : row exposing `Grandmother_ID`, `Mother_ID` and `Source_ID`.
    dataframe : DataFrame with the same three columns.

    Returns
    -------
    tuple, scalar or None
        A 2-tuple of `Source_ID` values when exactly two cousins exist, a
        single `Source_ID` when exactly one exists, and None otherwise
        (no grandmother, no cousin, or more than two matching rows).
    """
    if x.Grandmother_ID is None:
        return None

    is_cousin = (
        (dataframe.Grandmother_ID == x.Grandmother_ID) &
        (dataframe.Source_ID != x.Source_ID) &
        (dataframe.Mother_ID != x.Mother_ID)
    )
    # Select the ID column by label rather than by position so the result
    # does not depend on the column order of `dataframe`; `tolist()` yields
    # plain Python scalars.
    cousin_ids = dataframe.loc[is_cousin, "Source_ID"].tolist()

    if len(cousin_ids) == 2:
        return cousin_ids[0], cousin_ids[1]
    if len(cousin_ids) == 1:
        return cousin_ids[0]
    return None

def get_random(x, dataframe, random_state = None):
    """Pick a random partner cell for row `x` from a different track.

    Candidates are all rows of `dataframe` at the same `Position` whose
    `Track_ID` differs from that of `x`, so partners are never drawn from
    the row's own track. Individual relatives (sister, mother, aunt, ...)
    are not filtered separately.

    Parameters
    ----------
    x : row exposing `Position` and `Track_ID`.
    dataframe : DataFrame with `Position`, `Track_ID` and `Source_ID` columns.
    random_state : optional seed or random state forwarded to
        `Series.sample`; the default None keeps the draw unseeded.

    Returns
    -------
    The `Source_ID` of one randomly drawn candidate, or None when there is
    no candidate (e.g. the position contains a single track).
    """
    candidates = dataframe.loc[
        (dataframe.Position == x.Position) &
        (dataframe.Track_ID != x.Track_ID),  # Don't pair within same lineage
        "Source_ID"
    ]
    # Sampling from an empty pool raises, so report "no partner" instead.
    if candidates.empty:
        return None
    return candidates.sample(1, random_state = random_state).item()

def seengranny(x, granny_seen_list):
    """Flag whether a cell with the same grandmother was already visited.

    Rows without a grandmother yield None. Otherwise the first row seen for
    a given `Grandmother_ID` registers it in `granny_seen_list` (mutated in
    place) and gets False; every later row with that grandmother gets True.
    """
    if x.Grandmother_ID is None:
        return None

    seen_before = x.Grandmother_ID in granny_seen_list
    if not seen_before:
        granny_seen_list.append(x.Grandmother_ID)
    return seen_before
        
#def getDataset(x):
    # annotate parent dataset (important for 
    # interrupted experiments)
#    experiment = x.Experiment
    
#    if experiment == "20200725-142832":
#        return "20200724"
#    elif experiment in ["20200802-104739", "20200803-181830"]:
#        print(experiment)
#        return "20200731"
#    elif experiment in ["20200809-114236", "20200810-225504"]:
#        return "20200807"
#    else:
#        return experiment.split("-")[0]

def correct_subexperiment(x, last_timepoint1, last_timepoint2, old_subexperiment, new_subexperiment1, new_subexperiment2):
    """Map a timepoint to the sub-experiment label it falls into.

    x <= last_timepoint1 (or no other rule applies)  -> old_subexperiment
    last_timepoint1 < x <= last_timepoint2            -> new_subexperiment1
    x > last_timepoint2                               -> new_subexperiment2
    """
    if last_timepoint1 < x <= last_timepoint2:
        return new_subexperiment1
    if x > last_timepoint2:
        return new_subexperiment2
    return old_subexperiment
    
def tracking_dataframes(datadir, dataset, position, outdir = outputFolder):
    """Build, annotate and export the lineage table of one dataset/position.

    Steps:
      1. Read the "Links in tracks" and "Spots in tracks" CSV files for
         `dataset` / `position` from `datadir`.
      2. Mark splitting events (spots that occur more than once as a link
         source) and compute the lineage string of every link.
      3. Keep only splitting events, merge links with spots and add
         timepoint, position, experiment and dataset columns.
      4. Derive generation, mother, grandmother, sister, aunt, cousin and a
         random partner, plus the `Seen_sister` / `Seen_granny` flags.
      5. Write the table to `outdir` as "<dataset>_P<position>_lineages.csv".

    Parameters
    ----------
    datadir : str, input folder path including the trailing separator.
    dataset : str, experiment identifier used in the file names.
    position : str, position number inserted after "P000" in the file names.
    outdir : str, output folder path including the trailing separator.

    Returns
    -------
    pandas.DataFrame
        One row per splitting-event spot, sorted by index.

    Notes
    -----
    The row shuffle and the random-partner draw are unseeded, so
    `Seen_sister`, `Seen_granny` and `Random_ID` differ between runs.
    """
    links = pd.read_csv(
        datadir + "{}-P000{}_Links_in_tracks_statistics.csv".format(dataset, position),
        usecols = ['TRACK_ID', 'SPOT_SOURCE_ID', 'SPOT_TARGET_ID']
    ).rename(columns = {
        "TRACK_ID": "Track_ID",
        "SPOT_SOURCE_ID": "Source_ID",
        "SPOT_TARGET_ID": "Target_ID"
    })

    spots = pd.read_csv(
        datadir + "{}-P000{}_Spots_in_tracks_statistics.csv".format(dataset, position),
        usecols = ['ID', 'TRACK_ID', 'POSITION_X', 'POSITION_Y', 'FRAME']
    ).rename(columns = {
        "TRACK_ID": "Track_ID",
        "POSITION_X": "Track_Coordinate_X",
        "POSITION_Y": "Track_Coordinate_Y",
        "FRAME": "Frame"
    })

    # Spot IDs that correspond to a splitting event
    # (per definition, a splitting event is labelled twice as "source").
    # A set keeps the membership tests below cheap.
    source_id_counts = Counter(links["Source_ID"])
    splitting_event_ids = {
        spot_id for spot_id, n_links in source_id_counts.items() if n_links > 1
    }

    # Boolean flag on both tables: does the spot / the link's source belong
    # to a splitting event?
    spots["Splitting_event"] = spots["ID"].isin(splitting_event_ids)
    links["Splitting_event"] = links["Source_ID"].isin(splitting_event_ids)

    # Lineages are resolved on the full links table, before filtering,
    # because the walk towards the root also passes non-splitting links.
    links['Lineage'] = links.apply(lineages, args = (links,), axis = 1)
    print("Successfully annotated lineage information.")

    # Keep splitting events only; build new frames instead of mutating the
    # filtered slices in place.
    split_links = links[links["Splitting_event"]]
    split_spots = spots[spots["Splitting_event"]].rename(columns = {"ID": "Source_ID"})

    df = (
        pd.merge(split_links, split_spots, how = "outer", on = ["Source_ID", "Track_ID", "Splitting_event"])
        .drop(["Target_ID"], axis = 1)
        # a splitting spot is the source of several links; keep one row each
        .drop_duplicates(subset = ['Source_ID'])
    )

    df["Timepoint"] = df.Frame.apply(lambda x: int(x + 1)) # Frames (Trackmate) are 0 index
    df["Position"] = position
    df["Experiment"] = dataset

    df["Dataset"] = df.Experiment.str.split("-").str.get(0)

    # Datasets whose "Experiment" label changes with the timepoint:
    # dataset -> (last_timepoint1, last_timepoint2,
    #             new_subexperiment1, new_subexperiment2)
    subexperiment_corrections = {
        "20200724-201611": (93, 93, "20200725-142832", "20200725-142832"),
        "20200731-175845": (225, 374, "20200802-104739", "20200803-181830"),
        "20200807-174159": (199, 378, "20200809-114236", "20200810-225504"),
    }
    if dataset in subexperiment_corrections:
        last_timepoint1, last_timepoint2, new_subexperiment1, new_subexperiment2 = \
            subexperiment_corrections[dataset]
        df["Experiment"] = df.Timepoint.apply(
            correct_subexperiment,
            last_timepoint1 = last_timepoint1,
            last_timepoint2 = last_timepoint2,
            old_subexperiment = dataset,
            new_subexperiment1 = new_subexperiment1,
            new_subexperiment2 = new_subexperiment2
        )

    df["Generation"] = df["Lineage"].apply(lambda x: x.count(".") + 1)
    df["Mother_ID"] = df["Lineage"].apply(get_mother)
    df["Grandmother_ID"] = df["Lineage"].apply(get_grandmother)
    df["Sister_ID"] = df.apply(get_sister, dataframe = df, axis = 1)

    # Randomising order of dataframe to
    # avoid that sisters with shortest cell cycles
    # are 'False' for Seen_sister or seen_granny.
    df = df.sample(frac = 1) # shuffle rows

    # This will allow to sample one cell out of sister pair
    seen_sister_list = []
    print("Populating sister list")
    df["Seen_sister"] = df.apply(seensister, already_seen_list = seen_sister_list, axis = 1)
    df["Aunt_ID"] = df.apply(get_aunt, dataframe = df, axis = 1)
    df["Cousin_ID"] = df.apply(get_cousin, dataframe = df, axis = 1)
    df["Random_ID"] = df.apply(get_random, dataframe = df, axis = 1)

    # This will allow to sample one cell out of cousin quartett
    seen_granny_list = []
    print("Populating granny list")
    df["Seen_granny"] = df.apply(seengranny, granny_seen_list = seen_granny_list, axis = 1)

    # sort back to index order after shuffling
    df = df.sort_index()
    destination = outdir + "{}_P{}_lineages.csv".format(dataset, position)
    df.to_csv(destination)
    print("Successfully exported lineage dataframe to " + destination)
    return df

In [13]:
# Build and export one lineage table per dataset/position pair.
# Positional arguments are (datadir, dataset, position); every call reads the
# tracking CSV files from `inputFolder` and writes a lineage CSV as a side effect.
df_0227_p1 = tracking_dataframes(inputFolder, "20190227-182447", "1")
df_0227_p2 = tracking_dataframes(inputFolder, "20190227-182447", "2")

df_0618_p1 = tracking_dataframes(inputFolder, "20190618-conc", "1")
df_0618_p2 = tracking_dataframes(inputFolder, "20190618-conc", "2")

df_0827_p1 = tracking_dataframes(inputFolder, "20190827-181922", "1")
df_0827_p2 = tracking_dataframes(inputFolder, "20190827-181922", "2")

df_0724_p1 = tracking_dataframes(inputFolder, "20200724-201611", "1")
df_0724_p2 = tracking_dataframes(inputFolder, "20200724-201611", "2")

# disabled: df_0728_p1 = tracking_dataframes(inputFolder, "20200728-174144", "1")
df_0728_p2 = tracking_dataframes(inputFolder, "20200728-174144", "2")

df_0730_p1 = tracking_dataframes(inputFolder, "20200730-190931", "1")
# disabled: df_0730_p2 = tracking_dataframes(inputFolder, "20200730-190931", "2")

df_0731_p1 = tracking_dataframes(inputFolder, "20200731-175845", "1")
df_0731_p2 = tracking_dataframes(inputFolder, "20200731-175845", "2")

# disabled: df_0807_p1 = tracking_dataframes(inputFolder, "20200807-174159", "1")
df_0807_p2 = tracking_dataframes(inputFolder, "20200807-174159", "2")
# disabled: df_0807_p3 = tracking_dataframes(inputFolder, "20200807-174159", "3")
# disabled: df_0807_p4 = tracking_dataframes(inputFolder, "20200807-174159", "4")

Successfully annotated lineage information.
Populating sister list
Populating granny list
Successfully exported lineage dataframe to /Volumes/TOB_WD2/Data_Analysis/DataFrames/Tracking/Output_Lineages/20190227-182447_P1_lineages.csv
Successfully annotated lineage information.
Populating sister list
Populating granny list
Successfully exported lineage dataframe to /Volumes/TOB_WD2/Data_Analysis/DataFrames/Tracking/Output_Lineages/20190227-182447_P2_lineages.csv
Successfully annotated lineage information.
Populating sister list
Populating granny list
Successfully exported lineage dataframe to /Volumes/TOB_WD2/Data_Analysis/DataFrames/Tracking/Output_Lineages/20190618-conc_P1_lineages.csv
Successfully annotated lineage information.
Populating sister list
Populating granny list
Successfully exported lineage dataframe to /Volumes/TOB_WD2/Data_Analysis/DataFrames/Tracking/Output_Lineages/20190618-conc_P2_lineages.csv
Successfully annotated lineage information.
Populating sister list
Populatin

In [14]:
# All lineage tables produced by the tracking_dataframes(...) calls, one entry
# per dataset/position pair. This list is the default input of
# get_lineage_statistics.
dataframes = [
    df_0227_p1, 
    df_0227_p2, 
    df_0618_p1, 
    df_0618_p2, 
    df_0827_p1, 
    df_0827_p2, 
    df_0724_p1, 
    df_0724_p2, 
    df_0728_p2, 
    df_0730_p1, 
    df_0731_p1, 
    df_0731_p2, 
    df_0807_p2
]

In [15]:
from statistics import mean

def get_lineage_statistics(dataframelist = dataframes, outDir = outputFolder):
    """Summarise every lineage table in `dataframelist` as one row of statistics.

    For each table the summary holds the dataset and position labels, the
    number of tracks, the maximum generation, the number of non-null
    entries in each ID column, the mean number of rows per track, and,
    for each relation (sister, mother, grandmother, aunt, cousin), the
    per-track fraction of rows with a non-null ID averaged over all tracks.

    Parameters
    ----------
    dataframelist : list of DataFrames as returned by `tracking_dataframes`.
    outDir : unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per input table. Every row carries index label 0.

    Raises
    ------
    ValueError
        If `dataframelist` is empty (raised by `pd.concat`).
    """
    # (label used in the summary column name, column of the lineage table)
    relation_columns = [
        ("Sisters", "Sister_ID"),
        ("Mothers", "Mother_ID"),
        ("Grandmothers", "Grandmother_ID"),
        ("Aunts", "Aunt_ID"),
        ("Cousin", "Cousin_ID"),
    ]
    dataframes_for_concat = []

    for data in dataframelist:
        No_of_tracks = data.Track_ID.nunique()
        # One sub-table per track, in order of first appearance.
        list_of_trackSubData = [
            sub_data for _, sub_data in data.groupby("Track_ID", sort = False)
        ]

        statistics_dict = {
            # Read the labels positionally from the first row, so a table
            # whose index lacks label 0 still works.
            "Dataset": data["Dataset"].iloc[0],
            "Position": data["Position"].iloc[0],
            "Number_of_tracks": No_of_tracks,
            "Max_No_of_Generations": data.Generation.max(),
            "Number_of_Source_IDs": data.Source_ID.dropna().shape[0],
            "Number_of_Sister_IDs": data.Sister_ID.dropna().shape[0],
            "Number_of_Cousin_IDs": data.Cousin_ID.dropna().shape[0],
            "Number_of_Mother_IDs": data.Mother_ID.dropna().shape[0],
            "Number_of_Aunt_IDs": data.Aunt_ID.dropna().shape[0],
            "Number_of_Grandmother_IDs": data.Grandmother_ID.dropna().shape[0],
            "Number_of_Random_IDs": data.Random_ID.dropna().shape[0],
            "Average_No_of_SplittingEvents_per_Track": data.shape[0] / No_of_tracks,
        }

        # Per track: share of rows with a non-null relative ID; then the
        # plain mean of those shares over all tracks.
        for label, column in relation_columns:
            per_track_fraction = [
                sub_data[column].dropna().shape[0] / sub_data.shape[0]
                for sub_data in list_of_trackSubData
            ]
            statistics_dict["Average_No_of_{}_per_Splitting_event".format(label)] = mean(per_track_fraction)

        dataframes_for_concat.append(pd.DataFrame(statistics_dict, index = [0]))

    final_df = pd.concat(dataframes_for_concat)
    return final_df

# Compute the summary for the default list of lineage tables and display it
# as the cell output.
statistics_df = get_lineage_statistics()
statistics_df

Unnamed: 0,Dataset,Position,Number_of_tracks,Max_No_of_Generations,Number_of_Source_IDs,Number_of_Sister_IDs,Number_of_Cousin_IDs,Number_of_Mother_IDs,Number_of_Aunt_IDs,Number_of_Grandmother_IDs,Number_of_Random_IDs,Average_No_of_SplittingEvents_per_Track,Average_No_of_Sisters_per_Splitting_event,Average_No_of_Mothers_per_Splitting_event,Average_No_of_Grandmothers_per_Splitting_event,Average_No_of_Aunts_per_Splitting_event,Average_No_of_Cousin_per_Splitting_event
0,20190227,1,25,7,652,580,531,627,563,579,652,26.08,0.852528,0.939052,0.825793,0.793158,0.722729
0,20190227,2,36,5,318,238,174,264,193,203,318,8.833333,0.588667,0.67517,0.468405,0.442557,0.390039
0,20190618,1,33,8,654,520,490,621,537,570,654,19.818182,0.588093,0.745848,0.538699,0.496309,0.430761
0,20190618,2,43,7,595,466,365,552,438,477,595,13.837209,0.654496,0.815113,0.614717,0.524985,0.398562
0,20190827,1,24,7,462,400,372,436,400,406,462,19.25,0.523893,0.598117,0.456574,0.441421,0.413252
0,20190827,2,177,8,1079,628,420,848,533,617,1079,6.096045,0.335369,0.514894,0.266064,0.213633,0.154057
0,20200724,1,143,6,523,312,125,380,151,171,523,3.657343,0.456677,0.590335,0.198868,0.168232,0.132992
0,20200724,2,138,5,455,262,114,317,150,174,455,3.297101,0.335157,0.436767,0.172388,0.140849,0.094459
0,20200728,2,26,2,59,20,0,33,0,0,59,2.269231,0.25641,0.50641,0.0,0.0,0.0
0,20200730,1,110,2,134,12,0,24,0,0,134,1.218182,0.036364,0.090909,0.0,0.0,0.0


In [16]:
# Persist the summary table as CSV; the DataFrame index is written as the
# first column.
statistics_df.to_csv("/Volumes/TOB_WD2/Data_Analysis/DataFrames/MetaStatistics/MetaStatistics_TrackingLineages.csv")
print("Finished saving the metastatistics for tracking.")

Finished saving the metastatistics for tracking.
