In [42]:
import pandas as pd
import os
import math

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [43]:
root = "/Volumes/TOB_WD2/Data_Analysis/DataFrames" + "/"
lineages = root + "/Tracking/Output_Lineages" + "/"

def conc_lineage_df(in_dir):
    dataframes = []
    for subdir, dirs, files in os.walk(in_dir):
        for file in files:
            if not file.startswith("."):
                filepath = subdir + os.sep + file  
                if filepath.endswith(".csv"):
                    single_df = pd.read_csv(filepath)
                    dataframes.append(single_df)    
                else:
                    pass
            else:
                pass
    df = pd.concat(dataframes)
    return df
     
lineages_df = conc_lineage_df(lineages)
times_df = pd.read_csv(root + "MasterDataFrame_Times.csv")

In [44]:
# Preparing times for merging 
def parent_dataset(x):
        datasets = [
            "20190227-182447", 
            "20190618-conc", 
            "20190827-181922",
            "20200724-201611",
            "20200728-174144", 
            "20200730-190931",
            "20200731-175845",
            "20200806-164803",
            "20200807-174159"
        ]

        if x in datasets:
            return x.split("-")[0]

        elif (x == "20200802-104739") or (x == "20200803-181830"):
            return "20200731"
        elif x == "20200725-142832":
            return "20200724"
        elif (x == "20200809-114236") or (x == "20200810-225504"):
            return "20200807"
        else:
            print("Does not apply?: " + x)

def correct_timepoints(x):
    subexperiment_key = x.Subexperiment
    timepoint = x.Timepoint
    timepoint_dir = {"20200725-142832": 93, 
                     "20200802-104739": 225, 
                     "20200803-181830": 374, 
                     "20200809-114236": 199, 
                     "20200810-225504": 378
                    }
    if subexperiment_key in timepoint_dir:
        new_timepoint = timepoint + timepoint_dir[subexperiment_key]
        return new_timepoint
    else:
        return timepoint

            
lineages_df.rename(columns = {"Experiment": "Subexperiment"}, inplace = True)
lineages_df["Experiment"] = lineages_df.Subexperiment.apply(parent_dataset)
print(lineages_df.Experiment.unique())
times_df["Subexperiment"] = times_df.LowZoom_ID.str.split("_").str.get(0)
times_df["Experiment"] = times_df.Subexperiment.apply(parent_dataset)  

times_df["Timepoint"] = times_df.LowZoom_ID.apply(lambda x: int(x[-4:]))
times_df["Timepoint"] = times_df.apply(correct_timepoints, axis = 1)
times_df["Position"] = times_df.LowZoom_ID.apply(lambda x: int(x[-7]))
destination_times = root + "MasterDataFrame_Times_II.csv"
times_df.to_csv(destination_times)

['20200724' '20190227' '20190618' '20190827' '20200731' '20200730'
 '20200807' '20200728']


In [45]:
print(lineages_df.Subexperiment.unique())
print(times_df.Subexperiment.unique())

['20200724-201611' '20200725-142832' '20190227-182447' '20190618-conc'
 '20190827-181922' '20200731-175845' '20200802-104739' '20200803-181830'
 '20200730-190931' '20200807-174159' '20200809-114236' '20200728-174144']
['20190227-182447' '20190827-181922' '20200724-201611' '20200725-142832'
 '20200728-174144' '20200730-190931' '20200731-175845' '20200802-104739'
 '20200803-181830' '20200806-164803' '20200807-174159' '20200809-114236'
 '20200810-225504' '20190618-conc']


In [46]:
df = times_df.merge(lineages_df, how = "inner", on = ["Timepoint", "Experiment", "Subexperiment", "Position"]) 
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

destination = root + "MasterDataFrame_MergeLineagesTimes.csv"
df.to_csv(destination)

In [47]:
# Quality control meta statistics after merging 
# Times and Lineages

from statistics import mean

def get_lineage_statistics(dataframe = df):
    dataframelist = dataframe.Experiment.unique()
    positions = [1, 2]
    dataframes_for_concat = []
    for data_name in dataframelist:
        for position in positions:    

            data = dataframe[(dataframe["Experiment"] == data_name) & (dataframe["Position"] == position)]
            dataset = data_name
            position = position
            max_generation = data.Generation.max()
            No_of_source_ids = data.Source_ID.dropna().shape[0]
            No_of_sister_ids = data.Sister_ID.dropna().shape[0]
            No_of_mother_ids = data.Mother_ID.dropna().shape[0]
            No_of_aunt_ids = data.Aunt_ID.dropna().shape[0]
            No_of_cousin_ids = data.Cousin_ID.dropna().shape[0]
            No_of_grandmother_ids = data.Grandmother_ID.dropna().shape[0]

            statistics_dict = {"Dataset": dataset, 
                               "Position": position, 
                               "Max_No_of_Generations": max_generation,
                               "Number_of_Source_IDs": No_of_source_ids,
                               "Number_of_Sister_IDs": No_of_sister_ids,
                               "Number_of_Cousin_IDs": No_of_cousin_ids,
                               "Number_of_Mother_IDs": No_of_mother_ids,
                               "Number_of_Aunt_IDs": No_of_aunt_ids,
                               "Number_of_Grandmother_IDs": No_of_grandmother_ids,
                              }
            df = pd.DataFrame(statistics_dict, index = [0])
            dataframes_for_concat.append(df)
    
    final_df = pd.concat(dataframes_for_concat)
    return final_df

statistics_df = get_lineage_statistics()
statistics_df

Unnamed: 0,Dataset,Position,Max_No_of_Generations,Number_of_Source_IDs,Number_of_Sister_IDs,Number_of_Cousin_IDs,Number_of_Mother_IDs,Number_of_Aunt_IDs,Number_of_Grandmother_IDs
0,20190227,1,7.0,652,580,531,627,563,579
0,20190227,2,5.0,318,238,174,264,193,203
0,20190827,1,7.0,462,400,372,436,400,406
0,20190827,2,8.0,1079,628,420,848,533,617
0,20200724,1,6.0,523,312,125,380,151,171
0,20200724,2,5.0,455,262,114,317,150,174
0,20200728,1,,0,0,0,0,0,0
0,20200728,2,2.0,59,20,0,33,0,0
0,20200730,1,2.0,134,12,0,24,0,0
0,20200730,2,,0,0,0,0,0,0


In [48]:
def get_cell_cycle(x, dataframe):
    daughter_time = x.Experiment_Time_mins
    if x.Generation < 2:
        return None
    else:
        mother_row = dataframe.loc[
            (dataframe.Experiment == x.Experiment) &
 #           (dataframe.Subexperiment == x.Subexperiment) & #mothers can be in older subexperiments!
            (dataframe.Position == x.Position) &
            (dataframe.Source_ID == x.Mother_ID)
        ]
        if mother_row["Experiment_Time_mins"].shape[0] == 1:   
            mother_time = mother_row["Experiment_Time_mins"].item()
            cell_cycle = daughter_time - mother_time
            return cell_cycle
        else:
            return None

df["Cell_Cycle_mins"] = df.apply(get_cell_cycle, dataframe = df, axis = 1)

print("Finished calculating cell cycles.")
df.tail()

Finished calculating cell cycles.


Unnamed: 0,Time,LowZoom_ID,Timestamp,Experiment_StartTime_mins,Experiment_Time_mins,Subexperiment,Experiment,Timepoint,Position,Track_ID,...,Generation,Mother_ID,Grandmother_ID,Sister_ID,Seen_sister,Aunt_ID,Cousin_ID,Random_ID,Seen_granny,Cell_Cycle_mins
5906,2019/06/22 10:11:53,20190618-conc_LowZoom--W0000--P0001-T0370,1561191000.0,1560879000.0,5194.05,20190618-conc,20190618,370,1,4,...,7,2032.0,1993.0,2042.0,False,2003.0,2016,3207,False,1282.516667
5907,2019/06/22 10:11:53,20190618-conc_LowZoom--W0000--P0001-T0370,1561191000.0,1560879000.0,5194.05,20190618-conc,20190618,370,1,5,...,6,2941.0,2928.0,2970.0,True,2995.0,,666,True,980.366667
5908,2019/06/22 10:11:53,20190618-conc_LowZoom--W0000--P0001-T0370,1561191000.0,1560879000.0,5194.05,20190618-conc,20190618,370,1,6,...,8,4423.0,4400.0,4433.0,False,4410.0,,286,False,756.266667
5909,2019/06/22 10:11:53,20190618-conc_LowZoom--W0000--P0001-T0370,1561191000.0,1560879000.0,5194.05,20190618-conc,20190618,370,1,6,...,8,4492.0,4482.0,4512.0,True,4528.0,"(4554, 4538)",13824,True,756.266667
5910,2019/06/22 10:11:53,20190618-conc_LowZoom--W0000--P0001-T0370,1561191000.0,1560879000.0,5194.05,20190618-conc,20190618,370,1,7,...,7,5123.0,5113.0,,,5146.0,,15060,False,822.583333


In [49]:
df.columns

Index(['Time', 'LowZoom_ID', 'Timestamp', 'Experiment_StartTime_mins',
       'Experiment_Time_mins', 'Subexperiment', 'Experiment', 'Timepoint',
       'Position', 'Track_ID', 'Source_ID', 'Splitting_event', 'Lineage',
       'Track_Coordinate_X', 'Track_Coordinate_Y', 'Frame', 'Dataset',
       'Generation', 'Mother_ID', 'Grandmother_ID', 'Sister_ID', 'Seen_sister',
       'Aunt_ID', 'Cousin_ID', 'Random_ID', 'Seen_granny', 'Cell_Cycle_mins'],
      dtype='object')

In [50]:
# Rename to have consistent nomenclature of columns

df = df.rename(columns = {"Subexperiment": "Experiment", "Experiment": "Dataset"})

In [51]:
# Merge dataframe with morphometrics dataframe

#SOMETHING DOESN'T WORK, THERE ARE CELL CYCLES WITHOUT LINEAGE WHICH IS IMPOSSIBLE
#ALSO, reduced number of i.e. CV is because that's the one which have a lineage

sub_df = df[['Time', 'LowZoom_ID', 'Experiment_Time_mins', 'Experiment', 'Dataset',
       'Timepoint', 'Position', 'Track_ID', 'Source_ID', 'Splitting_event',
       'Lineage', 'Track_Coordinate_X', 'Track_Coordinate_Y', 'Frame',
       'Dataset', 'Generation', 'Mother_ID', 'Grandmother_ID', 'Sister_ID',
       'Seen_sister', 'Aunt_ID', 'Cousin_ID', 'Random_ID', 'Seen_granny']] ######## TEST Source_ID and Dataset and Position or so
matches_df = pd.read_csv(root + "MasterDataFrame_MatchesLineages.csv")
matches_df.columns

Index(['Unnamed: 0', 'Track_ID', 'Source_ID', 'Splitting_event', 'Lineage',
       'Track_Coordinate_X', 'Track_Coordinate_Y', 'Frame', 'Generation',
       'Mother_ID', 'Grandmother_ID', 'Sister_ID', 'Seen_sister', 'Aunt_ID',
       'Cousin_ID', 'Random_ID', 'Seen_granny', 'Delta_x', 'Delta_y',
       'Delta_t', 'Cell_ID', 'LowZoom_ID', 'Dataset', 'Experiment', 'Position',
       'Condition', 'Has_duplicate', 'X_px', 'Y_px', 'Timepoint', 'Time',
       'Differentiation_mins', 'Differentiation_bins', 'Experiment_Time_mins',
       'Chromatin_Dilation', 'Chromatin_Volume_um3', 'DNA_Volume_Threshold',
       'MetaphasePlate_Length_um', 'MetaphasePlate_Width_um',
       'Spindle_Angle_Degrees', 'Spindle_Aspect_Ratio',
       'Spindle_DNA_Volume_Ratio', 'Spindle_Length_um', 'Spindle_Volume_um3',
       'Spindle_Width_Avg_um', 'Tubulin_Spindle_Average_Intensity', 'Version',
       'Cell_Volume_um3', 'SurfaceArea', 'Sphericity', 'Tubulin_Cell_Average',
       'Tubulin_Cell_Minimum', 'Tubulin

In [52]:
matches_CC_df = pd.merge(sub_df, matches_df, how = "outer")
print(matches_CC_df.shape)
matches_CC_df.head()

MergeError: Data columns not unique: Index(['Time', 'LowZoom_ID', 'Experiment_Time_mins', 'Experiment', 'Dataset',
       'Dataset', 'Timepoint', 'Position', 'Track_ID', 'Source_ID',
       'Splitting_event', 'Lineage', 'Track_Coordinate_X',
       'Track_Coordinate_Y', 'Frame', 'Dataset', 'Dataset', 'Generation',
       'Mother_ID', 'Grandmother_ID', 'Sister_ID', 'Seen_sister', 'Aunt_ID',
       'Cousin_ID', 'Random_ID', 'Seen_granny'],
      dtype='object')

In [None]:
matches_CC_df.to_csv(root + "MasterDataFrame_MatchesLineagesWithCellCycles.csv")
print("Saved the BIG FINAL DATAFRAME.")

In [None]:
lineages_df.to_csv(root + "MasterDataFrame_Lineages.csv")
print("Finished compilation of concatenated lineages dataframe: {}".format(root + "MasterDataFrame_Lineages.csv"))