In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

# Root folder of the segmentation output. It is searched for 'Well*' folders,
# each of which holds per-timepoint .csv files in an 'ar' subfolder.
datadir = Path('/mnt/med-groups-lmu/ls1/users/m/mushtaq/cellIQ 30.06.2020 20ulseeding a2WT a3TRI b2Afa b3Magi c2Occ c3LSR/output')
#datadir = Path('/home/hajaalin/tmp/toiba/output')

# create folder for debugging output
debugdir = datadir / 'debug'
debugdir.mkdir(exist_ok=True)

wells = sorted(datadir.glob('Well*'))

# images from same well within this timedelta are considered to belong to the same timepoint
max_cycle_timedelta_within_well = np.timedelta64(2, 'm')

# Per-file frames are collected here and concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0, and it copied the whole
# accumulated frame on every call (quadratic in the number of files).
frames = []

for w in wells:
    print(w)
    files = sorted((w / "ar").glob("*.csv"))

    # Cycle is an extra 1-based index over the files of this well in sorted
    # filename order (can be useful unless imaging is stopped and restarted).
    for cycle, f in enumerate(files, start=1):
        tmp = pd.read_csv(f)

        # add a column that shows which well the numbers are from;
        # the folder name is split into a well part and a trailing '_<1-2 digits>' site part
        tmp["Well_Site"] = w.stem
        tmp[["Well","Site"]] = tmp["Well_Site"].str.extract('(?P<Well>.*)(?P<Site>_[0-9]{1,2}$)', expand=True)

        # add a column that shows the timepoint: the first five '_'-separated
        # fields of the file name are parsed as year, month, day, hour, minute
        timestr = f.stem.replace("_segmentation_results","")
        timestr = "_".join(timestr.split('_')[:5])
        tmp['Timepoint'] = pd.to_datetime(timestr, format='%Y_%m_%d_%H_%M')

        # add column for cycle
        tmp['Cycle'] = cycle

        frames.append(tmp)

# this will contain all data; stays an empty frame when no .csv files were found
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print('Done.')



In [None]:
# add a column that shows the time elapsed since the earliest timepoint in the data
start_time = df['Timepoint'].min()
df['T'] = df['Timepoint'] - start_time

# measurement columns that the later cells of this notebook do not use
unused_columns = [
    'Area', 'X', 'Y', 'XM', 'YM',
    'Major', 'Minor', 'Angle',
    'Circ.', 'Round', 'Solidity',
]
df = df.drop(unused_columns, axis='columns')

df.head()

In [None]:
# Dump the combined measurements, as they are before any timestamp fixes,
# into the debug folder as a single .csv (row index not written).
before_fixes_csv = debugdir / 'all_measurements_in_one_file_before_fixes.csv'
df.to_csv(before_fixes_csv, index=False)


In [None]:
# Calculate mean AR for each combination of well, cycle and timepoint.
# The aim is to find cases where one cycle contains multiple timepoints.
# groupby sorts its keys by default, so within a (Well, Cycle) pair the rows
# are ordered by increasing T.
wct = df[['Well', 'Cycle', 'T', 'AR']].groupby(['Well', 'Cycle', 'T']).mean().reset_index()

# save the data for checking in excel
wct.to_csv(debugdir / 'wct.csv')

# Find cases where there are multiple entries for the same cycle,
# i.e. when all fields were not imaged within the same minute.
#
# How DataFrame.duplicated marks rows, depending on `keep`:
#
#             keep
# value False first last    False^last
# A       0     0     0          0
# B       1     0     1          0
# B       1     1     1          0
# B       1     1     0          1
# C       0     0     0          0
#
# keep=False flags every row of a duplicated group and keep='last' flags all
# but the last one, so their XOR flags exactly the last row of each group.
duplicated_all = wct.duplicated(subset=['Well','Cycle'], keep=False)
duplicated_keep_last = wct.duplicated(subset=['Well','Cycle'], keep='last')
duplicated_last = duplicated_all ^ duplicated_keep_last

wct[duplicated_all].to_csv(debugdir /  'duplicated_all.csv')
wct[duplicated_last].to_csv(debugdir / 'duplicated_last.csv')

# consider that the last of the duplicated lines has the correct timestamp
fix = wct[duplicated_last]

for w, c, t in zip(fix['Well'], fix['Cycle'], fix['T']):
    # all raw measurements belonging to this well/cycle combination
    in_well_cycle = (df['Well'] == w) & (df['Cycle'] == c)
    well_cycle = df[in_well_cycle]
    needs_review = False

    # check how many timepoints there are for this cycle, alert if > 2
    if well_cycle['Timepoint'].unique().size > 2:
        print('WARNING: more than 2 timepoints per cycle: ' + str(w) + ', cycle ' + str(c))
        needs_review = True

    # check how far apart the timepoints are, alert if > limit
    well_cycle_duration = well_cycle['Timepoint'].max() - well_cycle['Timepoint'].min()
    if well_cycle_duration > max_cycle_timedelta_within_well:
        print('WARNING: too long well cycle time: ' + str(w) + ', cycle ' + str(c))
        needs_review = True

    # dump the suspicious well/cycle once for manual inspection
    if needs_review:
        well_cycle.to_csv(debugdir / (str(w) + '_cycle' + str(c) + '.csv'))

    # set the same T to all entries in this well/cycle combination.
    # .loc is required here: .at only addresses a single scalar cell and must
    # not be given a whole set of row labels.
    df.loc[in_well_cycle, 'T'] = t


In [None]:
# Save the combined measurements, with any timestamp fixes applied,
# as a single .csv in the data folder (row index not written).
after_fixes_csv = datadir / 'all_measurements_in_one_file_after_fixes.csv'
df.to_csv(after_fixes_csv, index=False)


In [None]:
# Build a table of mean AR with one row per elapsed time T and one column per well.
mean_ar = df.pivot_table(index='T', columns='Well', values='AR', aggfunc='mean')

# The pivot leaves T as the index of the dataframe;
# reset_index() turns it back into a normal column for export and display.
mean_ar_with_t = mean_ar.reset_index()

# write mean AR data in one .csv
mean_ar_with_t.to_csv(datadir / 'mean_ar_all_in_one_file.csv', index=False)

mean_ar_with_t.head()

In [None]:
import matplotlib.pyplot as plt

# one shared figure/axes for all wells
fig, ax = plt.subplots(figsize=(25, 15))

# draw one line per well (each column of mean_ar is a well)
for c in mean_ar.columns:
    # The NaN values in mean_ar cause problems for plotting.
    # Workaround: take this well alone as a one-column dataframe
    # and drop the rows where it has no value before plotting.
    tmp = mean_ar[[c]].dropna()
    tmp.plot(ax=ax)

# y-axis range for figure
MEAN_AR_MIN = 1
MEAN_AR_MAX = 2
ax.set_ylim(MEAN_AR_MIN, MEAN_AR_MAX)

# set y axis label
ax.set_ylabel('Mean AR')

fig.savefig(datadir / 'mean_ar.png')

#tmp.head()