<h2> Save measures that characterise distribution of cell features for each cell type </h2>

<h4> Input: </h4> csv files (one per sample and cell type) with cell features for all cells.
<h4> Output: </h4> xlsx file with median, mean, std of cell features per cell type

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import ipyparallel as ipp

Define which group of measurements you want to analyse: `controls` or `patients`.

In [2]:
# Group of measurements to analyse: "controls" or "patients".
Group = "patients"
# NOTE(review): machine-specific absolute path - adjust when running elsewhere.
Folder = Path(r"D:\Other Backup\Myh9 project\raw data csv")
Data_folder = Folder / Group
# Keep only CSV files (the folder may also hold sub-folders or other files that
# pd.read_csv cannot parse) and sort them so the summary rows come out in a
# reproducible order; os.listdir returns entries in arbitrary order.
fnames = sorted(
    entry.name
    for entry in Data_folder.iterdir()
    if entry.is_file() and entry.suffix.lower() == ".csv"
)

In [3]:
def take_info_from_data_frames(Folder, fname):
    """Summarise the cell features stored in one per-cell-type CSV file.

    The sample name is the file name up to its first "." and the cell type is
    the last "_"-separated token of the sample name. For every feature (Area,
    Deformation, Inertia ratio, Porosity) the median, mean and standard
    deviation of the column "<feature> <cell type>" are computed. For
    Neutrophils and Lymphocytes a second row, labelled
    "low porosity <cell type>", repeats the statistics for the cells whose
    porosity is below 1.05.

    Parameters
    ----------
    Folder : pathlib.Path
        Directory containing ``fname`` (must support ``Folder / fname``).
    fname : str
        Name of the CSV file, e.g. "20240212_I2_1st_Neutrophils.csv".

    Returns
    -------
    pandas.DataFrame
        One row per (sub)population, indexed by its label, with the columns
        "file name", "cell type", "cell count" and "<feature> <statistic>".

    Raises
    ------
    KeyError
        If an expected "<feature> <cell type>" column is missing from the CSV.
    """
    # Imports are local so the function is self-contained when it is sent to
    # ipyparallel engines, which do not share the notebook namespace. Only the
    # modules that are actually used are imported, so a worker does not fail
    # on a package this function never needs.
    import warnings

    import pandas as pd

    # Silence pandas' "DataFrame is highly fragmented" PerformanceWarning in
    # the worker process (kept from the original implementation).
    warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

    features = ("Area", "Deformation", "Inertia ratio", "Porosity")
    statistics = ("median", "mean", "std")
    low_porosity_cell_types = ("Neutrophils", "Lymphocytes")
    porosity_threshold = 1.05

    sample = fname.split(".")[0]
    cell_type = sample.split("_")[-1]

    def summarise(cells, label, column_cell_type):
        """Return a one-row summary of ``cells`` indexed by ``label``.

        ``column_cell_type`` is the cell type used in the CSV column names,
        which stays the plain cell type even for the "low porosity" subset.
        """
        record = {
            "file name": sample,
            "cell type": label,
            # Number of non-null entries in the second CSV column. `.iloc` is
            # required because `count()` is labelled by column name; a plain
            # `[1]` relied on the deprecated positional fallback.
            "cell count": cells.count().iloc[1],
        }
        for feature in features:
            values = cells[feature + " " + column_cell_type]
            for statistic in statistics:
                record[feature + " " + statistic] = getattr(values, statistic)()
        # Build the row in one go instead of inserting columns one by one.
        return pd.DataFrame(record, index=[label])

    # read data and summarize results
    print(fname)
    cells = pd.read_csv(Folder / fname)
    rows = [summarise(cells, cell_type, cell_type)]

    if cell_type in low_porosity_cell_types:
        low_porosity_cells = cells[cells["Porosity " + cell_type] < porosity_threshold]
        rows.append(
            summarise(low_porosity_cells, "low porosity " + cell_type, cell_type)
        )

    return pd.concat(rows)


In [4]:
# Echo the data folder, then make it the working directory.
# NOTE(review): take_info_from_data_frames receives the folder explicitly, so
# this chdir looks unnecessary - confirm before removing.
print(f"{Data_folder}")
os.chdir(Data_folder)

D:\Other Backup\Myh9 project\raw data csv\patients


In [5]:
# Summarise every file in parallel on three local ipyparallel engines.
N = len(fnames)
with ipp.Cluster(n=3) as rc:
    view = rc.load_balanced_view()
    # One task per file; every task receives the same data folder.
    asyncresult = view.map_async(take_info_from_data_frames, [Data_folder] * N,
                                 fnames)
    asyncresult.wait_interactive()
    result_mp = asyncresult.get()

# Stack all per-file summaries in a single call instead of growing the frame
# inside a loop, which copies the accumulated data on every iteration.
df_mp = pd.concat(result_mp)

Starting 3 engines with <class 'ipyparallel.cluster.launcher.LocalEngineSetLauncher'>


  0%|          | 0/3 [00:00<?, ?engine/s]

take_info_from_data_frames:   0%|          | 0/90 [00:00<?, ?tasks/s]

Stopping engine(s): 1749107425
engine set stopped 1749107425: {'engines': {'0': {'exit_code': 1, 'pid': 14912, 'identifier': '0'}, '1': {'exit_code': 1, 'pid': 21384, 'identifier': '1'}, '2': {'exit_code': 1, 'pid': 9148, 'identifier': '2'}}, 'exit_code': 1}
Stopping controller
Controller stopped: {'exit_code': 1, 'pid': 7764, 'identifier': 'ipcontroller-1749107423-w8s6-5040'}


In [6]:
# Preview the first five summary rows.
df_mp.head(5)

Unnamed: 0,file name,cell type,cell count,Area median,Area mean,Area std,Deformation median,Deformation mean,Deformation std,Inertia ratio median,Inertia ratio mean,Inertia ratio std,Porosity median,Porosity mean,Porosity std
Erythrocytes,20240212_I2_1st_Erythrocytes,Erythrocytes,1299664,32.580541,32.998393,3.842383,0.333964,0.333185,0.024456,3.781276,3.796237,0.345796,1.082746,1.085155,0.020066
Lymphocytes,20240212_I2_1st_Lymphocytes,Lymphocytes,589,39.075001,39.138126,4.41157,0.082155,0.083441,0.015651,1.348592,1.351655,0.121083,1.031669,1.031782,0.007383
low porosity Lymphocytes,20240212_I2_1st_Lymphocytes,low porosity Lymphocytes,579,38.912639,39.088274,4.393319,0.082071,0.082847,0.014972,1.347337,1.351276,0.120696,1.031484,1.031413,0.006884
Neutrophils,20240212_I2_1st_Neutrophils,Neutrophils,534,64.322214,64.656768,5.418417,0.117821,0.128444,0.033611,1.70031,1.717356,0.145404,1.032245,1.037894,0.017746
low porosity Neutrophils,20240212_I2_1st_Neutrophils,low porosity Neutrophils,447,63.80807,64.088963,5.212401,0.114329,0.116395,0.016768,1.681367,1.694207,0.120854,1.030859,1.031311,0.006436


In [7]:
# Save the summary as an Excel workbook inside the "summary statistics" folder.
file_name = Group + "_summary_statistics.xlsx"
# to_excel does not create missing directories, so make sure the target exists.
(Folder / "summary statistics").mkdir(parents=True, exist_ok=True)
df_mp.to_excel(Folder / "summary statistics" / file_name)

In [8]:
# Save the same summary table as CSV.
# NOTE(review): this file is written directly into Folder, whereas the xlsx
# goes into the "summary statistics" sub-folder - confirm this is intended.
file_name = f"{Group}_summary_statistics.csv"
df_mp.to_csv(Folder / file_name)