This file generates a csv and markdown table for the features of the dataset.

In [1]:
import pathlib

import pandas as pd
import toml

In [2]:
# set the path to the data
path = pathlib.Path(
    "../../../1.Exploratory_Data_Analysis/results/PBMC_combined.parquet"
)

# load in the data
df = pd.read_parquet(path)

In [3]:
# path to the ground truth
ground_truth_path = pathlib.Path(
    "../../../4.sc_Morphology_Neural_Network_MLP_Model/MLP_utils/ground_truth.toml"
)
# load in the ground truth
ground_truth = toml.load(ground_truth_path)
apoptosis_ground_truth = ground_truth["Apoptosis"]["apoptosis_groups_list"]
pyroptosis_ground_truth = ground_truth["Pyroptosis"]["pyroptosis_groups_list"]
control_ground_truth = ground_truth["Healthy"]["healthy_groups_list"]

In [4]:
# change the p-adj into absolute values
df["p-adj"] = df["p-adj"].abs()
df.head()
# select row that have p-adj < 0.05
df = df[df["p-adj"] < 0.05]

In [5]:
# add the group1 and group2 columns into 1 column
df["group"] = df["group1"] + "_" + df["group2"]

In [6]:
# set theory for apoptosis control and pyroptosis
apoptosis_vs_healthy = df[df["group"] == "apoptosis_healthy"]
pyroptosis_vs_healthy = df[df["group"] == "healthy_pyroptosis"]
pyroptosis_vs_apoptosis = df[df["group"] == "apoptosis_pyroptosis"]

# get thee list of genes that are significant from each comparision
# define the sets
A = set(apoptosis_vs_healthy["features"].tolist())  # apoptosis_vs_healthy_list
B = set(pyroptosis_vs_apoptosis["features"].tolist())  # pyroptosis_vs_apoptosis_list
C = set(pyroptosis_vs_healthy["features"].tolist())  # pyroptosis_vs_healthy_list

In [7]:
# get the the intersections and union of the genes
U = set(df["features"].tolist())

# get the union of the genes
# Apoptosis vs control
A_int_b_un = U.difference(B.union(C))
# Pyroptosis vs apoptosis
B_int_c_un = U.difference(A.union(C))
# Pyroptosis vs control
C_int_a_un = U.difference(A.union(B))

# get the inersection of each of the groups


print(len(A_int_b_un), len(B_int_c_un), len(C_int_a_un))

367 98 67


In [8]:
# get the features that are in A_int_b_un
A_int_b_un_df = df[df["features"].isin(A_int_b_un)]
B_int_c_un_df = df[df["features"].isin(B_int_c_un)]
C_int_a_un_df = df[df["features"].isin(C_int_a_un)]

print(A_int_b_un_df.shape, B_int_c_un_df.shape, C_int_a_un_df.shape)
# concat all the dataframes
all_selected_features_df = pd.concat([A_int_b_un_df, B_int_c_un_df, C_int_a_un_df])
all_selected_features_df.head()

(367, 11) (98, 11) (67, 11)


Unnamed: 0,group1,group2,meandiff,lower,upper,p-adj,reject,features,p-adj_abs,pos_neg,group
0,apoptosis,healthy,-0.0016,0.9488,-0.0201,0.0169,False,Cells_RadialDistribution_ZernikeMagnitude_Corr...,0.0169,positive,apoptosis_healthy
0,apoptosis,healthy,-0.019,0.0066,-0.0414,0.0034,False,Cells_Intensity_MassDisplacement_CorrMito,0.0034,positive,apoptosis_healthy
0,apoptosis,healthy,-0.0286,0.0,-0.0456,0.0116,True,Cytoplasm_Texture_DifferenceVariance_CorrER_3_...,0.0116,negative,apoptosis_healthy
0,apoptosis,healthy,-0.0209,0.0,-0.0379,0.004,True,Nuclei_Texture_AngularSecondMoment_CorrER_3_03...,0.004,negative,apoptosis_healthy
0,apoptosis,healthy,-0.0063,0.0631,-0.0164,0.0037,False,Cytoplasm_Intensity_MinIntensity_CorrMito,0.0037,positive,apoptosis_healthy


In [9]:
# drop columns from the df
all_selected_features_df = all_selected_features_df.drop(
    columns=[
        "group1",
        "group2",
        "meandiff",
        "lower",
        "upper",
        "reject",
        "p-adj",
        "pos_neg",
    ]
)
all_selected_features_df.rename(columns={"p-adj_abs": "p.adj.value"}, inplace=True)
print(all_selected_features_df.shape)
all_selected_features_df.head()

(532, 3)


Unnamed: 0,features,p.adj.value,group
0,Cells_RadialDistribution_ZernikeMagnitude_Corr...,0.0169,apoptosis_healthy
0,Cells_Intensity_MassDisplacement_CorrMito,0.0034,apoptosis_healthy
0,Cytoplasm_Texture_DifferenceVariance_CorrER_3_...,0.0116,apoptosis_healthy
0,Nuclei_Texture_AngularSecondMoment_CorrER_3_03...,0.004,apoptosis_healthy
0,Cytoplasm_Intensity_MinIntensity_CorrMito,0.0037,apoptosis_healthy


In [10]:
# remove rows from df that are in all_selected_features_df
print(df.shape)
df = df[~df["features"].isin(all_selected_features_df["features"].tolist())]
df.loc[:, "Distinct"] = False
df.rename(columns={"p-adj_abs": "p.adj.value"}, inplace=True)

# remove the columns that are not needed
df = df.drop(
    columns=[
        "group1",
        "group2",
        "meandiff",
        "lower",
        "upper",
        "reject",
        "p-adj",
        "pos_neg",
    ]
)
print(df.shape)
df.head()

(1924, 11)
(1392, 4)


Unnamed: 0,features,p.adj.value,group,Distinct
0,Nuclei_RadialDistribution_ZernikeMagnitude_Cor...,0.0323,apoptosis_healthy,False
1,Nuclei_RadialDistribution_ZernikeMagnitude_Cor...,0.0199,apoptosis_pyroptosis,False
0,Cytoplasm_Texture_InfoMeas1_CorrPM_3_03_256,0.0137,apoptosis_healthy,False
1,Cytoplasm_Texture_InfoMeas1_CorrPM_3_03_256,0.0024,apoptosis_pyroptosis,False
2,Cytoplasm_Texture_InfoMeas1_CorrPM_3_03_256,0.0083,healthy_pyroptosis,False


In [11]:
all_selected_features_df.loc[:, "Distinct"] = True
all_selected_features_df.head()

Unnamed: 0,features,p.adj.value,group,Distinct
0,Cells_RadialDistribution_ZernikeMagnitude_Corr...,0.0169,apoptosis_healthy,True
0,Cells_Intensity_MassDisplacement_CorrMito,0.0034,apoptosis_healthy,True
0,Cytoplasm_Texture_DifferenceVariance_CorrER_3_...,0.0116,apoptosis_healthy,True
0,Nuclei_Texture_AngularSecondMoment_CorrER_3_03...,0.004,apoptosis_healthy,True
0,Cytoplasm_Intensity_MinIntensity_CorrMito,0.0037,apoptosis_healthy,True


In [12]:
# concat all_selected_features_df and df
final_df = pd.concat([all_selected_features_df, df])
final_df.reset_index(drop=True, inplace=True)
print(final_df.shape)
final_df.head()

(1924, 4)


Unnamed: 0,features,p.adj.value,group,Distinct
0,Cells_RadialDistribution_ZernikeMagnitude_Corr...,0.0169,apoptosis_healthy,True
1,Cells_Intensity_MassDisplacement_CorrMito,0.0034,apoptosis_healthy,True
2,Cytoplasm_Texture_DifferenceVariance_CorrER_3_...,0.0116,apoptosis_healthy,True
3,Nuclei_Texture_AngularSecondMoment_CorrER_3_03...,0.004,apoptosis_healthy,True
4,Cytoplasm_Intensity_MinIntensity_CorrMito,0.0037,apoptosis_healthy,True


In [13]:
# set the output file path
output_file_path = pathlib.Path("../results/")
output_file_path.mkdir(exist_ok=True, parents=True)

In [14]:
# print the table to a csv
final_df.to_csv("../results/all_features.csv", index=False)

In [15]:
final_df.to_markdown(
    "../results/all_features.md",
    index=False,
    tablefmt="github",
    floatfmt=".2f",
    showindex=False,
    numalign="center",
    stralign="center",
)

  final_df.to_markdown("../results/all_features.md", index=False, tablefmt="github", floatfmt=".2f", showindex=False, numalign="center", stralign="center")
