# Stratify plate 3 single-cell data and fit per-feature linear models (DMSO-treated failing vs. healthy cells)

In [1]:
import pathlib
import pandas as pd

from sklearn.linear_model import LinearRegression

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features


In [2]:
# Plate identifier and file naming scheme of the single-cell profiles
plate = "localhost230405150001"  # Focusing on plate 3
file_suffix = "_sc_feature_selected.parquet"

# Input: feature-selected single-cell profiles for the plate
data_dir = pathlib.Path("../../../3.process_cfret_features/data/single_cell_profiles")
data_df = pd.read_parquet(data_dir / f"{plate}{file_suffix}")

# Output: per-feature linear model results (tab-separated)
output_dir = pathlib.Path("results")
output_cp_file = output_dir / f"{plate}_linear_model_DMSO_failing_healthy.tsv"

print(data_df.shape)
data_df.head()


(26992, 730)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,Metadata_Image_Count_Cells,Metadata_ImageNumber,...,Nuclei_Texture_InverseDifferenceMoment_Actin_3_02_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Hoechst_3_03_256,Nuclei_Texture_SumVariance_Mitochondria_3_01_256,Nuclei_Texture_SumVariance_PM_3_01_256
0,B,2,9,failing,rejected,DMSO,716.170091,177.132195,40,1.0,...,-0.069668,-0.411109,0.182782,-0.199326,0.44762,0.350265,-0.217487,-0.380608,2.405688,-0.190068
1,B,2,9,failing,rejected,DMSO,528.646623,196.955552,40,1.0,...,0.616922,-0.400131,0.260481,0.497841,0.714237,0.41196,-0.167521,-0.367868,-0.152203,-0.217675
2,B,2,9,failing,rejected,DMSO,341.521655,269.502036,40,1.0,...,0.136775,0.246017,1.150965,0.939213,0.77791,1.095054,0.48207,-0.37017,-0.263567,-0.278683
3,B,2,9,failing,rejected,DMSO,135.14192,323.069081,40,1.0,...,-0.01473,-0.365864,0.511223,-0.055412,-0.141132,-0.191688,-0.192098,0.026329,-0.195421,-0.095792
4,B,2,9,failing,rejected,DMSO,826.262206,346.991787,40,1.0,...,0.023438,-0.37484,0.711451,0.631388,1.373111,1.038347,-0.28294,-0.396422,-0.238851,-0.343199


## Stratify data

In [3]:
# Keep only cells with the selected treatment (DMSO) and the selected
# cell types (failing and healthy)
specific_type = ["DMSO"]
specific_cell_types = ["failing", "healthy"]

is_selected_treatment = data_df["Metadata_treatment"].isin(specific_type)
is_selected_cell_type = data_df["Metadata_cell_type"].isin(specific_cell_types)
filtered_df = data_df[is_selected_treatment & is_selected_cell_type]

# Drop NA columns with pycytominer (na_cutoff=0)
cp_df = feature_select(filtered_df, operation="drop_na_columns", na_cutoff=0)

# Count the number of cells per well and attach it to every row as metadata.
# The count is placed first so the merged frame starts with
# Metadata_Well and Metadata_cell_count_per_well.
cell_count_df = (
    cp_df.groupby("Metadata_Well")["Metadata_treatment"]
    .count()
    .rename("Metadata_cell_count_per_well")
    .reset_index()
)
cp_df = cell_count_df.merge(cp_df, on=["Metadata_Well"])

# Define CellProfiler features (the response variables tested downstream)
cp_features = infer_cp_features(cp_df)

print(f"We are testing {len(cp_features)} CellProfiler features")
print(cp_df.shape)
cp_df.head()


We are testing 638 CellProfiler features
(6567, 656)


Unnamed: 0,Metadata_Well,Metadata_cell_count_per_well,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_Actin_3_02_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Hoechst_3_03_256,Nuclei_Texture_SumVariance_Mitochondria_3_01_256,Nuclei_Texture_SumVariance_PM_3_01_256
0,B03,583,B,3,9,failing,rejected,drug_x,667.439289,81.357994,...,0.495378,0.380264,0.350778,-0.706335,0.079535,-0.035623,-0.286867,-0.350705,-0.11194,-0.220838
1,B03,583,B,3,9,failing,rejected,drug_x,316.323671,138.76108,...,-0.25217,1.393157,0.942896,0.042958,1.429597,1.277577,-0.469904,-0.407625,-0.212999,-0.424042
2,B03,583,B,3,9,failing,rejected,drug_x,796.348792,202.364473,...,0.510913,1.224876,0.949496,0.816062,1.320878,0.597129,-0.45671,-0.396778,-0.260921,-0.323157
3,B03,583,B,3,9,failing,rejected,drug_x,988.128824,204.343529,...,1.314197,1.125805,1.496068,1.466107,1.075521,0.296065,-0.456253,-0.397567,-0.274179,-0.299051
4,B03,583,B,3,9,failing,rejected,drug_x,456.818771,257.604964,...,0.217932,0.793597,0.708867,0.687215,0.680026,0.803888,-0.374172,-0.409765,-0.253457,-0.376503


## Fit linear model

In [5]:
# Linear modeling framework for plate 3: the covariates are the number of
# cells per well and the cell type
variables = ["Metadata_cell_count_per_well", "Metadata_cell_type"]
X = cp_df[variables]

print(X.shape)
X.head()


(6567, 2)


Unnamed: 0,Metadata_cell_count_per_well,Metadata_cell_type
0,583,failing
1,583,failing
2,583,failing
3,583,failing
4,583,failing


In [6]:
# Set the covariates and the cell types used for the LM
variables = ["Metadata_cell_count_per_well", "Metadata_cell_type"]
treatments_to_select = ["failing", "healthy"]

# Select the covariate columns for rows with the requested cell types.
# This is built from cp_df rather than from the previous value of X so the
# cell stays re-runnable: X is overwritten below without the
# "Metadata_cell_type" column, which made a second run fail with a KeyError.
selected_rows = cp_df.loc[
    cp_df["Metadata_cell_type"].isin(treatments_to_select), variables
]

# Create one indicator (dummy) column per cell type.
# dtype=int pins the indicators to 0/1 integers instead of relying on the
# pandas default dummy dtype, which has differed between pandas versions.
dummies = pd.get_dummies(selected_rows["Metadata_cell_type"], dtype=int)

# Replace the original categorical column with its indicator columns
# (no in-place mutation, so the result only depends on this cell's inputs)
X = pd.concat(
    [selected_rows.drop(columns="Metadata_cell_type"), dummies],
    axis=1,
)

print(X.shape)
X.head()


(6567, 3)


Unnamed: 0,Metadata_cell_count_per_well,failing,healthy
0,583,1,0
1,583,1,0
2,583,1,0
3,583,1,0
4,583,1,0


In [7]:
# Fit one linear model per CellProfiler feature (univariate response) on the
# covariate matrix X, then store R^2 and the coefficients in a TSV file.

# The coefficient columns of the results table are labelled by position,
# so make sure X has the expected covariates in the expected order.
expected_covariates = ["Metadata_cell_count_per_well", "failing", "healthy"]
assert list(X.columns) == expected_covariates, (
    f"Unexpected covariate columns in X: {list(X.columns)}"
)

# Boolean mask for rows with the specified cell types.
# It does not depend on the feature, so it is computed once outside the loop.
mask = cp_df["Metadata_cell_type"].isin(treatments_to_select)

lm_records = []
for cp_feature in cp_features:
    # Subset CP data to the individual feature (the response variable)
    cp_subset_df = cp_df.loc[mask, cp_feature]

    # Fit linear model
    lm = LinearRegression()
    lm_result = lm.fit(X=X, y=cp_subset_df)

    # Extract beta coefficients (contribution of each X covariate to the
    # feature). NOTE: both cell type indicators are included in X, so they are
    # redundant with each other; in the recorded results the failing and
    # healthy coefficients are equal in magnitude with opposite signs and
    # should be interpreted as a single contrast.
    coef = lm_result.coef_

    # Estimate fit (R^2)
    r2_score = lm.score(X=X, y=cp_subset_df)

    # Add results to a growing list of records
    lm_records.append([cp_feature, r2_score] + list(coef))

# Convert results to a pandas DataFrame
lm_results = pd.DataFrame(
    lm_records,
    columns=["feature", "r2_score", "cell_count_coef", "failing_coef", "healthy_coef"],
)

# Output file (create the results directory first so the write cannot fail
# on a fresh checkout where it does not exist yet)
output_cp_file.parent.mkdir(parents=True, exist_ok=True)
lm_results.to_csv(output_cp_file, sep="\t", index=False)

print(lm_results.shape)
lm_results.head()


(638, 5)


Unnamed: 0,feature,r2_score,cell_count_coef,failing_coef,healthy_coef
0,Cytoplasm_AreaShape_BoundingBoxArea,0.048118,-0.001164,0.253471,-0.253471
1,Cytoplasm_AreaShape_Compactness,0.054475,-0.000464,0.366596,-0.366596
2,Cytoplasm_AreaShape_Eccentricity,0.006924,-0.000342,0.122436,-0.122436
3,Cytoplasm_AreaShape_FormFactor,0.068582,0.000284,-0.312358,0.312358
4,Cytoplasm_AreaShape_MajorAxisLength,0.047932,-0.001129,0.271346,-0.271346
