# Stratify plate 4 single cells (failing vs. healthy, no treatment) and fit a linear model per CellProfiler feature

In [1]:
import pathlib
import pandas as pd

from sklearn.linear_model import LinearRegression

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features


In [2]:
# Define inputs and outputs
plate = "localhost231120090001"  # Focusing on plate 4
file_suffix = "_sc_feature_selected.parquet"

data_dir = pathlib.Path("../../../3.process_cfret_features/data/single_cell_profiles")

# Load the feature-selected single-cell profiles for this plate
data_df = pd.read_parquet(pathlib.Path(data_dir, f"{plate}{file_suffix}"))

output_dir = pathlib.Path("results")
output_cp_file = pathlib.Path(output_dir, f"{plate}_linear_model_failing_healthy_no_treatment.tsv")

# Replace NA values with "None".
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the chained in-place form is deprecated and does not update
# data_df when pandas Copy-on-Write is active, which would leave the NaNs in place.
data_df["Metadata_treatment"] = data_df["Metadata_treatment"].fillna("None")

# Add cell count per well as a column (computed before any filtering)
data_df["Metadata_Cell_Count"] = data_df.groupby("Metadata_Well")["Metadata_Well"].transform("count")

print(data_df.shape)
data_df.head()


(21370, 712)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,Metadata_Image_Count_Cells,Metadata_ImageNumber,...,Nuclei_Texture_InverseDifferenceMoment_Hoechst_3_02_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Hoechst_3_03_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_03_256,Metadata_Cell_Count
0,B,2,2,Healthy,,,520.849209,277.58342,20,1.0,...,1.335463,0.669517,0.575565,1.525251,1.399406,-0.365606,-0.348466,-0.248804,-0.433848,351
1,B,2,2,Healthy,,,313.66111,374.449986,20,1.0,...,0.812481,0.872447,1.10794,1.489412,1.584068,-0.344077,-0.336442,-0.249026,-0.427261,351
2,B,2,2,Healthy,,,709.496383,379.652932,20,1.0,...,2.191081,1.274566,1.314952,1.501501,1.441772,-0.373983,-0.378128,-0.253587,-0.432056,351
3,B,2,2,Healthy,,,869.851378,511.154606,20,1.0,...,0.481567,1.164635,1.077324,0.951044,0.636573,-0.322589,-0.336657,-0.243788,-0.360678,351
4,B,2,2,Healthy,,,588.967372,503.936707,20,1.0,...,0.476893,1.064612,1.041011,1.255353,1.040231,-0.37562,-0.310051,-0.251854,-0.412598,351


## Stratify data

In [3]:
# Keep only rows whose treatment is "None" and whose cell type is Failing or Healthy
specific_type = ["None"]
specific_cell_types = ["Failing", "Healthy"]

is_selected_treatment = data_df["Metadata_treatment"].isin(specific_type)
is_selected_cell_type = data_df["Metadata_cell_type"].isin(specific_cell_types)
filtered_df = data_df[is_selected_treatment & is_selected_cell_type]

# Apply pycytominer's "drop_na_columns" operation with a cutoff of 0
cp_df = feature_select(filtered_df, operation="drop_na_columns", na_cutoff=0)

# Tally the cells in each well (after filtering) and attach the tally as metadata.
# The count table is the left side of the merge, so its columns come first.
cell_count_df = (
    cp_df.groupby("Metadata_Well")["Metadata_treatment"]
    .count()
    .rename("Metadata_cell_count_per_well")
    .reset_index()
)
cp_df = pd.merge(cell_count_df, cp_df, on=["Metadata_Well"])

# Collect the CellProfiler feature columns that will each be modeled
cp_features = infer_cp_features(cp_df)

print(f"We are testing {len(cp_features)} CellProfiler features")
print(cp_df.shape)
cp_df.head()


We are testing 661 CellProfiler features
(19976, 680)


Unnamed: 0,Metadata_Well,Metadata_cell_count_per_well,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_Hoechst_3_02_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Hoechst_3_03_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_03_256,Metadata_Cell_Count
0,B02,351,B,2,2,Healthy,,,520.849209,277.58342,...,1.335463,0.669517,0.575565,1.525251,1.399406,-0.365606,-0.348466,-0.248804,-0.433848,351
1,B02,351,B,2,2,Healthy,,,313.66111,374.449986,...,0.812481,0.872447,1.10794,1.489412,1.584068,-0.344077,-0.336442,-0.249026,-0.427261,351
2,B02,351,B,2,2,Healthy,,,709.496383,379.652932,...,2.191081,1.274566,1.314952,1.501501,1.441772,-0.373983,-0.378128,-0.253587,-0.432056,351
3,B02,351,B,2,2,Healthy,,,869.851378,511.154606,...,0.481567,1.164635,1.077324,0.951044,0.636573,-0.322589,-0.336657,-0.243788,-0.360678,351
4,B02,351,B,2,2,Healthy,,,588.967372,503.936707,...,0.476893,1.064612,1.041011,1.255353,1.040231,-0.37562,-0.310051,-0.251854,-0.412598,351


## Fit linear model

In [4]:
# Covariates for the linear model: for plate 4 these are the per-well
# cell count and the cell type
variables = ["Metadata_cell_count_per_well", "Metadata_cell_type"]
X = cp_df[variables]

print(X.shape)
X.head()


(19976, 2)


Unnamed: 0,Metadata_cell_count_per_well,Metadata_cell_type
0,351,Healthy
1,351,Healthy
2,351,Healthy
3,351,Healthy
4,351,Healthy


In [5]:
# Set the variables and cell types used for LM
variables = ["Metadata_cell_count_per_well", "Metadata_cell_type"]
treatments_to_select = ["Failing", "Healthy"]

# Select rows with the requested cell types.
# Build from cp_df rather than from the previous X so that this cell does not
# consume its own output: it can be re-run without a KeyError on the
# already-dropped "Metadata_cell_type" column.
selected_rows = cp_df.loc[cp_df["Metadata_cell_type"].isin(treatments_to_select), variables]

# Create one 0/1 indicator column per cell type (dtype pinned so the encoding
# is numeric regardless of the pandas default)
dummies = pd.get_dummies(selected_rows["Metadata_cell_type"], dtype=int)

# NOTE(review): both indicator columns are kept and they always sum to 1, so
# they are collinear with an intercept term; the fitted failing/healthy
# coefficients come out as mirror images of each other. Confirm this is the
# intended parameterization before interpreting the coefficient magnitudes.

# Design matrix: cell count plus the indicators, without the original label column
X = pd.concat([selected_rows.drop(columns="Metadata_cell_type"), dummies], axis=1)

print(X.shape)
X.head()


(19976, 3)


Unnamed: 0,Metadata_cell_count_per_well,Failing,Healthy
0,351,0,1
1,351,0,1
2,351,0,1
3,351,0,1
4,351,0,1


In [6]:
# Fit linear model for each feature

# The result columns below are labeled by position, so make sure the design
# matrix really is ordered as cell count, Failing, Healthy
expected_covariates = ["Metadata_cell_count_per_well", "Failing", "Healthy"]
assert list(X.columns) == expected_covariates, (
    f"Unexpected design matrix columns: {list(X.columns)}"
)

# Boolean mask of rows with the specified cell types; it does not depend on
# the feature, so it is computed once outside the loop
mask = cp_df["Metadata_cell_type"].isin(treatments_to_select)

lm_records = []
for cp_feature in cp_features:
    # Subset CP data to each individual feature (univariate test)
    cp_subset_df = cp_df.loc[mask, cp_feature]

    # Fit linear model
    lm = LinearRegression()
    lm_result = lm.fit(X=X, y=cp_subset_df)

    # Extract Beta coefficients
    # (contribution of each X covariate to the feature)
    coef = lm_result.coef_

    # Estimate fit (R^2) on the same data used for fitting
    r2_score = lm_result.score(X=X, y=cp_subset_df)

    # Add results to a growing list
    lm_records.append([cp_feature, r2_score, *coef])

# Convert results to a pandas DataFrame
lm_results = pd.DataFrame(
    lm_records,
    columns=["feature", "r2_score", "cell_count_coef", "failing_coef", "healthy_coef"],
)

# Output file (create the results directory first so a fresh checkout can run)
output_cp_file.parent.mkdir(parents=True, exist_ok=True)
lm_results.to_csv(output_cp_file, sep="\t", index=False)

print(lm_results.shape)
lm_results.head()


(661, 5)


Unnamed: 0,feature,r2_score,cell_count_coef,failing_coef,healthy_coef
0,Cytoplasm_AreaShape_Compactness,0.013426,0.00155,0.032122,-0.032122
1,Cytoplasm_AreaShape_Extent,0.009293,-0.000956,0.069869,-0.069869
2,Cytoplasm_AreaShape_FormFactor,0.016703,-0.001796,0.019894,-0.019894
3,Cytoplasm_AreaShape_MajorAxisLength,0.026743,-0.001641,0.069342,-0.069342
4,Cytoplasm_AreaShape_MinorAxisLength,0.033805,-0.001911,0.138733,-0.138733
