In [1]:
import sys
import pathlib
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
import plotly.express as px
from pycytominer.cyto_utils import infer_cp_features
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import pdfkit
import pyarrow as pa
import pyarrow.parquet as pq

# import Union
from typing import Union

sys.path.append("..")
# from ..utils.utils import df_stats
import matplotlib.pyplot as plt

pd.set_option("mode.chained_assignment", None)

In [2]:
# Input: normalized single-cell feature table stored as parquet
feature_file = pathlib.Path(
    "../../Extracted_Features_(CSV_files)/feature_df_sc_norm.parquet"
)
# Read with pyarrow, then materialize as a pandas DataFrame
feature_df = pq.read_table(source=feature_file).to_pandas()

In [3]:
# Output locations, one TSV per model variant (one to four betas)
(
    one_beta_output_file_path,
    two_beta_output_file_path,
    three_beta_output_file_path,
    four_beta_output_file_path,
) = map(
    pathlib.Path,
    (
        "./results/lm_one_beta.tsv",
        "./results/lm_two_beta.tsv",
        "./results/lm_three_beta.tsv",
        "./results/lm_four_beta.tsv",
    ),
)

In [4]:
# Collect the CellProfiler feature columns that will each be modeled
cp_features = infer_cp_features(feature_df)
print(f"We are testing {len(cp_features)} CellProfiler features")

# List every distinct treatment/dose label, one per output line
new_line = "\n"
unique_treatment_dosages = feature_df[
    "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"
].unique()
treatment_dosage_listing = f", {new_line}".join(unique_treatment_dosages)
print(f"The unique Treatment-Dosages are: {treatment_dosage_listing}")

We are testing 2847 CellProfiler features
The unique Treatment-Dosages are: media ctr_0_Media ctr_0.0, 
DMSO_0.100_DMSO_1.0, 
DMSO_0.100_Z-VAD-FMK_100.0, 
DMSO_0.100_Z-VAD-FMK_30.0, 
DMSO_0.100_DMSO_0.025, 
Thapsigargin_1.000_DMSO_0.025, 
Thapsigargin_10.000_DMSO_0.025, 
Topotecan_5.000_DMSO_0.025, 
Topotecan_10.000_DMSO_0.025, 
Topotecan_20.000_DMSO_0.025, 
LPS_0.010_DMSO_0.025, 
LPS_0.100_DMSO_0.025, 
LPS_1.000_DMSO_0.025, 
LPS_10.000_DMSO_0.025, 
LPS_10.000_Disulfiram_0.1, 
LPS_10.000_Disulfiram_1.0, 
LPS_10.000_Disulfiram_2.5, 
LPS_Nigericin_100.000_1.0_DMSO_0.025, 
LPS_Nigericin_100.000_3.0_DMSO_0.025, 
LPS_Nigericin_100.000_10.0_DMSO_0.025, 
Disulfiram_0.100_DMSO_0.025, 
Disulfiram_1.000_DMSO_0.025, 
Disulfiram_2.500_DMSO_0.025, 
H2O2_100.000_DMSO_0.025, 
LPS_10.000_Z-VAD-FMK_100.0, 
LPS_100.000_DMSO_0.025, 
LPS_Nigericin_1.000_1.0_DMSO_0.025, 
LPS_Nigericin_1.000_3.0_DMSO_0.025, 
LPS_Nigericin_1.000_10.0_DMSO_0.025, 
LPS_Nigericin_1.000_10.0_Disulfiram_1.0, 
LPS_Nigericin_1.000_

#### Complex Linear Modeling (cell count beta + 4 beta approach)
Here I run the same analysis as above, but with the dose of each treatment included as a factor in the linear model. Results for all features and treatments are exported to a single file.

Linear Model:  
$y = \alpha + \beta_{0}x_{0} + \beta_{1}x_{1} + \beta_{2}x_{2} + \beta_{3}x_{3} + \beta_{4}x_{4} + \epsilon$ where;  
$y$ is each feature    
$x_{0}, \dots, x_{4}$ are the input variables (cell count, Inducer, Inducer dose, Inhibitor, and Inhibitor dose, respectively)  
$\alpha$ is the intercept (the model is fit with an intercept).  
$\beta _{0}$ is the beta coefficient attributed to cell count.    
$\beta _{1}$ is the beta coefficient attributed to Inducer.   
$\beta _{2}$ is the beta coefficient attributed to Inducer dose.    
$\beta _{3}$ is the beta coefficient attributed to Inhibitor.    
$\beta _{4}$ is the beta coefficient attributed to Inhibitor Dose.   
$\epsilon$ is the residual variance not explained by factors in the model  

In [5]:
# Loop for each treatment then each feature:
# every treatment group is paired with the shared control group, and one
# univariate linear model is fit per CellProfiler feature on that subset.

# Setup linear modeling framework
model_covariates = ["Metadata_number_of_singlecells"]
control = "DMSO__0.100__DMSO__0.025"
treatment_column = "fourb_Metadata_Treatment_Dose_Inhibitor_Dose"

# Columns obtained by splitting the "__"-delimited treatment label
split_columns = [
    "fourb_Treatment",
    "fourb_Treatment_Dose",
    "fourb_Inhibitor",
    "fourb_Inhibitor_Dose",
]

# Header of the results table. Defined before the loop so it exists even when
# the loop body never runs (e.g. an empty feature table). The order matches the
# design matrix X built below: covariates first, then the encoded columns.
columns_list = (
    ["feature", "r2_score"]
    + model_covariates
    + split_columns
    + [
        "inducer1__inducer1_dose__inhibitor__inhibitor_dose",
    ]
)

lm_results = []
for treatment in feature_df[treatment_column].unique():
    # NOTE(review): when `treatment` equals `control` the subset holds a single
    # group, so every encoded column below is constant for that iteration.
    dosage_treatments_list = [treatment, control]
    print(dosage_treatments_list)

    # Copy the subset so the new columns are written to an independent frame
    # instead of relying on the globally silenced chained-assignment warning.
    df = feature_df.query(
        "fourb_Metadata_Treatment_Dose_Inhibitor_Dose in @dosage_treatments_list"
    ).copy()

    # Split the combined label into treatment / dose / inhibitor / inhibitor dose
    df[split_columns] = df[treatment_column].str.split("__", expand=True)

    # Integer-encode each of the four categorical columns independently.
    # NOTE(review): LabelEncoder numbers classes in sorted label order, so
    # whether the control is coded 0 or 1 depends on how its string compares
    # with the treatment's; confirm coefficient signs are read accordingly.
    tmp_df = df.loc[:, split_columns].copy()
    for column in split_columns:
        tmp_df[column] = LabelEncoder().fit_transform(tmp_df[column])

    # Design matrix: cell-count covariate followed by the four encoded columns
    X = pd.concat([df.loc[:, model_covariates], tmp_df], axis=1)

    # Fit linear model for each feature
    for cp_feature in cp_features:
        # Subset CP data to each individual feature (univariate test)
        cp_subset_df = df.loc[:, cp_feature]

        # Fit linear model
        lm = LinearRegression(fit_intercept=True)
        lm_result = lm.fit(X=X, y=cp_subset_df)

        # Extract Beta coefficients
        # (contribution of feature to X covariates)
        coef = list(lm_result.coef_)
        # Estimate fit (R^2)
        r2_score = lm.score(X=X, y=cp_subset_df)

        # Add results to a growing list
        lm_results.append([cp_feature, r2_score] + coef + [treatment])

# Convert results to a pandas DataFrame
lm_results_df = pd.DataFrame(lm_results, columns=columns_list)

# write output to file (create the results directory on a fresh checkout)
four_beta_output_file_path.parent.mkdir(parents=True, exist_ok=True)
lm_results_df.to_csv(four_beta_output_file_path, sep="\t", index=False)

['media ctr__0__Media ctr__0.0', 'DMSO__0.100__DMSO__0.025']
['DMSO__0.100__DMSO__1.0', 'DMSO__0.100__DMSO__0.025']
['DMSO__0.100__Z-VAD-FMK__100.0', 'DMSO__0.100__DMSO__0.025']
['DMSO__0.100__Z-VAD-FMK__30.0', 'DMSO__0.100__DMSO__0.025']
['DMSO__0.100__DMSO__0.025', 'DMSO__0.100__DMSO__0.025']
['Thapsigargin__1.000__DMSO__0.025', 'DMSO__0.100__DMSO__0.025']
['Thapsigargin__10.000__DMSO__0.025', 'DMSO__0.100__DMSO__0.025']
['Topotecan__5.000__DMSO__0.025', 'DMSO__0.100__DMSO__0.025']
['Topotecan__10.000__DMSO__0.025', 'DMSO__0.100__DMSO__0.025']
['Topotecan__20.000__DMSO__0.025', 'DMSO__0.100__DMSO__0.025']
['LPS__0.010__DMSO__0.025', 'DMSO__0.100__DMSO__0.025']
['LPS__0.100__DMSO__0.025', 'DMSO__0.100__DMSO__0.025']
['LPS__1.000__DMSO__0.025', 'DMSO__0.100__DMSO__0.025']
['LPS__10.000__DMSO__0.025', 'DMSO__0.100__DMSO__0.025']
['LPS__10.000__Disulfiram__0.1', 'DMSO__0.100__DMSO__0.025']
['LPS__10.000__Disulfiram__1.0', 'DMSO__0.100__DMSO__0.025']
['LPS__10.000__Disulfiram__2.5', 'DMSO