## Process JUMP phenotypic profiles

We applied the AreaShape-only, class-balanced, multiclass elastic net logistic regression model to all single-cell profiles in the JUMP dataset.

We then performed a series of KS tests to identify how the distribution of each phenotype probability in each treatment differed from that of controls.

See https://github.com/WayScience/JUMP-single-cell for complete details.

Here, we perform the following:

1. Load in this data from the JUMP-single-cell repo
2. Explore the top results per phenotype/treatment_type/model_type
3. Convert it to wide format

This wide format represents a "phenotypic profile", which we can use similarly to an image-based morphology profile.

In [1]:
import pathlib
import pandas as pd

In [2]:
# Pin the JUMP-single-cell repository to one commit so the download is reproducible
commit = "2c063b6dc48049201a57b060d18f97a5fc783488"

# Raw-file endpoint of the repository and the path of the comparison results inside it
url = "https://github.com/WayScience/JUMP-single-cell/raw"
file = (
    "3.analyze_data/"
    "class_balanced_well_log_reg_comparison_results/"
    "class_balanced_well_log_reg_areashape_model_comparisons.parquet"
)

# Full location of the parquet file: <url>/<commit>/<file>
jump_sc_pred_file = "/".join([url, commit, file])

# Number of lowest-p-value rows kept per group when exploring the top results
n_top_results_to_explore = 10

In [3]:
# Set output files
output_dir = "jump_phenotype_profiles"

# Create the output directory up front so the later `to_csv` calls do not fail
# when the notebook is run from a fresh checkout (no-op if it already exists)
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

# Tab-separated summary of the most significant rows per group
top_results_summary_file = pathlib.Path(output_dir, "jump_most_significant_phenotype_enrichment.tsv")
# Wide-format phenotypic profiles for the final and shuffled models (gzip-compressed TSV)
final_jump_phenotype_file = pathlib.Path(output_dir, "jump_phenotype_profiles.tsv.gz")
shuffled_jump_phenotype_file = pathlib.Path(output_dir, "jump_phenotype_profiles_shuffled.tsv.gz")

In [4]:
# Read the KS test results from the pinned commit, then discard the two
# columns that are not used anywhere downstream in this notebook
jump_pred_df = pd.read_parquet(jump_sc_pred_file).drop(
    ["statistical_test", "comparison_metric"], axis="columns"
)

# Report the dimensions and preview the first rows
print(jump_pred_df.shape)
jump_pred_df.head()

(484650, 9)


Unnamed: 0,comparison_metric_value,p_value,Metadata_Plate,treatment,Metadata_model_type,treatment_type,Metadata_Well,cell_count,phenotype
0,0.091654,0.01313,BR00117002,ABL1,final,crispr,C01,592,ADCCM
1,0.118823,0.000441,BR00117002,ABL1,final,crispr,C01,592,Anaphase
2,0.121319,0.000273,BR00117002,ABL1,final,crispr,C01,592,Apoptosis
3,0.054403,0.332314,BR00117002,ABL1,final,crispr,C01,592,Binuclear
4,0.030717,0.931704,BR00117002,ABL1,final,crispr,C01,592,Elongated


In [5]:
# Focus on the top results for downstream interpretation:
# keep the `n_top_results_to_explore` rows with the smallest p-values within
# every (model type, treatment type, phenotype) combination.
top_result_group_columns = ["Metadata_model_type", "treatment_type", "phenotype"]

# Sorting by the group keys and then by p-value, followed by `groupby().head()`,
# selects the lowest p-values per group without `groupby().apply()`. The apply
# form relied on the grouping columns being passed to the applied function,
# which is deprecated in pandas 2.2; combined with `reset_index(drop=True)` it
# would drop the three key columns from the summary once that behavior is removed.
jump_focused_top_results_df = (
    jump_pred_df
    .sort_values(by=top_result_group_columns + ["p_value"])
    .groupby(top_result_group_columns, sort=False)
    .head(n_top_results_to_explore)
    .reset_index(drop=True)
)

# Persist the summary as a tab-separated file
jump_focused_top_results_df.to_csv(top_results_summary_file, sep="\t", index=False)

print(jump_focused_top_results_df.shape)
jump_focused_top_results_df.head()

(900, 9)


Unnamed: 0,comparison_metric_value,p_value,Metadata_Plate,treatment,Metadata_model_type,treatment_type,Metadata_Well,cell_count,phenotype
0,0.348498,1.137701e-69,BR00117016,ingenol-mebutate,final,compound,M11,1280,ADCCM
1,0.700355,1.237545e-68,BR00117051,fludarabine-phosphate,final,compound,N12,290,ADCCM
2,0.553765,3.6404599999999997e-65,BR00117015,fludarabine-phosphate,final,compound,N12,459,ADCCM
3,0.572046,6.234639e-63,BR00117019,fludarabine-phosphate,final,compound,N12,412,ADCCM
4,0.331729,9.171405e-62,BR00117015,ingenol-mebutate,final,compound,M11,1259,ADCCM


## Summarize data

In [6]:
# Number of distinct plates represented in the KS test results
jump_pred_df["Metadata_Plate"].nunique()

51

In [7]:
# Number of rows per treatment type, restricted to the final model.
# Note: this counts rows of the long-format table, not unique treatments.
jump_pred_df.loc[
    jump_pred_df["Metadata_model_type"] == "final", "treatment_type"
].value_counts()

compound    113235
crispr       86100
orf          42990
Name: treatment_type, dtype: int64

In [8]:
# Number of distinct treatments within each treatment type (all model types)
jump_pred_df.groupby("treatment_type")["treatment"].nunique()

treatment_type
compound    302
crispr      160
orf         160
Name: treatment, dtype: int64

In [9]:
# Number of rows per phenotype, restricted to the final model.
# Note: this counts rows of the long-format table, not unique treatments.
jump_pred_df.loc[
    jump_pred_df["Metadata_model_type"] == "final", "phenotype"
].value_counts()

ADCCM                 16155
Anaphase              16155
Apoptosis             16155
Binuclear             16155
Elongated             16155
Grape                 16155
Hole                  16155
Interphase            16155
Large                 16155
Metaphase             16155
MetaphaseAlignment    16155
OutOfFocus            16155
Polylobed             16155
Prometaphase          16155
SmallIrregular        16155
Name: phenotype, dtype: int64

## Convert data to phenotypic profiles

In [10]:
# Build the wide phenotypic profile for the final model: the metadata columns
# identify each row and every phenotype becomes its own column holding
# `comparison_metric_value`.
jump_wide_final_df = (
    jump_pred_df
    .loc[lambda frame: frame["Metadata_model_type"] == "final"]
    .drop(columns="p_value")
    .pivot(
        index=[
            "Metadata_Plate",
            "treatment",
            "treatment_type",
            "Metadata_Well",
            "cell_count",
        ],
        columns="phenotype",
        values="comparison_metric_value",
    )
    .reset_index()
)

# Write the final-model profiles as a tab-separated file
jump_wide_final_df.to_csv(final_jump_phenotype_file, sep="\t", index=False)

print(jump_wide_final_df.shape)
jump_wide_final_df.head()

(16155, 20)


phenotype,Metadata_Plate,treatment,treatment_type,Metadata_Well,cell_count,ADCCM,Anaphase,Apoptosis,Binuclear,Elongated,Grape,Hole,Interphase,Large,Metaphase,MetaphaseAlignment,OutOfFocus,Polylobed,Prometaphase,SmallIrregular
0,BR00116991,1-EBIO,compound,P06,1473,0.046737,0.048754,0.050341,0.064248,0.047873,0.042167,0.028808,0.077078,0.06762,0.03109,0.038145,0.05811,0.024753,0.035821,0.034527
1,BR00116991,1-octanol,compound,I24,1111,0.031986,0.085904,0.099348,0.088649,0.028359,0.067738,0.03138,0.088518,0.102193,0.064525,0.032058,0.099526,0.02551,0.03199,0.066391
2,BR00116991,"2,5-furandimethanol",compound,G12,1153,0.027638,0.094782,0.084658,0.063705,0.03549,0.05877,0.039559,0.061289,0.100094,0.063818,0.04028,0.077559,0.046654,0.0444,0.058264
3,BR00116991,2-Oleoylglycerol,compound,H02,1451,0.041859,0.084837,0.078716,0.052371,0.039707,0.063077,0.022829,0.049826,0.075192,0.067639,0.034138,0.074687,0.033886,0.027407,0.062492
4,BR00116991,4-CMTB,compound,C01,1570,0.112659,0.078071,0.070801,0.073806,0.107542,0.105355,0.051031,0.045419,0.050727,0.11181,0.111739,0.092894,0.089923,0.039914,0.068263


In [11]:
# Build the wide phenotypic profile for the shuffled model: the metadata columns
# identify each row and every phenotype becomes its own column holding
# `comparison_metric_value`.
jump_wide_shuffled_df = (
    jump_pred_df
    .loc[lambda frame: frame["Metadata_model_type"] == "shuffled"]
    .drop(columns="p_value")
    .pivot(
        index=[
            "Metadata_Plate",
            "treatment",
            "treatment_type",
            "Metadata_Well",
            "cell_count",
        ],
        columns="phenotype",
        values="comparison_metric_value",
    )
    .reset_index()
)

# Write the shuffled-model profiles as a tab-separated file
jump_wide_shuffled_df.to_csv(shuffled_jump_phenotype_file, sep="\t", index=False)

print(jump_wide_shuffled_df.shape)
jump_wide_shuffled_df.head()

(16155, 20)


phenotype,Metadata_Plate,treatment,treatment_type,Metadata_Well,cell_count,ADCCM,Anaphase,Apoptosis,Binuclear,Elongated,Grape,Hole,Interphase,Large,Metaphase,MetaphaseAlignment,OutOfFocus,Polylobed,Prometaphase,SmallIrregular
0,BR00116991,1-EBIO,compound,P06,1473,0.035732,0.047889,0.049714,0.040586,0.035098,0.027401,0.026632,0.057831,0.01939,0.032145,0.044006,0.050428,0.034754,0.031455,0.053313
1,BR00116991,1-octanol,compound,I24,1111,0.040171,0.058095,0.064366,0.052012,0.057874,0.028038,0.031712,0.025602,0.039935,0.031761,0.032016,0.037037,0.02193,0.023784,0.046658
2,BR00116991,"2,5-furandimethanol",compound,G12,1153,0.023798,0.055775,0.066432,0.040479,0.057085,0.024953,0.052325,0.031039,0.035398,0.023379,0.025458,0.044938,0.024937,0.057878,0.058678
3,BR00116991,2-Oleoylglycerol,compound,H02,1451,0.024628,0.02892,0.061039,0.06361,0.061081,0.015462,0.03904,0.030507,0.045197,0.032572,0.033012,0.046498,0.051909,0.050479,0.057157
4,BR00116991,4-CMTB,compound,C01,1570,0.059344,0.084141,0.062821,0.048117,0.04651,0.124347,0.046286,0.064387,0.016084,0.060213,0.07513,0.103736,0.039124,0.039918,0.02144
