In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import skew
import igraph
import os
pd.set_option('display.max_columns', 30)

# Load data

In [2]:
# Root of the data tree; the cell-line inputs sit in its "clines" subfolder.
ddir = "/home/scai/PhenPred/data/"
data_folder = f"{ddir}clines"

In [3]:
# Import samplesheets
# Columns shared by both model lists once the Broad sheet has been renamed.
cols = ["model_id", "BROAD_ID", "tissue", "cancer_type"]
col_rename = {
    "ModelID": "BROAD_ID",
    "SangerModelID": "model_id",
    "SampleCollectionSite": "tissue",
    "OncotreeLineage": "cancer_type",
}
ss_cmp = pd.read_csv(f"{data_folder}/model_list_20230505.csv")

ss_depmap = pd.read_csv(f"{data_folder}/depmap24Q4/Model.csv").rename(
    columns=col_rename
)

# Map sample IDs to Sanger IDs
# Sanger rows are stacked first, so for a model_id present in both sheets the
# groupby(...).first() below keeps the Sanger annotation.
samplesheet = (
    pd.concat(
        [
            ss_cmp[cols].dropna().assign(source="sanger"),
            ss_depmap[cols].dropna().assign(source="broad"),
        ]
    )
    .groupby("model_id")
    .first()
    .reset_index()
)

# Harmonise raw tissue labels; several sites are pooled as "Other tissue".
tissue_labels = {
    "large_intestine": "Large Intestine",
    "lung": "Lung",
    "ovary": "Ovary",
    "haematopoietic_and_lymphoid_tissue": "Haematopoietic and Lymphoid",
    "bone_marrow": "Other tissue",
    "upper_aerodigestive_tract": "Other tissue",
    "ascites": "Other tissue",
    "pleural_effusion": "Other tissue",
}
samplesheet = samplesheet.replace({"tissue": tissue_labels})

# Lookup: model_id -> tissue label.
tissue_map = samplesheet.set_index("model_id")["tissue"].to_dict()

# Growth
# One row per model_id: rows are ordered by descending replicate count, then
# groupby(...).first() takes the first non-null value of every column.
# Models without a day4/day1 ratio are dropped afterwards.
growth = (
    pd.read_csv(f"{data_folder}/growth_rate_20220907.csv")
    .sort_values(["model_id", "replicates"], ascending=False)
    .groupby("model_id")
    .first()
    .dropna(subset=["day4_day1_ratio"])
)

In [4]:
# Preview the first rows of the merged sample sheet.
samplesheet.head()

Unnamed: 0,model_id,BROAD_ID,tissue,cancer_type,source
0,SIDM00001,ACH-000405,Haematopoietic and Lymphoid,Other Blood Cancers,sanger
1,SIDM00002,ACH-002340,Peripheral Nervous System,Neuroblastoma,sanger
2,SIDM00003,ACH-002159,Skin,Melanoma,sanger
3,SIDM00005,ACH-000044,Breast,Breast Carcinoma,sanger
4,SIDM00006,ACH-001552,Skin,Other Solid Cancers,sanger


In [5]:
# Distinct tissue labels present after the label harmonisation.
samplesheet['tissue'].unique()

array(['Haematopoietic and Lymphoid', 'Peripheral Nervous System', 'Skin',
       'Breast', 'Ovary', 'Large Intestine', 'Esophagus', 'Lung',
       'Head and Neck', 'Central Nervous System', 'Kidney', 'Soft Tissue',
       'Bladder', 'Bone', 'Thyroid', 'Endometrium', 'Stomach', 'Pancreas',
       'Liver', 'Cervix', 'Eye', 'Prostate', 'Biliary Tract', 'Uterus',
       'Testis', 'Other tissue', 'Placenta', 'Small Intestine',
       'Adrenal Gland', 'Vulva', 'Unknown'], dtype=object)

In [6]:
# Hex colour per tissue label, in a fixed insertion order.
# NOTE(review): "Other tissue" and "Unknown" share the colour #a32638, so the
# two cannot be told apart in a plot — confirm this is intended.
PALETTE_TTYPE = dict(
    (
        ("Lung", "#007fff"),
        ("Prostate", "#665d1e"),
        ("Stomach", "#ffbf00"),
        ("Central Nervous System", "#fbceb1"),
        ("Skin", "#ff033e"),
        ("Bladder", "#ab274f"),
        ("Haematopoietic and Lymphoid", "#d5e6f7"),
        ("Kidney", "#7cb9e8"),
        ("Thyroid", "#efdecd"),
        ("Soft Tissue", "#8db600"),
        ("Head and Neck", "#e9d66b"),
        ("Ovary", "#b284be"),
        ("Bone", "#b2beb5"),
        ("Endometrium", "#10b36f"),
        ("Breast", "#6e7f80"),
        ("Pancreas", "#ff7e00"),
        ("Peripheral Nervous System", "#87a96b"),
        ("Cervix", "#c9ffe5"),
        ("Large Intestine", "#9f2b68"),
        ("Liver", "#00ffff"),
        ("Vulva", "#008000"),
        ("Esophagus", "#cd9575"),
        ("Biliary Tract", "#72a0c1"),
        ("Other tissue", "#a32638"),
        ("Small Intestine", "#9966cc"),
        ("Placenta", "#f19cbb"),
        ("Testis", "#e32636"),
        ("Adrenal Gland", "#3b7a57"),
        ("Uterus", "#7a3b5e"),
        ("Unknown", "#a32638"),
        ("Eye", "#ff1493"),
    )
)

In [7]:
# Run identifier that prefixes the imputed output files.
# timestamp = "20250225_145621"
timestamp = "20250508_160635"

# Datasets - synthetic
# Both tables are read with their first CSV column as the index.
## Transcriptomics
gexp_path = f"/home/scai/PhenPred/reports/vae/files/{timestamp}_imputed_transcriptomics.csv.gz"
gexp_df = pd.read_csv(gexp_path, index_col=0)

## CRISPR-Cas9
cas9_path = f"/home/scai/PhenPred/reports/vae/files/{timestamp}_imputed_crisprcas9.csv.gz"
cas9_df = pd.read_csv(cas9_path, index_col=0)

In [8]:
# (rows, columns) of the growth table after de-duplication and NaN removal.
growth.shape

(964, 6)

In [9]:
# Measured transcriptomics, transposed after loading, re-indexed from Broad IDs
# to Sanger model IDs, and restricted to samples present in the imputed
# transcriptomics table.
broad_to_model_id = samplesheet.reset_index().groupby("BROAD_ID").first()["model_id"]

gexp_measured = (
    pd.read_csv(
        f"{data_folder}/depmap24Q4/OmicsExpressionGenesExpectedCountProfileVoom.csv",
        index_col=0,
    )
    .T
    # Index labels missing from the mapping are left unchanged by rename().
    .rename(index=broad_to_model_id)
)
gexp_measured = gexp_measured.loc[gexp_measured.index.isin(gexp_df.index)]

In [10]:
## CRISPR-Cas9
# Measured gene-effect table: column labels are cut at the first space, the
# index is translated from Broad IDs to Sanger model IDs, and only samples
# present in the imputed CRISPR table are kept.
broad_to_model_id = samplesheet.reset_index().groupby("BROAD_ID").first()["model_id"]

cas9_measured = pd.read_csv(
    f"{data_folder}/depmap24Q4/CRISPRGeneEffect.csv", index_col=0
).rename(columns=lambda label: label.split(" ")[0])
# cas9_measured = scale(cas9_measured.T).T
cas9_measured = cas9_measured.rename(index=broad_to_model_id)
cas9_measured = cas9_measured.loc[cas9_measured.index.isin(cas9_df.index)]

In [11]:
# Partition samples by which measured (non-imputed) datasets contain them.
# sorted() is used instead of list(set(...)) because the iteration order of a
# set of strings changes between kernel sessions (str hash randomisation),
# which would make downstream results order-dependent and not reproducible.
gexp_measured_ids = set(gexp_measured.index)
cas9_measured_ids = set(cas9_measured.index)

# Samples with measured expression but no measured CRISPR screen.
measured_gexp_only = sorted(gexp_measured_ids - cas9_measured_ids)
# Samples with a measured CRISPR screen but no measured expression.
measured_cas9_only = sorted(cas9_measured_ids - gexp_measured_ids)
# Samples measured in both datasets.
measured_both = sorted(gexp_measured_ids & cas9_measured_ids)
# Samples in the imputed CRISPR table that have no measured CRISPR screen.
measured_no_cas9 = sorted(set(cas9_df.index) - cas9_measured_ids)

In [12]:
# Sample IDs grouped by measurement availability.
measured_groups = dict(
    both=measured_both,
    gexp_only=measured_gexp_only,
    cas9_only=measured_cas9_only,
    none=[],  # Will be automatically assigned for remaining samples
)

In [13]:
# Annotated association results for this run, with -log10(FDR) columns for the
# original and VAE analyses and their difference (VAE minus original).
df_res_vae_annot = pd.read_csv(
    f"../reports/vae/crispr/{timestamp}_transcriptomics_crisprcas9_remove_latent_n3_no_tissue_standardizedTrue_annot.csv.gz"
).assign(
    log10fdr_orig=lambda res: -np.log10(res["fdr_orig"]),
    log10fdr_vae=lambda res: -np.log10(res["fdr_vae"]),
    # Positive values mean the VAE analysis gives the stronger FDR.
    diff_log10fdr=lambda res: res["log10fdr_vae"] - res["log10fdr_orig"],
)

In [37]:
# Keep associations that are significant in the VAE analysis (FDR < 0.05) with
# a positive effect size, and whose y_id is a column of the imputed
# transcriptomics table.
is_significant = df_res_vae_annot["fdr_vae"] < 0.05
is_positive = df_res_vae_annot["beta_vae"] > 0
has_expression = df_res_vae_annot["y_id"].isin(gexp_df.columns)
df_res_vae_annot_filtered = df_res_vae_annot[
    is_significant & is_positive & has_expression
]

# Cap y_id rows to 3 per group
# Rows are taken in the table's existing order; the displayed output suggests
# that order is ascending fdr_orig — TODO confirm.
df_res_vae_annot_filtered_cap = (
    df_res_vae_annot_filtered.groupby("y_id").head(3).reset_index(drop=True)
)

# First 6000 rows of the capped table are passed on to the selection.
df_res_vae_annot_filtered_cap_top = df_res_vae_annot_filtered_cap.iloc[:6000]

In [38]:
# Display the capped association table (pandas truncates the middle rows).
df_res_vae_annot_filtered_cap_top

Unnamed: 0,y_id,x_id,n_orig,beta_orig,lr_orig,covs_orig,pval_orig,fdr_orig,n_vae,beta_vae,lr_vae,covs_vae,pval_vae,fdr_vae,skew_orig,skew_mosa,target_detailed,target,entropy,log10fdr_orig,log10fdr_vae,diff_log10fdr
0,FAM50A,FAM50B,923.0,0.75825,775.49438,208.0,1.148490e-170,1.975632e-166,1523.0,0.62305,846.16667,211.0,4.953857e-186,1.802213e-182,-0.62729,-0.38142,No link; CRISPR not in network,-,0.82245,165.704294,181.744194,16.039900
1,DDX3X,DDX3Y,923.0,0.70104,628.13085,208.0,1.274517e-138,2.192424e-134,1523.0,0.54253,592.67442,211.0,6.564286e-131,2.388087e-127,0.54274,-0.18181,3,3,0.83160,133.659076,126.621950,-7.037126
2,DDX3X,USP9Y,923.0,0.66805,569.93911,208.0,5.788442e-126,9.957278e-122,1523.0,0.50986,522.08826,211.0,1.486670e-115,5.408505e-112,0.54274,-0.18181,3,3,0.83160,121.001859,111.266923,-9.734937
3,DDX3X,UTY,923.0,0.67039,566.34511,208.0,3.502334e-125,6.024714e-121,1523.0,0.50949,523.46762,211.0,7.449375e-116,2.710083e-112,0.54274,-0.18181,3,3,0.83160,120.220064,111.567017,-8.653046
4,EIF1AX,EIF1AY,923.0,0.69643,563.47514,208.0,1.474560e-124,2.536539e-120,1523.0,0.54443,528.78237,211.0,5.198192e-117,1.891102e-113,0.61340,-0.24279,1,1,0.83172,119.595758,112.723285,-6.872473
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,ALG1,ZC3H12C,923.0,0.12221,12.62258,208.0,3.811156e-04,8.835710e-02,1523.0,0.11467,17.28934,211.0,3.209821e-05,2.919332e-02,0.04987,0.32339,No link; Gene not in network,-,0.83240,1.053759,1.534716,0.480958
5996,PEX26,KRT23,923.0,0.17191,15.78221,208.0,7.106763e-05,8.840882e-02,1523.0,0.14728,19.65756,211.0,9.263592e-06,4.941807e-03,-1.00856,-0.46627,No link; Gene not in network,-,0.38794,1.053504,2.306114,1.252610
5997,PEX10,KRT23,923.0,0.17360,15.75880,208.0,7.195230e-05,8.840882e-02,1523.0,0.14578,19.60766,211.0,9.508699e-06,4.941807e-03,-0.72398,0.12880,No link; Gene not in network,-,0.49754,1.053504,2.306114,1.252610
5998,PDCD6,EPHA3,923.0,0.14164,15.96840,208.0,6.440862e-05,8.841002e-02,1523.0,0.11207,17.63382,211.0,2.677840e-05,4.855632e-02,-0.73838,-0.21157,4,4,0.75148,1.053498,1.313754,0.260256


# Cell line selection algorithm

In [39]:
from cell_line_selection_updated import select_validation_cell_lines

In [40]:
# Cell lines that are always included in the candidate pool.
FIXED_CELL_LINES = ["SIDM00136", "SIDM00795"]

# Candidate pool: samples without a measured CRISPR screen plus the fixed
# lines, de-duplicated. sorted() gives the same order on every run, whereas
# list(set(...)) order varies between kernel sessions for strings.
candidate_cell_lines = sorted(set(measured_no_cas9 + FIXED_CELL_LINES))

In [41]:
# Size of the candidate pool offered to the selection.
len(candidate_cell_lines)

506

In [42]:
# Run the cell-line selection on the capped association table, using the
# imputed expression and CRISPR matrices. All three thresholds are set to the
# 50th percentile; plots and files go to output_dir.
results = select_validation_cell_lines(
    df_res_vae_annot_filtered_cap_top,
    gexp_df,
    cas9_df,
    crispr_threshold_percentile=50,
    target_expression_threshold_percentile=50,
    biomarker_expression_threshold_percentile=50,
    create_visualizations=True,
    output_dir="./validation_results_3_criteria",
    tissue_map=tissue_map,
    # Reuse the constant instead of repeating the literal, so the fixed lines
    # passed here cannot drift from those added to candidate_cell_lines.
    fixed_cell_lines=FIXED_CELL_LINES,
    candidate_cell_lines=candidate_cell_lines,
    # NOTE(review): the meaning of `power` is defined in
    # cell_line_selection_updated — confirm there before changing it.
    power=2,
)

Starting optimized cell line selection pipeline with 3-criteria percentile-based filtering...
Using gene-specific percentile-based thresholds...
Threshold percentiles:
  CRISPR threshold: 50th percentile per target gene
  Target expression threshold: 50th percentile per target gene
  Biomarker expression threshold: 50th percentile per biomarker gene
Starting with 2 fixed cell lines: ['SIDM00136', 'SIDM00795']
Restricting selection to 506 candidate cell lines
Extracting gene pairs and weights...
Calculating validation scores with gene-specific percentile-based criteria...
Found 1523 cell lines common to both expression and CRISPR datasets
Processing 6000 gene pairs with 20 parallel jobs...
Applying gene-specific thresholds: CRISPR > 50th percentile per target gene AND Target Expression > 50th percentile per target gene AND Biomarker Expression > 50th percentile per biomarker gene
Score calculation completed in 105.83 seconds
6000 out of 6000 gene pairs have at least one cell line
that p

Selecting additional cell lines: 100%|██████████| 4/4 [00:00<00:00,  6.84it/s, covered=302]


Cell line selection completed in 6.22 seconds
Selected 6 cell lines that collectively cover 1853 of 6000 gene pairs (30.88%)
Evaluating cell line selection...

Total pipeline execution time: 113.41 seconds

----- RESULTS SUMMARY -----

RESULTS WITH GENE-SPECIFIC PERCENTILE THRESHOLDS:
CRISPR > 50th percentile per target gene AND
Target Expression > 50th percentile per target gene AND
Biomarker Expression > 50th percentile per biomarker gene

Selection restricted to 506 candidate cell lines

Selected Cell Lines:
1. SIDM00136 (Large Intestine) [FIXED] - Covers 245 gene pairs (4.08% of total)
2. SIDM00795 (Skin) [FIXED] - Covers 86 gene pairs (1.43% of total)
3. SIDM00994 (Haematopoietic and Lymphoid) [SELECTED] - Covers 513 gene pairs (8.55% of total)
4. SIDM00849 (Kidney) [SELECTED] - Covers 376 gene pairs (6.27% of total)
5. SIDM00338 (Haematopoietic and Lymphoid) [SELECTED] - Covers 331 gene pairs (5.52% of total)
6. SIDM00971 (Breast) [SELECTED] - Covers 302 gene pairs (5.03% of tota

In [43]:
# Shape of the full coverage matrix returned by the pipeline.
results["coverage_matrix"].shape

(6000, 1523)

In [44]:
# Validation scores restricted to the columns of the selected cell lines.
results["validation_scores"][results["selected_cell_lines"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,SIDM00136,SIDM00795,SIDM00994,SIDM00849,SIDM00338,SIDM00971
target_gene,biomarker_gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
FAM50A,FAM50B,0.0,0.0,0.0,0.705624,0.0,0.415598
DDX3X,DDX3Y,0.0,0.0,0.0,0.000000,0.0,0.000000
DDX3X,USP9Y,0.0,0.0,0.0,0.000000,0.0,0.000000
DDX3X,UTY,0.0,0.0,0.0,0.000000,0.0,0.000000
EIF1AX,EIF1AY,0.0,0.0,0.0,0.715823,0.0,0.000000
...,...,...,...,...,...,...,...
ALG1,ZC3H12C,0.0,0.0,0.0,0.000000,0.0,0.000000
PEX26,KRT23,0.0,0.0,0.0,0.000000,0.0,0.000000
PEX10,KRT23,0.0,0.0,0.0,0.000000,0.0,0.000000
PDCD6,EPHA3,0.0,0.0,0.0,0.249767,0.0,0.000000


In [45]:
# Coverage matrix restricted to the columns of the selected cell lines.
coverage_matrix_df = results["coverage_matrix"].loc[:, results["selected_cell_lines"]]

In [46]:
# Preview the first 50 rows of the coverage matrix for the selected lines.
coverage_matrix_df.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,SIDM00136,SIDM00795,SIDM00994,SIDM00849,SIDM00338,SIDM00971
target_gene,biomarker_gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
FAM50A,FAM50B,0,0,0,1,0,1
DDX3X,DDX3Y,0,0,0,0,0,0
DDX3X,USP9Y,0,0,0,0,0,0
DDX3X,UTY,0,0,0,0,0,0
EIF1AX,EIF1AY,0,0,0,1,0,0
RPP25L,RPP25,1,0,0,0,0,1
EIF1AX,KDM5D,0,0,0,1,0,0
EIF1AX,DDX3Y,0,0,0,1,0,0
DNAJC19,DNAJC15,0,0,0,0,1,0
TTC7A,TTC7B,0,0,0,0,1,0


In [47]:
# Shape of the sub-matrix whose rows have a positive coverage total across the
# selected cell lines.
coverage_matrix_df.loc[coverage_matrix_df.sum(axis=1) > 0].shape

(1853, 6)

In [48]:
# Write the rows with a positive coverage total to CSV.
row_is_covered = coverage_matrix_df.sum(axis=1) > 0
coverage_matrix_df.loc[row_is_covered].to_csv(
    "./validation_results_3_criteria/coverage_matrix_df_filtered.csv"
)

In [36]:
# Report the selected cell lines
selected_cell_lines = results["selected_cell_lines"]
print("Selected cell lines:", selected_cell_lines)

# For every selected line, print the first ten example gene pairs; examples
# are looked up by the line's position in the selection.
for position, line_id in enumerate(selected_cell_lines):
    pair_examples = results["gene_pair_examples"][position]
    print(f"\nTop gene pairs for {line_id}:")
    for target, biomarker, pair_score in pair_examples[:10]:
        print(f"  {target} (CRISPR) - {biomarker} (Expr) - Score: {pair_score:.4f}")

Selected cell lines: ['SIDM00136', 'SIDM00795', 'SIDM00994', 'SIDM00849', 'SIDM00338', 'SIDM00971']

Top gene pairs for SIDM00136:
  RPP25L (CRISPR) - RPP25 (Expr) - Score: 5.0000
  POP7 (CRISPR) - RPP25 (Expr) - Score: 1.3239
  LIF (CRISPR) - TRA2A (Expr) - Score: 1.0351
  PPM1G (CRISPR) - TRA2A (Expr) - Score: 1.0264
  CHMP4B (CRISPR) - CHMP4C (Expr) - Score: 1.0250
  UQCC3 (CRISPR) - UQCRH (Expr) - Score: 1.0114
  EIF6 (CRISPR) - MROH1 (Expr) - Score: 1.0084
  TCF3 (CRISPR) - ETFB (Expr) - Score: 1.0072
  CDK6 (CRISPR) - CDKN2A (Expr) - Score: 1.0070
  ECHS1 (CRISPR) - HSD17B8 (Expr) - Score: 1.0063

Top gene pairs for SIDM00795:
  NAA10 (CRISPR) - NAA11 (Expr) - Score: 3.0246
  GSPT1 (CRISPR) - GSPT2 (Expr) - Score: 1.5304
  TP53 (CRISPR) - PTCHD4 (Expr) - Score: 1.1200
  TP53 (CRISPR) - EDA2R (Expr) - Score: 1.0975
  SNAI2 (CRISPR) - SNAI1 (Expr) - Score: 1.0228
  ALAS1 (CRISPR) - SYDE1 (Expr) - Score: 1.0102
  NAA10 (CRISPR) - XAGE1B (Expr) - Score: 1.0061
  STXBP3 (CRISPR) - KIF