In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import seaborn as sns
import os
import matplotlib.colors as clrs
from IPython.core.display import display, HTML
import anndata
from sklearn.metrics import silhouette_score
from joblib import parallel_backend
import math
import re
import pickle

  from IPython.core.display import display, HTML


In [2]:
# Load the three input tables from disk (GDSC matrix results, GDSC2 fitted
# dose-response sheet, and the CCLE expression matrix).
gdscat2 = pd.read_csv("/group/iorio/Raffaele/SCDRESP_data/data/gdsc/GDSCatSquare-009_matrix_results.csv")
gdsc = pd.read_excel("/group/iorio/Raffaele/SCDRESP_data/data/gdsc/GDSC2_fitted_dose_response.xlsx")
# Use the same identifier column name as Model.csv; rebinding (instead of
# inplace=True) keeps the statement side-effect free.
gdsc = gdsc.rename(columns={"SANGER_MODEL_ID": "SangerModelID"})

# The first CSV column becomes the row index; every other column is a gene.
gex = pd.read_csv("/group/iorio/Raffaele/SCDRESP_data/data/ccle/OmicsExpressionProteinCodingGenesTPMLogp1BatchCorrected.csv", index_col=0)
# Strip any parenthesised suffix (plus the whitespace before it) from each
# column label, e.g. "ABC (123)" -> "ABC".
genes = [re.sub(r"\s*\(.*?\)", "", gene) for gene in gex.columns]
gex.columns = genes
# Name the index explicitly before turning it into a column. The previous
# reset_index().rename(columns={"index": ...}) only worked when the index was
# unnamed; this yields a "ModelID" column whatever the first header cell was.
# NOTE(review): assumes the first CSV column holds the IDs that Model.csv
# calls "ModelID" -- confirm against the data release.
gex = gex.rename_axis("ModelID").reset_index()

In [3]:
# Read the model annotation table and keep only the five columns listed below.
model_broad = pd.read_csv(
    "/group/iorio/Raffaele/SCDRESP_data/data/Model.csv"
)[
    ["ModelID", "SangerModelID", "OncotreeSubtype", "COSMICID", "OncotreeLineage"]
]

In [4]:
# Attach the model annotations to every expression row; a left join keeps all
# rows of gex even when Model.csv has no entry for that ModelID.
gex_extended = pd.merge(gex, model_broad, how="left", on="ModelID")

In [5]:
# Keep only the expression columns: drop the leading "ModelID" column and the
# four trailing annotation columns appended by the merge with model_broad.
# NOTE: the slice is positional, so it relies on model_broad contributing
# exactly four columns besides the merge key.
# .copy() detaches the result from gex_extended: later cells add columns to
# gex, and writing into a bare iloc slice raises SettingWithCopyWarning and
# leaves it ambiguous whether a view or a copy is being modified.
gex = gex_extended.iloc[:, 1:-4].copy()

In [6]:
# Label the rows with the Sanger model IDs from the merged table. The values
# are assigned positionally; gex is a column slice of gex_extended, so both
# frames have the same row order.
gex.index = gex_extended["SangerModelID"].to_numpy()

In [7]:
# Mirror the row labels into a regular column so they can be used as a
# merge / filter key.
gex["SangerModelID"] = gex.index.to_numpy()

In [8]:
# Store each row's Oncotree lineage as "tissue" (copied positionally from the
# merged table, which shares gex's row order).
gex["tissue"] = gex_extended["OncotreeLineage"].to_numpy()

In [9]:
# Distinct drug names present in the dose-response table.
drugs = set(pd.unique(gdsc["DRUG_NAME"]))

In [10]:
# Build one training set per drug: expression features (X), LN_IC50 response
# (Y), cell-line IDs (OBS) and a few annotation columns (Metadata).

# Thresholds used below, named so they can be tuned in one place.
MIN_CLINES_PER_DRUG = 250   # drugs with fewer overlapping cell lines are skipped
MIN_ROWS_PER_TISSUE = 10    # a tissue is kept only if it has MORE than this many rows
HVF_PERCENTILE = 90         # genes whose CV exceeds this percentile are retained

clines_gex = set(gex["SangerModelID"].unique())
# Every column of gex except the two annotation columns is a gene.
genes_gex = gex.drop(columns=["SangerModelID", "tissue"]).columns

training_sets_dict = {}

# Iterate in sorted order: `drugs` is a set of strings, whose iteration order
# changes between interpreter runs, which made the dict (and the pickle written
# from it) order-unstable. key=str keeps the sort safe if a NaN name is present.
for dname in sorted(drugs, key=str):
    dresp_sub_df = gdsc[gdsc["DRUG_NAME"] == dname]
    clines_dresp = set(dresp_sub_df["SangerModelID"].unique())
    # Cell lines that have both a response for this drug and expression data.
    clines_to_keep = clines_dresp & clines_gex

    if len(clines_to_keep) < MIN_CLINES_PER_DRUG:
        continue

    dresp_filtered = dresp_sub_df[dresp_sub_df["SangerModelID"].isin(clines_to_keep)]
    gex_filtered = gex[gex["SangerModelID"].isin(clines_to_keep)]

    df = gex_filtered.merge(dresp_filtered, on="SangerModelID", how="left")

    # Keep tissues with more than MIN_ROWS_PER_TISSUE rows in the merged table.
    # NOTE(review): the original comment said "at least 10" while the code has
    # always used a strict ">"; the behaviour is kept -- switch to ">=" if
    # "at least" was the intent.
    octype_counts = df["tissue"].value_counts()
    octype_counts = octype_counts[octype_counts > MIN_ROWS_PER_TISSUE].index
    df = df[df["tissue"].isin(octype_counts)]

    # Retain highly variable features: genes whose coefficient of variation
    # (std / mean over the rows of this drug's table) is above the chosen
    # percentile.
    X = df.loc[:, df.columns.isin(genes_gex)]
    cv = np.std(X, axis=0) / np.mean(X, axis=0)
    # A gene that is constant at zero gives 0/0 = NaN. np.percentile propagates
    # NaN, which turned the threshold into NaN and silently selected no genes;
    # nanpercentile ignores those entries (and "NaN > threshold" is False, so
    # such genes are never selected).
    threshold = np.nanpercentile(cv, HVF_PERCENTILE)
    hvf = cv[cv > threshold].index
    X_hvf = X.loc[:, X.columns.isin(hvf)]

    # Store the filtered dataset; all parts get a fresh 0..n-1 index so they
    # stay row-aligned with each other.
    training_sets_dict[dname] = {
        "OBS": df["SangerModelID"].reset_index(drop=True),
        "X": X_hvf.reset_index(drop=True),  # only the HVF genes
        "Y": df["LN_IC50"].reset_index(drop=True),
        "Metadata": df[["tissue", "CELL_LINE_NAME", "PATHWAY_NAME", "COSMIC_ID"]].reset_index(drop=True)
    }


In [11]:
# Persist the per-drug training sets to disk as a binary pickle.
with open(
    "/group/iorio/Raffaele/SCDRESP_data/data/training_sets_dict_baseline.pkl",
    "wb",
) as out_file:
    pickle.dump(training_sets_dict, out_file)