In [1]:
import pandas as pd 
import numpy as np
import re
import pickle

In [2]:
# Integrated table (holds the "type", "SangerModelID" and "tissue" columns used
# by the later cells); the first CSV column becomes the index.
integrated = pd.read_csv(
    "/group/iorio/Raffaele/SCDRESP_data/data/post_corrected_psclones.csv",
    index_col=0,
)

# Pseudo-clone drug-response table; later cells read its "SangerModelID" column
# and one column per drug name.
psclone_dresp = pd.read_csv(
    "/group/iorio/Raffaele/SCDRESP_data/data/psclones_dresp_predictions_rep1.csv",
    index_col=0,
)

In [3]:
# GDSC2 fitted dose-response sheet. The model-ID column is renamed so that it
# carries the same "SangerModelID" key as the expression table.
gdsc = pd.read_excel(
    "/group/iorio/Raffaele/SCDRESP_data/data/gdsc/GDSC2_fitted_dose_response.xlsx"
).rename(columns={"SANGER_MODEL_ID": "SangerModelID"})

In [4]:
# Headerless text file; its first column is kept as an array of drug names
# (matched against DRUG_NAME further down).
drugs = pd.read_table("drugs_list.txt", header=None)[0].values

In [5]:
# Bulk rows of the integrated table, without the "type" flag column and with a
# fresh 0..n-1 index.
gex_corrected = (
    integrated.loc[integrated["type"] == "bulk"]
    .drop(columns="type")
    .reset_index(drop=True)
)

In [6]:
# Cell lines present in the bulk expression table, and the expression feature
# columns (every column except the two annotation columns).
clines_gex = set(gex_corrected["SangerModelID"].unique())
genes_gex = gex_corrected.drop(columns=["SangerModelID", "tissue"]).columns

# drug name -> {"OBS", "X", "Y", "Metadata"} for every drug that passes the filters.
training_sets_dict = {}

for dname in drugs:
    # Response rows of this drug and the cell lines it shares with the expression table.
    drug_rows = gdsc[gdsc["DRUG_NAME"] == dname]
    shared_clines = clines_gex.intersection(drug_rows["SangerModelID"].unique())

    # Drugs with fewer than 250 shared cell lines are skipped.
    if len(shared_clines) < 250:
        continue

    expr_rows = gex_corrected[gex_corrected["SangerModelID"].isin(shared_clines)]
    resp_rows = drug_rows[drug_rows["SangerModelID"].isin(shared_clines)]
    merged = expr_rows.merge(resp_rows, on="SangerModelID", how="left")

    # Keep only tissues with MORE than 10 rows in the merged table (strict ">").
    # NOTE(review): these are row counts of the merge, not unique cell lines; if
    # the response table holds several rows for one drug/cell-line pair they are
    # counted (and kept) several times -- confirm this is intended.
    tissue_sizes = merged["tissue"].value_counts()
    kept_tissues = tissue_sizes[tissue_sizes > 10].index
    merged = merged[merged["tissue"].isin(kept_tissues)].reset_index(drop=True)

    # Store the filtered dataset. "X" holds all expression columns here; no
    # variable-feature selection happens in this cell.
    training_sets_dict[dname] = dict(
        OBS=merged["SangerModelID"],
        X=merged.loc[:, merged.columns.isin(genes_gex)],
        Y=merged["LN_IC50"],
        Metadata=merged[["tissue", "CELL_LINE_NAME", "PATHWAY_NAME", "COSMIC_ID"]],
    )

In [7]:
# Feature columns of the integrated table: everything except the three
# annotation columns.
genes_gex = integrated.columns.drop(["SangerModelID", "tissue", "type"])

In [8]:
# Build, for every drug in training_sets_dict, a dataset ready for ML in which
# the pseudo-clone rows are stacked on top of the bulk rows.

training_sets_dict_pclone = dict()

# Highly variable features (HVF) are selected on the whole integrated table:
# features whose coefficient of variation (std / mean) lies above its 90th percentile.
# NOTE(review): std / mean is unstable for features whose mean is close to zero
# or negative -- confirm it is appropriate for these corrected values.
X = integrated.loc[:, integrated.columns.isin(genes_gex)]
cv = np.std(X, axis=0)/np.mean(X, axis=0)
# nanpercentile instead of percentile: a single NaN CV (e.g. 0/0 for an all-zero
# column) would turn the threshold into NaN and leave no feature selected.
threshold = np.nanpercentile(cv, 90)
hvf = cv[cv > threshold].index
X_hvf = X.loc[:, X.columns.isin(hvf)]

# Index labels 0..191 (label-based slice, both ends included) are taken as the
# pseudo-clone rows. These two objects do not depend on the drug, so they are
# built once, outside the loop.
X_pclone = X_hvf.loc[0:191, :].reset_index(drop=True)
obs_pclone = psclone_dresp["SangerModelID"]

# The pseudo-clone features are stacked positionally with the pseudo-clone
# IDs/responses; a length mismatch would silently misalign X against OBS and Y.
if len(X_pclone) != len(obs_pclone):
    raise ValueError(
        f"pseudo-clone feature rows ({len(X_pclone)}) do not match "
        f"psclone_dresp rows ({len(obs_pclone)})"
    )

for k, v in training_sets_dict.items():

    y_bulk = v["Y"]
    obs_bulk = v["OBS"]
    X_bulk = v["X"]

    # Response column of the pseudo-clones for this drug.
    y_pclone = psclone_dresp[k]

    # Bulk features are restricted to the same HVF columns before stacking.
    X_concatenated = pd.concat([X_pclone, X_bulk.loc[:, X_bulk.columns.isin(hvf)]]).reset_index(drop=True)

    training_sets_dict_pclone[k] = {
        "OBS": pd.concat([obs_pclone, obs_bulk]).reset_index(drop=True),
        "X": X_concatenated,
        "Y": pd.concat([y_pclone, y_bulk]).reset_index(drop=True)
    }

In [10]:
# Write the combined training sets to disk as a pickle.
with open("/group/iorio/Raffaele/SCDRESP_data/data/training_sets_dict_psclone.pkl", "wb") as pkl_file:
    pickle.dump(training_sets_dict_pclone, pkl_file)