# Preprocess

In [1]:
from utils import gctx, clean_doubling_time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from cmapPy.pandasGEXpress.parse import parse

encourage = True
if encourage: print("Arise, arise, Kernels of the Motherboard!")

Arise, arise, Kernels of the Motherboard!


## Load data

In [2]:
# The gctx class simplifies processing of gctx files.
# The warnings below are due to a deprecated use of error="false" in the cmapPy package.
# See https://github.com/cmap/cmapPy/blob/master/cmapPy/pandasGEXpress/parse_gctx.py
data = gctx("/local/data1/simjo484/level5_beta_trt_cp_n720216x12328.gctx")


siginfo = pd.read_csv("/local/data1/simjo484/siginfo_beta.txt", sep="\t", dtype=str)
compoundinfo = pd.read_csv("/local/data1/simjo484/compoundinfo_beta.txt", sep="\t", dtype=str)
cellinfo = pd.read_csv("/local/data1/simjo484/cellinfo_beta.txt", sep="\t", dtype=str)
geneinfo = pd.read_csv("/local/data1/simjo484/geneinfo_beta.txt", sep="\t", dtype=str)

if encourage: print("The first cell has been defeated! Kernel shall be shaken, Cooling fan be splintered,")

  meta_df = meta_df.apply(lambda x: pd.to_numeric(x, errors="ignore"))
  meta_df = meta_df.apply(lambda x: pd.to_numeric(x, errors="ignore"))


The first cell has been defeated! Kernel shall be shaken, Cooling fan be splintered,


In [3]:
# Create Dataframe with the unique signatures row-wise (and an index)
# The index helps keep track of which rows are duplicated below, so that the labels also can be duplicated appropriately.
X_raw = pd.DataFrame(data.cids, columns=["cids"])
X_raw["index"] = [i for i in range(len(data.cids))]
#print(X_raw.shape)

# Join all the information we have on the unique signature
X_raw = X_raw.merge(siginfo, how="left", left_on="cids", right_on="sig_id")
#print(X_raw.shape)

# Join all the information we have on the pertubagens in general
X_raw = X_raw.merge(compoundinfo, how="left", left_on="pert_id", right_on="pert_id")
#print(X_raw.shape)

# Join cell information
X_raw = X_raw.merge(cellinfo, how="left", on="cell_iname")
#print(X_raw.shape)

# Encode cell type
X_raw = X_raw.join(pd.get_dummies(X_raw["cell_type"], prefix="cell_type"))
#print(X_raw.shape)

# Split up certain factor features
#X_raw["cell_type_tumor"] = X_raw["cell_type"] == "tumor" # type are either "normal" or "tumor" or "pool"
#X_raw.drop("cell_type", axis=1)
#X_raw = X_raw.join(pd.get_dummies(X_raw["cell_type"], prefix="celltype")) # celltype_normal and celltype_tumor

if encourage: print("A second cell has fallen. Let the hope inspire you! Victory awaits!")

A second cell has fallen. Let the hope inspire you! Victory awaits!


In [4]:
# Load the raw labels
y_raw = data.obj.data_df.transpose()

# Identify landmark genes
landmark_gene_ids = geneinfo[geneinfo["feature_space"] == "landmark"]["gene_id"].to_list()
landmark_gene_ids = [str(i) for i in landmark_gene_ids]

# Filter out non-landmark genes
y_raw = y_raw[landmark_gene_ids]

# duplicate as in X_raw
y_raw = y_raw.iloc[X_raw["index"],:]

# Encode cell_lineage
X_raw = X_raw.join(pd.get_dummies(X_raw["cell_lineage"], prefix="cell_lineage"))

# Encode cell growth_pattern
X_raw = X_raw.join(pd.get_dummies(X_raw["growth_pattern"], prefix="growth_pattern"))

# Clean doubling_time
#X_raw.loc[:,"doubling_time"] = [float(120) if val == ">120" else float(val) for val in X_raw["doubling_time"]]
X_raw.loc[:, "doubling_time"] = [clean_doubling_time(i) for i in X_raw.loc[:,"doubling_time"]]
#print(np.unique(X_raw.loc[:,"doubling_time"]))

## THIS IS WHERE WE START THROWING AWAY ROWS AND COLUMNS ##
X = X_raw.copy()
y = y_raw.copy()

if encourage: print("Another cell tumbles down before our Kernels, defeated! The admins smile upon us on this glorious day!")
#if ride == True: print("Cell shall be shaken, shield be splintered,")

Another cell tumbles down before our Kernels, defeated! The admins smile upon us on this glorious day!


In [5]:
# Remove observations with dose unit uM, because it does not play nice with the other does, and there are not many such observations
#ind = (X["pert_dose_unit"] != "uM") & (X["pert_dose_unit"] != "nan") & (X["pert_dose_unit"].isna() == False)
#ind = ind.to_list()
#X = X[ind]
#y = y[ind]

# Remove observations where dose unit is *not* micromolar (uM), because there are not many of them, and we want standardised data.
ind = (X["pert_dose_unit"] == "uM")
ind = ind.to_list()
X = X[ind]
y = y[ind]

# Convert dose to float (many are str, like "0.6").
X.loc[:, "pert_dose"] = [float(i) for i in X["pert_dose"]]

# Standardise / normalise dose to the interval [0, 1]
X.loc[:,"pert_dose"] = (X["pert_dose"] - min(X["pert_dose"])) / (max(X["pert_dose"]) - min(X["pert_dose"]))
#X.loc[:,"dose"] = X["pert_dose"]
#X.drop("pert_dose", axis=1)

# Rename pert_dose
X.loc[:,"dose"] = X.loc[:, "pert_dose"]
#X.drop("pert_dose", axis=1)

if encourage: print("Victory is within our grasp! Fight so that the algorithms back home may live yet another day!")

Victory is within our grasp! Fight so that the algorithms back home may live yet another day!


In [6]:
# Convert all doses to same unit (mg/ml) and then normalise
#X.loc[:,"pert_dose"] = [standardize_dose_unit(dose, unit) for dose, unit in zip(X["pert_dose"], X["pert_dose_unit"])]
#X.loc[:,"pert_dose"] = (X["pert_dose"] - min(X["pert_dose"])) / (max(X["pert_dose"]) - min(X["pert_dose"]))
#X.loc[:,"dose"] = X["pert_dose"]
#X.drop("pert_dose", axis=1)

In [7]:
# Just for a test, we remove all but a few arbitrary numerical features
cell_lineage_columns = ["cell_lineage_" + i for i in ["breast", "haematopoietic_and_lymphoid_tissue", "kidney", "large_intestine", "liver", "lung", "prostate", "skin"]]
growth_pattern_columns = ["growth_pattern_" + i for i in ['adherent', 'mix', 'suspension', 'unknown']]
X = X[["dose", "pert_time", "cell_type_tumor", "cell_lineage_skin", "doubling_time"] + cell_lineage_columns + growth_pattern_columns]

if encourage: print("Onwards! Victory is on the horison!")

Onwards! Victory is on the horison!


In [8]:
# Deal with Na's. Very naive at the moment.
X = X.fillna(0)

if encourage: print("All but one foul cell remains! Slay it!")

All but one foul cell remains! Slay it!


  X = X.fillna(0)


In [9]:
# Train, val, and test split
X_train, X_intermed, y_train, y_intermed = train_test_split(X, y, train_size=0.7)
X_valid, X_test, y_valid, y_test = train_test_split(X_intermed, y_intermed, train_size=0.5)

if encourage: print("The cells have fallen! Ride now, ride now! Ride to the next Notebook!")

The cells have fallen! Ride now, ride now! Ride to the next Notebook!


# Inspecting the data

In [10]:
######### COMMENTS AND NOTES #########
#sum(X_raw["pert_type"] == "trt_aby") # remove "trt_aby"?

######### COMMENTS AND NOTES #########


# build_name is in siginfo_beta.txt, but I can't find any information on it i the netadatafiles.
#keep_columns = ["pert_dose", "pert_time", "pert_type", "cell_iname", "donor_age", "cell_lineage"]
#drop_columns = ["cids", "bead_batch", "nearest_dose", "pert_dose_unit", "pert_idose", "pert_itime", "pert_time_unit", "cell_mfc_name", "pert_mfc_id", "nsample", "cc_q75", "ss_ngene", "tas", "wt", "median_recall_rank_spearman", "median_recall_rank_wtcs_50", "median_recall_score_spearman", "median_recall_score_wtcs_50", "batch_effect_tstat", "is_hiq", "qc_pass", "qc_pass", "det_wells", "det_plates", "distil_ids", "project_code", "pct_self_rank_q25", "batch_effect_tstat", "pert_id", "batch_effect_tstat_pct", "sig_id", "cell_iname", "build_name", "cmap_name_x", "is_exemplar_sig", "is_ncs_sig", "is_null_sig", "cmap_name_y"]

#drop_columns += ["moa", "target"] #These could be interesting, but there is some issue (moa: all nan, )

#X_raw.drop(drop_columns + keep_columns, axis=1)

#set(X["doubling_time"])
#[(float(val) if type(val) == str else float(val.removeprefix(">"))) for val in X["doubling_time"]]

#type(X["doubling_time"][0])