# Dependencies

In [None]:
from tools.utils import gctx
from tools.transform_data import standardize_dose_unit
import pandas as pd
from sklearn.model_selection import train_test_split

# Load and Preprocess data

In [None]:
# Load the expression container and the tab-separated metadata tables.
data = gctx("Data/level5_beta_trt_misc_n8283x12328.gctx")
siginfo = pd.read_csv("Data/siginfo_beta.txt", sep="\t")
compoundinfo = pd.read_csv("Data/compoundinfo_beta.txt", sep="\t")
cellinfo = pd.read_csv("Data/cellinfo_beta.txt", sep="\t")
geneinfo = pd.read_csv("Data/geneinfo_beta.txt", sep="\t")

# Build the raw feature table: one row per signature id found in the
# expression data, enriched through left joins with
#   1. the signature-level metadata          (cids -> sig_id),
#   2. the general perturbagen metadata      (pert_id),
#   3. the cell line metadata                (cell_iname).
X_raw = (
    pd.DataFrame(data.cids, columns=["cids"])
    .merge(siginfo, how="left", left_on="cids", right_on="sig_id")
    .merge(compoundinfo, how="left", left_on="pert_id", right_on="pert_id")
    .merge(cellinfo, how="left", on="cell_iname")
)

# Replace the categorical cell_type by a single boolean feature.
# Per the original note the types are either "normal" or "tumor"; any other
# value (including a missing one) compares unequal and therefore ends up False.
X_raw["cell_type_tumor"] = X_raw["cell_type"] == "tumor"
# DataFrame.drop returns a new frame, so the result has to be assigned back
# (the previous bare call silently left the column in place).
X_raw = X_raw.drop(columns="cell_type")

# Raw labels: the expression matrix transposed, so that columns are gene ids.
y_raw = data.obj.data_df.transpose()

# Keep only the landmark genes. The ids are converted to strings before they
# are used as column labels of y_raw.
is_landmark = geneinfo["feature_space"] == "landmark"
landmark_gene_ids = geneinfo.loc[is_landmark, "gene_id"].astype(str).to_list()
y_raw = y_raw[landmark_gene_ids]

# One-hot encode cell_lineage and growth_pattern
# (the original columns are kept next to the new indicator columns).
X_raw = X_raw.join(pd.get_dummies(X_raw["cell_lineage"], prefix="cell_lineage"))
X_raw = X_raw.join(pd.get_dummies(X_raw["growth_pattern"], prefix="growth_pattern"))

# Clean doubling_time: the censored value ">120" is capped at 120, everything
# else is parsed as a float. Plain column assignment (rather than .loc[:, col])
# replaces the column so that it ends up with a float dtype.
X_raw["doubling_time"] = [
    120.0 if val == ">120" else float(val) for val in X_raw["doubling_time"]
]

# Later steps filter X_raw and y_raw with the same positional boolean mask, so
# both must have the same number of rows. A left join against a table with
# repeated keys would duplicate rows and break this.
# NOTE(review): this only checks the length; it assumes data.cids and the rows
# of y_raw are in the same order - confirm against the gctx helper.
assert len(X_raw) == len(y_raw), (
    f"X_raw has {len(X_raw)} rows but y_raw has {len(y_raw)}; "
    "a metadata join probably duplicated signatures"
)

## THIS IS WHERE WE START THROWING AWAY ROWS AND COLUMNS ##

# Seed for the random train/validation/test split, so that re-running the
# notebook yields the same partition.
SEED = 42

# Remove observations whose dose unit is "uM" (per the original note it does
# not play nicely with the other doses and there are few such observations),
# as well as observations whose dose unit is the string "nan" or is missing.
dose_unit = X_raw["pert_dose_unit"]
ind = ((dose_unit != "uM") & (dose_unit != "nan") & dose_unit.notna()).to_list()

# Work on an explicit copy so that the column assignments below modify X only
# and do not trigger pandas' SettingWithCopyWarning.
X = X_raw[ind].copy()
y = y_raw[ind]

# Convert all doses to a common unit and min-max normalise them to [0, 1].
# NOTE(review): the original comment states the target unit is mg/ml - confirm
# against tools.transform_data.standardize_dose_unit.
X["dose"] = [
    standardize_dose_unit(dose, unit)
    for dose, unit in zip(X["pert_dose"], X["pert_dose_unit"])
]
# Series.min()/max() skip NaN; the builtin min()/max() give order-dependent
# results when NaN is present.
dose_min = X["dose"].min()
dose_max = X["dose"].max()
# If all doses are equal this yields NaN, which is filled with 0 further down.
X["dose"] = (X["dose"] - dose_min) / (dose_max - dose_min)
X = X.drop(columns="pert_dose")

# Just for a test, we remove all but a few arbitrary numerical features
cell_lineage_columns = ["cell_lineage_" + i for i in ["breast", "haematopoietic_and_lymphoid_tissue", "kidney", "large_intestine", "liver", "lung", "prostate", "skin"]]
# Currently not part of the selected features; kept for later experiments.
growth_pattern_columns = ["growth_pattern_" + i for i in ['adherent', 'mix', 'suspension', 'unknown']]
# "cell_lineage_skin" is also contained in cell_lineage_columns; dict.fromkeys
# removes the repetition (keeping first-occurrence order) so the feature is
# not selected twice.
feature_columns = list(dict.fromkeys(
    ["dose", "pert_time", "cell_type_tumor", "cell_lineage_skin", "doubling_time"]
    + cell_lineage_columns
))
X = X[feature_columns]

# Deal with Na's. Very naive at the moment.
X = X.fillna(0)

# Sanity checks before splitting.
assert X.columns.is_unique, "duplicated feature columns"
assert len(X) == len(y), "features and labels are out of sync"

# Train, val, and test split: 70 % train, then the remainder is halved into
# validation and test (15 % each).
X_train, X_intermed, y_train, y_intermed = train_test_split(
    X, y, train_size=0.7, random_state=SEED
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_intermed, y_intermed, train_size=0.5, random_state=SEED
)

  meta_df = meta_df.apply(lambda x: pd.to_numeric(x, errors="ignore"))
  meta_df = meta_df.apply(lambda x: pd.to_numeric(x, errors="ignore"))
  siginfo = pd.read_csv("Data/siginfo_beta.txt", sep="\t")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:,"dose"] = X["pert_dose"]


# Inspecting the data

In [None]:
######### COMMENTS AND NOTES #########
#sum(X_raw["pert_type"] == "trt_aby") # remove "trt_aby"?

######### COMMENTS AND NOTES #########


# build_name is in siginfo_beta.txt, but I can't find any information on it in the metadata files.
#keep_columns = ["pert_dose", "pert_time", "pert_type", "cell_iname", "donor_age", "cell_lineage"]
#drop_columns = ["cids", "bead_batch", "nearest_dose", "pert_dose_unit", "pert_idose", "pert_itime", "pert_time_unit", "cell_mfc_name", "pert_mfc_id", "nsample", "cc_q75", "ss_ngene", "tas", "wt", "median_recall_rank_spearman", "median_recall_rank_wtcs_50", "median_recall_score_spearman", "median_recall_score_wtcs_50", "batch_effect_tstat", "is_hiq", "qc_pass", "qc_pass", "det_wells", "det_plates", "distil_ids", "project_code", "pct_self_rank_q25", "batch_effect_tstat", "pert_id", "batch_effect_tstat_pct", "sig_id", "cell_iname", "build_name", "cmap_name_x", "is_exemplar_sig", "is_ncs_sig", "is_null_sig", "cmap_name_y"]

#drop_columns += ["moa", "target"] #These could be interesting, but there is some issue (moa: all nan, )

#X_raw.drop(drop_columns + keep_columns, axis=1)

#set(X["doubling_time"])
#[(float(val) if type(val) == str else float(val.removeprefix(">"))) for val in X["doubling_time"]]

#type(X["doubling_time"][0])