In [1]:
# -------- #
# PACKAGES #
# -------- #
import time
import pickle
import torch
import torch.nn          as nn
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt 
import seaborn           as sns

from torch.utils.data import Dataset, DataLoader
from my_utils.preprocessing_helpers import convert_column_types

# ---------------- #
# GENERAL SETTINGS #
# ---------------- #
# Single seed shared by every RNG used in this notebook so that
# "Restart Kernel -> Run All" reproduces the same results.
SEED = 42
torch.manual_seed(SEED)
# NumPy's global RNG is separate from torch's and must be seeded as well;
# otherwise any numpy-based sampling/shuffling differs from run to run.
np.random.seed(SEED)
# Apply seaborn's theme with the "white" style to the plots drawn below.
sns.set_theme(style="white")

# ---------------- #
# GLOBAL VARIABLES #
# ---------------- #
from config import (
    # Drug features.
    PATH_TO_SAVED_DRUG_FEATURES,
    DRUG_FPS_FINAL_FILE_NAME,
    # Cell line features.
    PATH_TO_SAVED_CL_FEATURES,
    GENE_EXPR_FINAL_FILE_NAME,
    CNV_GISTIC_FINAL_FILE_NAME,
    CNV_PICNIC_FINAL_FILE_NAME    
)

# Output locations used by this notebook. Both are relative paths, so they
# resolve against the directory the notebook kernel is started from.
PATH_TO_SAVE_DATA_TO = '../../datasets/gdsc/my_datasets/'  # generated datasets
PATH_TO_SAVE_MODELS_TO = '../../datasets/models/'  # trained models

---

## Getting Base Datasets

The goal of this notebook is to build classic, simple ML models that use tabular cell-line and gene data.

- [ ] Ridge Regression
- [ ] Random Forest
- [ ] MOLI (FCN)

In [2]:
# Number of leading columns skipped when counting gene columns.
# NOTE(review): these leading columns are presumably non-gene (metadata)
# columns written by the preprocessing step - confirm against that notebook.
N_LEADING_NON_GENE_COLUMNS = 14


def load_cell_line_features(label, file_name):
    """Read one pickled cell-line feature table and print a short summary.

    The summary reports the table's shape, how long the read took, the number
    of distinct values in the `CELL_LINE_NAME` column and the number of columns
    after the first `N_LEADING_NON_GENE_COLUMNS`.

    Parameters
    ----------
    label : str
        Human-readable dataset name used as the prefix of the printed summary.
    file_name : str
        File name appended to `PATH_TO_SAVED_CL_FEATURES`.

    Returns
    -------
    pandas.DataFrame
        The table exactly as returned by `pd.read_pickle`.
    """
    start = time.time()
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    features = pd.read_pickle(f'{PATH_TO_SAVED_CL_FEATURES}{file_name}')
    elapsed = time.time() - start

    n_cell_lines = len(np.unique(features.CELL_LINE_NAME.values))
    n_gene_columns = len(features.columns[N_LEADING_NON_GENE_COLUMNS:])
    # The trailing "\n" keeps a blank line between consecutive summaries.
    print(
        f"{label}: {features.shape} took {elapsed:.5f} seconds.\n"
        f"    Number of unique cell-lines : {n_cell_lines}\n"
        f"    Number of gene columns      : {n_gene_columns}\n"
    )
    return features


# Gene expression.
gene_expr = load_cell_line_features('Gene Expression', GENE_EXPR_FINAL_FILE_NAME)

# CNV gistic.
cnv_gistic = load_cell_line_features('CNV Gistic', CNV_GISTIC_FINAL_FILE_NAME)

# CNV picnic.
cnv_picnic = load_cell_line_features('CNV Picnic', CNV_PICNIC_FINAL_FILE_NAME)

Gene Expression: (446521, 922) took 4.90298 seconds.
    Number of unique cell-lines : 988
    Number of gene columns      : 908

CNV Gistic: (446521, 952) took 106.16982 seconds.
    Number of unique cell-lines : 988
    Number of gene columns      : 938

CNV Picnic: (446521, 980) took 117.35905 seconds.
    Number of unique cell-lines : 988
    Number of gene columns      : 966

