In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, roc_auc_score

In [3]:
cell_line_df = pd.read_csv('data/cell_line.csv')

In [4]:
drug_df = pd.read_csv('data/IC50.csv')

# Part 1. Predict Drug Response

## Regression

In [7]:
# include only cell lines that appear in both df
idx1 = drug_df.loc[drug_df['drug_id'] == 1, 'cell_line_id']
idx2 = cell_line_df.loc[cell_line_df['cell_line_id'].isin(idx1), 
                        'cell_line_id']
final_idx = set(idx1).intersection(set(idx2))

X = cell_line_df.loc[cell_line_df['cell_line_id'].isin(final_idx)].copy()
X.drop_duplicates(subset=['cell_line_id'])
X.drop(columns='cell_line_id', inplace=True)
y = drug_df.loc[(drug_df['drug_id'] == 1) &
                (drug_df['cell_line_id'].isin(final_idx)),
                'log_IC50']

In [8]:
drug_df['predicted_log_IC50'] = np.nan
drug_df[drug_df['drug_id'] == 1].head()

Unnamed: 0,drug_id,cell_line_id,log_IC50,predicted_log_IC50
0,1,683665,2.44,
1,1,684055,3.34,
2,1,684057,3.57,
3,1,684059,3.19,
4,1,684062,2.46,


In [None]:
elastic_net = ElasticNet()
y_pred = cross_val_predict(elastic_net, X, y, cv=5)

In [None]:
mean_squared_error(y, y_pred)

In [None]:
def get_mean_squared_error_for_drug_idx(drug_id):
    global drug_df, cell_line_df
    # include only cell lines that appear in both df
    idx1 = drug_df.loc[drug_df['drug_id'] == drug_id, 'cell_line_id']
    idx2 = cell_line_df.loc[cell_line_df['cell_line_id'].isin(idx1), 
                            'cell_line_id']
    final_idx = set(idx1).intersection(set(idx2))

    X = cell_line_df.loc[cell_line_df['cell_line_id'].isin(final_idx)].copy()
    # if duplicated cell lines, keep the first one
    X.drop_duplicates(subset=['cell_line_id'], inplace=True)
    X.drop(columns='cell_line_id', inplace=True)
    y = drug_df.loc[(drug_df['drug_id'] == drug_id) &
                    (drug_df['cell_line_id'].isin(final_idx)),
                    'log_IC50']
    
    elastic_net = ElasticNet()
    y_pred = cross_val_predict(elastic_net, X, y, cv=5)
    return mean_squared_error(y, y_pred)

In [None]:
get_mean_squared_error_for_drug_idx(1)

In [None]:
get_mean_squared_error_for_drug_idx(1026)

## Classification

In [None]:
pd.cut(y, 3, labels=['sensitive', 'intermediate', 'resistant']).head()

# RACS

In [38]:
racs_df = pd.read_csv('data/racs_df.csv')

In [39]:
racs_df.head()

Unnamed: 0,cell_line_id,cancer_type,alteration_type,region_identifier
0,684681,LUAD,Amplification,cnaLUAD21 (FOXA2)
1,684681,LUAD,Amplification,cnaLUAD22 (ASXL1)
2,684681,LUAD,Amplification,cnaLUAD23 (ARFGAP1)
3,684681,LUAD,Amplification,cnaLUAD27 (MYC)
4,687448,SKCM,Deletion,cnaSKCM3


In [40]:
cancer_df = racs_df[['cell_line_id', 'cancer_type']].drop_duplicates()

In [45]:
pd.get_dummies(cancer_df['cancer_type'], prefix='cancer')

Unnamed: 0,cancer_BLCA,cancer_BRCA,cancer_CESC,cancer_COAD/READ,cancer_DLBC,cancer_ESCA,cancer_GBM,cancer_HNSC,cancer_KIRC,cancer_LAML,...,cancer_LUAD,cancer_LUSC,cancer_MESO,cancer_OV,cancer_PAAD,cancer_PRAD,cancer_SKCM,cancer_STAD,cancer_THCA,cancer_UCEC
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
joined = pd.merge(cell_line_df, cancer_df,
                  on='cell_line_id', how='left')

KeyError: 'cell_line_id'

In [None]:
joined[['cell_line_id', 'TSPAN6', 'cancer_type']].head()

In [41]:
idx = cancer_df['cell_line_id'].unique()
cell_line_df.loc[cell_line_df['cell_line_id'].isin(idx), 'cancer_type'] = \
cancer_df['cancer_type']

In [42]:
cancer_df[cancer_df['cell_line_id'] == 1287381]

Unnamed: 0,cell_line_id,cancer_type
1888,1287381,LUAD


In [44]:
cell_line_df.loc[cell_line_df['cell_line_id'] == 1287381, 'cancer_type']

809    COAD/READ
Name: cancer_type, dtype: object

In [27]:
enc.fit_transform(racs_df.drop(columns='cell_line_id'))

array([[ 12.,   1., 288.],
       [ 12.,   0., 310.],
       [ 12.,   0., 296.],
       ...,
       [  1.,   0.,  66.],
       [  1.,   0.,  67.],
       [  1.,   0.,  68.]])

In [18]:
cell_line_df.loc[[0]]

Unnamed: 0,cell_line_id,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,...,LINC00526,PPY2,Unnamed: 17730,Unnamed: 17731,KRT18P55,Unnamed: 17733,POLRMTP1,UBL5P2,TBC1D3P5,Unnamed: 17737
0,683665,3.238273,2.982254,10.235491,4.856061,4.07887,9.116236,3.65859,6.145475,5.042464,...,5.866047,3.095716,3.502513,8.564318,3.274367,4.018073,3.056214,9.446305,3.530871,6.134269


In [None]:
le_cancer.transform()