In [26]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import display

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNet
from sklearn.multiclass import OneVsOneClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, mean_squared_error, \
roc_auc_score

In [2]:
cell_line_df = pd.read_csv('data/cell_line.csv')

In [21]:
drug_df = pd.read_csv('data/drug_df_quantized.csv')

In [22]:
display(drug_df.head())
display(drug_df.sample(n=5))

Unnamed: 0,drug_id,cell_line_id,log_IC50,sensitivity
0,1,683665,2.44,resistant
1,1,684055,3.34,resistant
2,1,684057,3.57,resistant
3,1,684059,3.19,resistant
4,1,684062,2.46,resistant


Unnamed: 0,drug_id,cell_line_id,log_IC50,sensitivity
67634,221,908139,4.04,intermediate
77911,238,687997,5.45,resistant
196657,1192,1240207,2.85,resistant
151739,1017,905936,2.39,intermediate
139839,1003,1240144,-2.47,intermediate


In [4]:
# for recording accuracy, mse, auc
result_df = pd.DataFrame({
    'drug_id': drug_df['drug_id'].unique(),
})
result_df['accuracy'] = np.nan
result_df['mse'] = np.nan
result_df['auc'] = np.nan

# Part 2. RACS

Start with using only `Cancer Type`.

Quantize `log_IC50` into 3 bins: `sensitive, intermediate, resistant`. Call this column `sensitivity`.

The setting is similar to Part 1 where I used `PCA(n_components=0.8)`, `Elastic Net` and `K-Nearest Neighbors`.

In [5]:
racs_df = pd.read_csv('data/racs_df.csv')

In [6]:
cancer_df = racs_df[['cell_line_id', 'cancer_type']].drop_duplicates()

In [13]:
cell_line_cancer_df = pd.merge(cell_line_df[['cell_line_id']], 
                               cancer_df, on='cell_line_id', how='left')

In [14]:
display(cell_line_cancer_df.head())
display(cell_line_cancer_df.sample(n=5))

Unnamed: 0,cell_line_id,cancer_type
0,683665,
1,683667,
2,684052,
3,684055,
4,684057,


Unnamed: 0,cell_line_id,cancer_type
255,906817,ESCA
523,909750,LGG
326,907064,
453,908472,LUAD
567,910569,COAD/READ


In [17]:
# one-hot encode the categorical variables
categorical = pd.get_dummies(cell_line_cancer_df, prefix='cancer')
categorical.sample(n=5)

Unnamed: 0,cell_line_id,cancer_BLCA,cancer_BRCA,cancer_CESC,cancer_COAD/READ,cancer_DLBC,cancer_ESCA,cancer_GBM,cancer_HNSC,cancer_KIRC,...,cancer_LUAD,cancer_LUSC,cancer_MESO,cancer_OV,cancer_PAAD,cancer_PRAD,cancer_SKCM,cancer_STAD,cancer_THCA,cancer_UCEC
245,906801,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
831,1290813,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
699,949164,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
571,910691,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
598,910904,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Fit Models

In [24]:
drug_df['predicted_log_IC50'] = np.nan
drug_df['predicted_sensitivity'] = np.nan

In [35]:
for idx, drug_id in enumerate(drug_df['drug_id'].unique()):
    idx1 = drug_df.loc[drug_df['drug_id'] == drug_id, 'cell_line_id']
    idx2 = cell_line_df.loc[cell_line_df['cell_line_id'].isin(idx1), 
                            'cell_line_id']
    final_idx = set(idx1).intersection(set(idx2))
    select_drug_df = ((drug_df['drug_id'] == drug_id) &
                      (drug_df['cell_line_id'].isin(final_idx)))

    X = cell_line_df.loc[cell_line_df['cell_line_id'].isin(final_idx)]
    # if duplicated cell lines, keep the first one
    X = X.drop_duplicates(subset=['cell_line_id'])
    X.drop(columns='cell_line_id', inplace=True)
    
    # PCA, keep n_components that explain 80% of the variance
    pca = PCA(n_components=0.8)
    X = pca.fit_transform(X)
    
    # add categorical dummy variables
    X_categorical = categorical.loc[categorical['cell_line_id'].isin(final_idx)]
    X_categorical = X_categorical.drop_duplicates(subset=['cell_line_id'])
    
    # concat to get train matrix
    X = np.concatenate([X, X_categorical.to_numpy()], axis=1)
    
    # regression
    y = drug_df.loc[select_drug_df, 'log_IC50']
    
    elastic_net = ElasticNet()
    y_pred = cross_val_predict(elastic_net, X, y, cv=5)
    drug_df.loc[select_drug_df, 'predicted_log_IC50'] = y_pred
    
    mse = mean_squared_error(y, y_pred)
    result_df.loc[result_df['drug_id'] == drug_id, 'mse'] = mse
    
    # classification
    sensitivity = drug_df.loc[select_drug_df, 'sensitivity']
    label_encoder = preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(sensitivity)
    
    knn = KNeighborsClassifier()
    y_pred = cross_val_predict(knn, X, y, cv=5)
    y_pred_proba = cross_val_predict(knn, X, y, cv=5,
                                     method='predict_proba')
    
    predicted_sensitivity = label_encoder.inverse_transform(y_pred)
    drug_df.loc[select_drug_df, 'predicted_sensitivity'] = predicted_sensitivity
    
    accuracy = accuracy_score(y, y_pred)
    result_df.loc[result_df['drug_id'] == drug_id, 'accuracy'] = accuracy
    
    auc = roc_auc_score(y, y_pred_proba, multi_class='ovo')
    result_df.loc[result_df['drug_id'] == drug_id, 'auc'] = auc
    
    print('''Model idx: {}, Drug id: {}, Train Dimension: {}, 
MSE: {}, accuracy: {}, AUC: {}'''.format(
        idx, drug_id, X.shape, mse, accuracy, auc))

Model idx: 0, Drug id: 1, Train Dimension: (371, 151), 
MSE: 1.1395215939837433, accuracy: 0.42318059299191374, AUC: 0.484676880554259
Model idx: 1, Drug id: 3, Train Dimension: (363, 151), 
MSE: 4.8794525036595635, accuracy: 0.209366391184573, AUC: 0.37005846812142207
Model idx: 2, Drug id: 5, Train Dimension: (399, 159), 
MSE: 2.3533020007369276, accuracy: 0.41604010025062654, AUC: 0.4272463395104095
Model idx: 3, Drug id: 6, Train Dimension: (407, 161), 
MSE: 0.8104631538753931, accuracy: 0.43734643734643736, AUC: 0.4351478837327894
Model idx: 4, Drug id: 9, Train Dimension: (401, 160), 
MSE: 2.0891157124229256, accuracy: 0.29925187032418954, AUC: 0.4320040131565966
Model idx: 5, Drug id: 11, Train Dimension: (400, 159), 
MSE: 3.5369769975379666, accuracy: 0.265, AUC: 0.4233971088435375
Model idx: 6, Drug id: 17, Train Dimension: (397, 159), 
MSE: 0.7921006315703119, accuracy: 0.3047858942065491, AUC: 0.44690165094358486
Model idx: 7, Drug id: 29, Train Dimension: (403, 160), 
MSE: 

Model idx: 61, Drug id: 167, Train Dimension: (871, 266), 
MSE: 1.399529357170949, accuracy: 0.4535017221584386, AUC: 0.42949159801413633
Model idx: 62, Drug id: 170, Train Dimension: (878, 266), 
MSE: 1.665169351973367, accuracy: 0.35990888382687924, AUC: 0.4435771584346531
Model idx: 63, Drug id: 171, Train Dimension: (878, 267), 
MSE: 0.9924593976107605, accuracy: 0.47494305239179957, AUC: 0.47494128181875217
Model idx: 64, Drug id: 172, Train Dimension: (873, 266), 
MSE: 0.9728309108003816, accuracy: 0.3150057273768614, AUC: 0.45556061582748225
Model idx: 65, Drug id: 173, Train Dimension: (869, 265), 
MSE: 1.3780790032681316, accuracy: 0.42692750287687, AUC: 0.48580044448478454
Model idx: 66, Drug id: 175, Train Dimension: (860, 263), 
MSE: 1.2262705308100188, accuracy: 0.2837209302325581, AUC: 0.47208489849528795
Model idx: 67, Drug id: 176, Train Dimension: (871, 266), 
MSE: 2.1002740822149426, accuracy: 0.3008036739380023, AUC: 0.48388615986504435
Model idx: 68, Drug id: 177, T



Model idx: 75, Drug id: 186, Train Dimension: (860, 264), 
MSE: 1.09196932341869, accuracy: 0.5569767441860465, AUC: 0.47509819759891037
Model idx: 76, Drug id: 190, Train Dimension: (864, 264), 
MSE: 4.735117427732725, accuracy: 0.33101851851851855, AUC: 0.47641021259629485
Model idx: 77, Drug id: 192, Train Dimension: (871, 266), 
MSE: 0.5187077511653695, accuracy: 0.2709529276693456, AUC: 0.5078159775905677




Model idx: 78, Drug id: 193, Train Dimension: (923, 274), 
MSE: 0.45896991324069153, accuracy: 0.581798483206934, AUC: 0.4125315005727377
Model idx: 79, Drug id: 194, Train Dimension: (862, 264), 
MSE: 2.2764780337040205, accuracy: 0.3225058004640371, AUC: 0.48766263840342217
Model idx: 80, Drug id: 196, Train Dimension: (913, 272), 
MSE: 2.8832721481079275, accuracy: 0.26506024096385544, AUC: 0.3982273071065637
Model idx: 81, Drug id: 197, Train Dimension: (868, 265), 
MSE: 0.7192525287304478, accuracy: 0.49193548387096775, AUC: 0.5019513054548747
Model idx: 82, Drug id: 199, Train Dimension: (869, 265), 
MSE: 1.5383414849379362, accuracy: 0.5535097813578826, AUC: 0.570247392187244
Model idx: 83, Drug id: 200, Train Dimension: (873, 266), 
MSE: 1.296679613858464, accuracy: 0.49713631156930127, AUC: 0.4921767781481072
Model idx: 84, Drug id: 201, Train Dimension: (872, 266), 
MSE: 3.1550025054260384, accuracy: 0.4896788990825688, AUC: 0.50013036755921
Model idx: 85, Drug id: 202, Train



Model idx: 89, Drug id: 206, Train Dimension: (923, 275), 
MSE: 0.6133938167548505, accuracy: 0.5687973997833152, AUC: 0.4504437381220972
Model idx: 90, Drug id: 207, Train Dimension: (869, 265), 
MSE: 1.2373729961401763, accuracy: 0.5293440736478712, AUC: 0.5126494356508382
Model idx: 91, Drug id: 208, Train Dimension: (923, 275), 
MSE: 2.3467895134906858, accuracy: 0.3640303358613218, AUC: 0.4750312163103219
Model idx: 92, Drug id: 211, Train Dimension: (925, 275), 
MSE: 1.1514005515082295, accuracy: 0.5005405405405405, AUC: 0.508858701701875
Model idx: 93, Drug id: 219, Train Dimension: (920, 274), 
MSE: 2.764915225410215, accuracy: 0.2826086956521739, AUC: 0.5163489450312325
Model idx: 94, Drug id: 221, Train Dimension: (925, 275), 
MSE: 0.9598585758317527, accuracy: 0.21513513513513513, AUC: 0.4396926926807943
Model idx: 95, Drug id: 222, Train Dimension: (922, 275), 
MSE: 2.0131267285122187, accuracy: 0.3080260303687636, AUC: 0.5009679363288781
Model idx: 96, Drug id: 223, Train 



Model idx: 118, Drug id: 263, Train Dimension: (921, 274), 
MSE: 0.584719461496858, accuracy: 0.4549402823018458, AUC: 0.42540562394339104
Model idx: 119, Drug id: 265, Train Dimension: (918, 273), 
MSE: 1.1222303862088212, accuracy: 0.32461873638344224, AUC: 0.46632176822527693
Model idx: 120, Drug id: 266, Train Dimension: (921, 274), 
MSE: 0.23832624258733037, accuracy: 0.5298588490770901, AUC: 0.4996981157112526
Model idx: 121, Drug id: 268, Train Dimension: (889, 269), 
MSE: 5.10200247592698, accuracy: 0.42069741282339707, AUC: 0.4626852957764565
Model idx: 122, Drug id: 269, Train Dimension: (916, 273), 
MSE: 1.8003235757113716, accuracy: 0.35043668122270744, AUC: 0.4558379328429157




Model idx: 123, Drug id: 271, Train Dimension: (918, 273), 
MSE: 0.5963860963835182, accuracy: 0.5250544662309368, AUC: 0.4335347547713139
Model idx: 124, Drug id: 272, Train Dimension: (912, 272), 
MSE: 2.351755674448027, accuracy: 0.32785087719298245, AUC: 0.4560267384893324
Model idx: 125, Drug id: 273, Train Dimension: (904, 271), 
MSE: 2.482541774643167, accuracy: 0.42035398230088494, AUC: 0.4668079989397456
Model idx: 126, Drug id: 274, Train Dimension: (896, 270), 
MSE: 3.0145866280752296, accuracy: 0.29575892857142855, AUC: 0.46735509832622935
Model idx: 127, Drug id: 275, Train Dimension: (919, 273), 
MSE: 1.7657725284680925, accuracy: 0.19586507072905332, AUC: 0.4044142187105761
Model idx: 128, Drug id: 276, Train Dimension: (916, 273), 
MSE: 2.4236308400507403, accuracy: 0.25, AUC: 0.4682442799531407
Model idx: 129, Drug id: 277, Train Dimension: (923, 274), 
MSE: 0.7351379828997919, accuracy: 0.819068255687974, AUC: 0.5854405106637249
Model idx: 130, Drug id: 279, Train Dim



Model idx: 143, Drug id: 298, Train Dimension: (922, 274), 
MSE: 0.7747805777074426, accuracy: 0.5509761388286334, AUC: 0.5248137305167434
Model idx: 144, Drug id: 299, Train Dimension: (917, 273), 
MSE: 4.157039470439795, accuracy: 0.3707742639040349, AUC: 0.49063715805266733
Model idx: 145, Drug id: 300, Train Dimension: (919, 273), 
MSE: 2.941124492831005, accuracy: 0.2959738846572361, AUC: 0.4428053891547295
Model idx: 146, Drug id: 301, Train Dimension: (923, 274), 
MSE: 2.633723228527604, accuracy: 0.2524377031419285, AUC: 0.4524698056759542
Model idx: 147, Drug id: 302, Train Dimension: (914, 273), 
MSE: 3.8912376397105564, accuracy: 0.2975929978118162, AUC: 0.46501003142987757
Model idx: 148, Drug id: 303, Train Dimension: (921, 274), 
MSE: 1.970228657520238, accuracy: 0.23235613463626492, AUC: 0.44292914500785385
Model idx: 149, Drug id: 304, Train Dimension: (920, 273), 
MSE: 1.3427092807251182, accuracy: 0.47282608695652173, AUC: 0.5047514502031998
Model idx: 150, Drug id: 3



Model idx: 193, Drug id: 1029, Train Dimension: (851, 259), 
MSE: 0.8135393972181258, accuracy: 0.782608695652174, AUC: 0.46655316174605826
Model idx: 194, Drug id: 1030, Train Dimension: (850, 259), 
MSE: 0.787384845651457, accuracy: 0.4235294117647059, AUC: 0.45978993305693594
Model idx: 195, Drug id: 1031, Train Dimension: (851, 259), 
MSE: 2.6980498219358706, accuracy: 0.44418331374853115, AUC: 0.4829151467342842
Model idx: 196, Drug id: 1032, Train Dimension: (851, 259), 
MSE: 1.5084231657803966, accuracy: 0.42303172737955347, AUC: 0.46070000292650426
Model idx: 197, Drug id: 1033, Train Dimension: (852, 260), 
MSE: 0.6069031621883808, accuracy: 0.3568075117370892, AUC: 0.47199674729816027
Model idx: 198, Drug id: 1036, Train Dimension: (850, 259), 
MSE: 1.3178855991766325, accuracy: 0.43411764705882355, AUC: 0.4455587692437315
Model idx: 199, Drug id: 1037, Train Dimension: (851, 259), 
MSE: 1.2630750177997323, accuracy: 0.4571092831962397, AUC: 0.49126508052238815
Model idx: 200



Model idx: 226, Drug id: 1143, Train Dimension: (496, 193), 
MSE: 1.2978781528983256, accuracy: 0.4596774193548387, AUC: 0.4758260972347148
Model idx: 227, Drug id: 1149, Train Dimension: (923, 272), 
MSE: 1.1666505126604512, accuracy: 0.4344528710725894, AUC: 0.4637439889638382
Model idx: 228, Drug id: 1158, Train Dimension: (497, 193), 
MSE: 0.7599272898226314, accuracy: 0.4164989939637827, AUC: 0.45406386699343454
Model idx: 229, Drug id: 1161, Train Dimension: (497, 193), 
MSE: 1.2618802119134882, accuracy: 0.30985915492957744, AUC: 0.4562818370713107
Model idx: 230, Drug id: 1164, Train Dimension: (498, 193), 
MSE: 0.9621807712202904, accuracy: 0.39759036144578314, AUC: 0.5048958108168634
Model idx: 231, Drug id: 1166, Train Dimension: (499, 194), 
MSE: 1.6798702063466717, accuracy: 0.3807615230460922, AUC: 0.45989584335764827
Model idx: 232, Drug id: 1170, Train Dimension: (901, 268), 
MSE: 0.9670070852309564, accuracy: 0.33851276359600446, AUC: 0.5094554613672299
Model idx: 233,



Model idx: 240, Drug id: 1230, Train Dimension: (933, 275), 
MSE: 0.48745314799147554, accuracy: 0.43837084673097537, AUC: 0.4065406479008287




Model idx: 241, Drug id: 1236, Train Dimension: (935, 275), 
MSE: 1.6355743633898419, accuracy: 0.41711229946524064, AUC: 0.47230356400802803
Model idx: 242, Drug id: 1239, Train Dimension: (797, 251), 
MSE: 1.2974973655384376, accuracy: 0.43036386449184444, AUC: 0.48612197362561665
Model idx: 243, Drug id: 1241, Train Dimension: (919, 271), 
MSE: 1.0005133680094629, accuracy: 0.3394994559303591, AUC: 0.4558433154212231
Model idx: 244, Drug id: 1242, Train Dimension: (920, 272), 
MSE: 1.3129601124915586, accuracy: 0.2782608695652174, AUC: 0.48241492728830027
Model idx: 245, Drug id: 1243, Train Dimension: (932, 274), 
MSE: 0.7921919191240471, accuracy: 0.37875536480686695, AUC: 0.43899297502028184
Model idx: 246, Drug id: 1248, Train Dimension: (905, 269), 
MSE: 6.3638031622412665, accuracy: 0.26519337016574585, AUC: 0.4402317711404285
Model idx: 247, Drug id: 1259, Train Dimension: (919, 272), 
MSE: 2.6958274487988882, accuracy: 0.35364526659412404, AUC: 0.49402164658389847
Model idx:

In [36]:
drug_df.to_csv('data/drug_df_racs.csv', index=False)
result_df.to_csv('data/result_df_racs.csv', index=False)

In [37]:
drug_df.sample(n=5)

Unnamed: 0,drug_id,cell_line_id,log_IC50,sensitivity,predicted_log_IC50,predicted_sensitivity
53267,194,907063,-2.14,intermediate,,
183512,1067,906838,2.96,intermediate,3.681913,intermediate
110298,291,906850,3.83,intermediate,3.78412,sensitive
46052,180,910695,-6.29,sensitive,-3.805694,intermediate
175900,1053,949090,1.46,intermediate,1.554461,sensitive


In [38]:
# top 10 and bottom 10
display(result_df.nsmallest(10, 'mse')[['drug_id', 'mse']])
display(result_df.nlargest(10, 'mse')[['drug_id', 'mse']])

Unnamed: 0,drug_id,mse
249,1262,0.223508
120,266,0.238326
47,150,0.257682
250,1264,0.285128
33,91,0.332634
163,341,0.349962
88,205,0.378678
261,1502,0.39421
155,312,0.438978
183,1018,0.450332


Unnamed: 0,drug_id,mse
43,135,6.376384
246,1248,6.363803
121,268,5.102002
1,3,4.879453
166,346,4.736399
76,190,4.735117
16,51,4.572035
144,299,4.157039
164,344,3.978235
21,56,3.930705
