In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import display

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNet
from sklearn.multiclass import OneVsOneClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, mean_squared_error, \
roc_auc_score

In [9]:
cell_line_df = pd.read_csv('data/cell_line.csv')

In [10]:
drug_df = pd.read_csv('data/drug_df_quantized.csv')

In [11]:
# for recording accuracy, mse, auc
result_df = pd.DataFrame({
    'drug_id': drug_df['drug_id'].unique(),
})
result_df['accuracy'] = np.nan
result_df['mse'] = np.nan
result_df['auc'] = np.nan

In [12]:
display(drug_df.head())
display(drug_df.sample(n=5))

Unnamed: 0,drug_id,cell_line_id,log_IC50,sensitivity
0,1,683665,2.44,resistant
1,1,684055,3.34,resistant
2,1,684057,3.57,resistant
3,1,684059,3.19,resistant
4,1,684062,2.46,resistant


Unnamed: 0,drug_id,cell_line_id,log_IC50,sensitivity
35157,165,1298531,6.09,intermediate
84670,255,1333014,4.87,resistant
64160,207,1290907,1.7,intermediate
102849,279,930301,4.16,intermediate
188636,1129,946355,3.36,intermediate


# Part 1. Regression & Classification
Quantize `log_IC50` into 3 bins: `sensitive, intermediate, resistant`. Call this column `sensitivity`.

Reduce data dimensions using `PCA`. Use `n_components` that explains 80% of the variance.

I used `Elastic Net` and `K-Nearest Neighbors`.

In [7]:
drug_df['predicted_log_IC50'] = np.nan
drug_df['predicted_sensitivity'] = np.nan

## Fit Models

In [15]:
for drug_id in drug_df['drug_id'].unique():
    idx1 = drug_df.loc[drug_df['drug_id'] == drug_id, 'cell_line_id']
    idx2 = cell_line_df.loc[cell_line_df['cell_line_id'].isin(idx1), 
                            'cell_line_id']
    final_idx = set(idx1).intersection(set(idx2))
    select_drug_df = ((drug_df['drug_id'] == drug_id) &
                      (drug_df['cell_line_id'].isin(final_idx)))

    X = cell_line_df.loc[cell_line_df['cell_line_id'].isin(final_idx)].copy()
    # if duplicated cell lines, keep the first one
    X.drop_duplicates(subset=['cell_line_id'], inplace=True)
    X.drop(columns='cell_line_id', inplace=True)
    
    # PCA, keep n_components that explain 80% of the variance
    pca = PCA(n_components=0.8)
    X = pca.fit_transform(X)
    
    # regression
    y = drug_df.loc[select_drug_df, 'log_IC50']
    
    elastic_net = ElasticNet()
    y_pred = cross_val_predict(elastic_net, X, y, cv=5)
    drug_df.loc[select_drug_df, 'predicted_log_IC50'] = y_pred
    
    mse = mean_squared_error(y, y_pred)
    result_df.loc[result_df['drug_id'] == drug_id, 'mse'] = mse
    
    # classification
    sensitivity = drug_df.loc[select_drug_df, 'sensitivity']
    label_encoder = preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(sensitivity)
    
    knn = KNeighborsClassifier()
    y_pred = cross_val_predict(knn, X, y, cv=5)
    y_pred_proba = cross_val_predict(knn, X, y, cv=5,
                                     method='predict_proba')
    
    predicted_sensitivity = label_encoder.inverse_transform(y_pred)
    drug_df.loc[select_drug_df, 'predicted_sensitivity'] = predicted_sensitivity
    
    accuracy = accuracy_score(y, y_pred)
    result_df.loc[result_df['drug_id'] == drug_id, 'accuracy'] = accuracy
    
    auc = roc_auc_score(y, y_pred_proba, multi_class='ovo')
    result_df.loc[result_df['drug_id'] == drug_id, 'auc'] = auc
    
    print('Drug id: {}, MSE: {}, accuracy: {}, AUC: {}'.format(
        drug_id, mse, accuracy, auc))

Drug id: 1, MSE: 1.1486096282775498, accuracy: 0.601078167115903, AUC: 0.5733642678674391
Drug id: 3, MSE: 4.485933707441746, accuracy: 0.5234159779614325, AUC: 0.6354409647387855
Drug id: 5, MSE: 2.273761038102191, accuracy: 0.6090225563909775, AUC: 0.7250340787767865
Drug id: 6, MSE: 0.8051275969950313, accuracy: 0.7076167076167076, AUC: 0.5345550739418664
Drug id: 9, MSE: 2.057611450853563, accuracy: 0.5760598503740648, AUC: 0.5053938554990415
Drug id: 11, MSE: 3.5238672371596995, accuracy: 0.5, AUC: 0.576338684930494
Drug id: 17, MSE: 0.7874540705920942, accuracy: 0.45591939546599497, AUC: 0.5987245844842661
Drug id: 29, MSE: 2.1051437528663177, accuracy: 0.5781637717121588, AUC: 0.7322242806758211
Drug id: 30, MSE: 1.6404685703367836, accuracy: 0.551980198019802, AUC: 0.6860652906427555
Drug id: 32, MSE: 3.8167783890852798, accuracy: 0.4746192893401015, AUC: 0.6214900008537158
Drug id: 34, MSE: 0.8933498979194839, accuracy: 0.7321867321867321, AUC: 0.6357868881379852
Drug id: 35, 



Drug id: 186, MSE: 1.090374165073956, accuracy: 0.6686046511627907, AUC: 0.637772287159344
Drug id: 190, MSE: 4.731712081209839, accuracy: 0.47337962962962965, AUC: 0.5752567673401968
Drug id: 192, MSE: 0.5171289545949117, accuracy: 0.7508610792192881, AUC: 0.5723677883001653




Drug id: 193, MSE: 0.4569985769511103, accuracy: 0.8905742145178764, AUC: 0.5660387378944081
Drug id: 194, MSE: 2.2514350376460732, accuracy: 0.6624129930394431, AUC: 0.6388298052723498
Drug id: 196, MSE: 2.9819189309065677, accuracy: 0.6637458926615553, AUC: 0.6731949515353582
Drug id: 197, MSE: 0.7160250584696403, accuracy: 0.6048387096774194, AUC: 0.5636377320769476
Drug id: 199, MSE: 1.534167791555643, accuracy: 0.6570771001150748, AUC: 0.589149238035382
Drug id: 200, MSE: 1.2923501725913882, accuracy: 0.6506300114547537, AUC: 0.5593288291079553
Drug id: 201, MSE: 3.1347906356052833, accuracy: 0.5458715596330275, AUC: 0.5771217115280999
Drug id: 202, MSE: 0.4783751741320024, accuracy: 0.75, AUC: 0.5848029671648373
Drug id: 203, MSE: 0.9839087140814653, accuracy: 0.70995670995671, AUC: 0.7053327118192604
Drug id: 204, MSE: 2.589950229746651, accuracy: 0.7333333333333333, AUC: 0.5780939902194845
Drug id: 205, MSE: 0.37431650921383003, accuracy: 0.7118093174431203, AUC: 0.764224370725



Drug id: 206, MSE: 0.6138550790681155, accuracy: 0.9068255687973997, AUC: 0.707956670426944
Drug id: 207, MSE: 1.235788438974562, accuracy: 0.6869965477560415, AUC: 0.578731030670021
Drug id: 208, MSE: 2.353623433629554, accuracy: 0.5222101841820151, AUC: 0.6511472700719814
Drug id: 211, MSE: 1.148685918199339, accuracy: 0.721081081081081, AUC: 0.6769694936427051
Drug id: 219, MSE: 2.7890716151635413, accuracy: 0.616304347826087, AUC: 0.7059267093734175
Drug id: 221, MSE: 0.9597455617342219, accuracy: 0.6497297297297298, AUC: 0.6939150109850488
Drug id: 222, MSE: 2.026722645535821, accuracy: 0.6127982646420824, AUC: 0.7537516469897962
Drug id: 223, MSE: 2.2501640242394245, accuracy: 0.5839653304442037, AUC: 0.7040809739579942
Drug id: 224, MSE: 2.456607010259275, accuracy: 0.602829162132753, AUC: 0.6755177253870434
Drug id: 225, MSE: 2.317881656963912, accuracy: 0.6006493506493507, AUC: 0.6837790014631057
Drug id: 226, MSE: 2.7281758742442004, accuracy: 0.6205357142857143, AUC: 0.69885



Drug id: 263, MSE: 0.6016018652081323, accuracy: 0.6590662323561346, AUC: 0.6537111859665462
Drug id: 265, MSE: 1.1203212481712546, accuracy: 0.5533769063180828, AUC: 0.7158106672404919
Drug id: 266, MSE: 0.23668810921775243, accuracy: 0.6612377850162866, AUC: 0.6218527093343763
Drug id: 268, MSE: 5.1194136698726425, accuracy: 0.7424071991001124, AUC: 0.5908683331399844
Drug id: 269, MSE: 1.835880281039735, accuracy: 0.5720524017467249, AUC: 0.6001914285269416




Drug id: 271, MSE: 0.5883616720894401, accuracy: 0.9041394335511983, AUC: 0.7695320078832624
Drug id: 272, MSE: 2.370324665130394, accuracy: 0.5997807017543859, AUC: 0.6046617582939174
Drug id: 273, MSE: 2.517114084266248, accuracy: 0.7201327433628318, AUC: 0.6349091042391747
Drug id: 274, MSE: 3.065481744588034, accuracy: 0.6473214285714286, AUC: 0.6429388112282092
Drug id: 275, MSE: 1.8306098877132562, accuracy: 0.5364526659412405, AUC: 0.7150942847613221
Drug id: 276, MSE: 2.44965291181339, accuracy: 0.7270742358078602, AUC: 0.6975279508507356
Drug id: 277, MSE: 0.7280475483710179, accuracy: 0.9631635969664138, AUC: 0.7477592388306674
Drug id: 279, MSE: 0.941072640241882, accuracy: 0.5895765472312704, AUC: 0.720838276322492
Drug id: 281, MSE: 1.015265279638709, accuracy: 0.8706521739130435, AUC: 0.8014580838093482
Drug id: 282, MSE: 2.3451430276264547, accuracy: 0.5845986984815619, AUC: 0.6587048730683024
Drug id: 283, MSE: 2.655814653958714, accuracy: 0.5763813651137595, AUC: 0.637



Drug id: 298, MSE: 0.7724588669233065, accuracy: 0.8167028199566161, AUC: 0.5995890079462532
Drug id: 299, MSE: 4.172586876692939, accuracy: 0.5659760087241004, AUC: 0.6755324237860599
Drug id: 300, MSE: 2.8930686522369196, accuracy: 0.5549510337323177, AUC: 0.6874354597185598
Drug id: 301, MSE: 2.6524266306332143, accuracy: 0.562296858071506, AUC: 0.703996076684973
Drug id: 302, MSE: 3.9215986087540373, accuracy: 0.5601750547045952, AUC: 0.6880588245479818
Drug id: 303, MSE: 1.9705640915262201, accuracy: 0.5787187839305103, AUC: 0.7215375717341126
Drug id: 304, MSE: 1.3434943896533458, accuracy: 0.6032608695652174, AUC: 0.6323671218217418
Drug id: 305, MSE: 1.8894562311958791, accuracy: 0.5624321389793703, AUC: 0.7096907472698879
Drug id: 306, MSE: 1.635636081683011, accuracy: 0.6868905742145178, AUC: 0.7112130134271554
Drug id: 308, MSE: 1.648230362794355, accuracy: 0.8725490196078431, AUC: 0.6820388655594648
Drug id: 309, MSE: 1.343634071324514, accuracy: 0.6395222584147665, AUC: 0.



Drug id: 1029, MSE: 0.8202640334366995, accuracy: 0.9048178613396005, AUC: 0.5232561376162111
Drug id: 1030, MSE: 0.7863684571909529, accuracy: 0.6164705882352941, AUC: 0.5626653667656601
Drug id: 1031, MSE: 2.700628727554397, accuracy: 0.6404230317273796, AUC: 0.6449895596761279
Drug id: 1032, MSE: 1.5192727969793418, accuracy: 0.8225616921269095, AUC: 0.7133172544437842
Drug id: 1033, MSE: 0.5959697345178846, accuracy: 0.596244131455399, AUC: 0.6504942293857011
Drug id: 1036, MSE: 1.3014875079038062, accuracy: 0.6270588235294118, AUC: 0.7518183082806081
Drug id: 1037, MSE: 1.2605225654034549, accuracy: 0.736780258519389, AUC: 0.6518181978387786
Drug id: 1038, MSE: 0.7492483528723161, accuracy: 0.6403301886792453, AUC: 0.5989721807684771
Drug id: 1039, MSE: 0.4838935826327731, accuracy: 0.6284348864994026, AUC: 0.5767494263115421
Drug id: 1042, MSE: 0.7957314325551788, accuracy: 0.525564803804994, AUC: 0.4955789348836461
Drug id: 1043, MSE: 0.5631708824203303, accuracy: 0.693757361601



Drug id: 1143, MSE: 1.2902183842626593, accuracy: 0.594758064516129, AUC: 0.5590542044327591
Drug id: 1149, MSE: 1.1658932695791304, accuracy: 0.8320693391115926, AUC: 0.6235829108002474
Drug id: 1158, MSE: 0.766256369480264, accuracy: 0.6096579476861167, AUC: 0.5912405814207573
Drug id: 1161, MSE: 1.2584134779249936, accuracy: 0.5734406438631791, AUC: 0.5908153931838142
Drug id: 1164, MSE: 0.9648691433148232, accuracy: 0.4919678714859438, AUC: 0.5884622200411674
Drug id: 1166, MSE: 1.6962929609439137, accuracy: 0.5591182364729459, AUC: 0.6059183555632542
Drug id: 1170, MSE: 0.9664937498961313, accuracy: 0.779134295227525, AUC: 0.551909741373377
Drug id: 1175, MSE: 0.8663224120623051, accuracy: 0.5844155844155844, AUC: 0.6270473204797865
Drug id: 1192, MSE: 1.6727742131343633, accuracy: 0.6545064377682404, AUC: 0.6401437567346616
Drug id: 1194, MSE: 0.7345904770185655, accuracy: 0.6180257510729614, AUC: 0.594503435200297
Drug id: 1199, MSE: 0.4961191406640132, accuracy: 0.6255364806866



Drug id: 1230, MSE: 0.4978042042820259, accuracy: 0.7984994640943194, AUC: 0.5203958632970345




Drug id: 1236, MSE: 1.6082416384893128, accuracy: 0.651336898395722, AUC: 0.5605912310018302
Drug id: 1239, MSE: 1.295412603504671, accuracy: 0.6235884567126725, AUC: 0.5878940821190812
Drug id: 1241, MSE: 1.0020063484079142, accuracy: 0.6137105549510338, AUC: 0.634002665028051
Drug id: 1242, MSE: 1.30241694552388, accuracy: 0.8336956521739131, AUC: 0.7421891168479032
Drug id: 1243, MSE: 0.7955847976002508, accuracy: 0.5879828326180258, AUC: 0.638531220913956
Drug id: 1248, MSE: 6.334077754686343, accuracy: 0.5701657458563536, AUC: 0.697458294120138
Drug id: 1259, MSE: 2.683423639545249, accuracy: 0.6169749727965179, AUC: 0.6744992847482923
Drug id: 1261, MSE: 1.6844593961071785, accuracy: 0.6125541125541125, AUC: 0.6085089585258375
Drug id: 1262, MSE: 0.22226279163462095, accuracy: 0.7868852459016393, AUC: 0.6868931517416262
Drug id: 1264, MSE: 0.27734422796081964, accuracy: 0.6608315098468271, AUC: 0.5934796650838344
Drug id: 1268, MSE: 0.7112605752724369, accuracy: 0.643553629469122

In [16]:
drug_df.to_csv('data/drug_df_pca.csv', index=False)
result_df.to_csv('data/result_df_pca.csv', index=False)

In [17]:
drug_df.sample(n=5)

Unnamed: 0,drug_id,cell_line_id,log_IC50,sensitivity,predicted_log_IC50,predicted_sensitivity
167100,1037,910545,1.56,intermediate,1.770903,resistant
46708,182,906865,-3.49,sensitive,-1.382633,sensitive
76077,231,687812,5.41,intermediate,5.369906,resistant
147986,1012,1298531,-0.25,sensitive,1.581076,intermediate
89712,262,753534,4.16,resistant,2.997505,resistant


In [18]:
# top 10 and bottom 10
display(result_df.nsmallest(10, 'mse')[['drug_id', 'mse']])
display(result_df.nlargest(10, 'mse')[['drug_id', 'mse']])

Unnamed: 0,drug_id,mse
249,1262,0.222263
120,266,0.236688
47,150,0.256308
250,1264,0.277344
33,91,0.335878
163,341,0.346542
88,205,0.374317
261,1502,0.384483
155,312,0.439273
183,1018,0.440379


Unnamed: 0,drug_id,mse
43,135,6.373095
246,1248,6.334078
121,268,5.119414
166,346,4.779838
76,190,4.731712
1,3,4.485934
16,51,4.451908
144,299,4.172587
164,344,3.965724
147,302,3.921599
