In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import display

from sklearn import preprocessing
from sklearn.linear_model import ElasticNet
from sklearn.multiclass import OneVsOneClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, mean_squared_error, \
roc_auc_score

In [2]:
# Cell-line table: later cells match rows on its 'cell_line_id' column and use
# every remaining column as a model feature.
cell_line_df = pd.read_csv('data/cell_line.csv')

In [3]:
# Drug-response table: later cells read its 'drug_id', 'cell_line_id' and
# 'log_IC50' columns.
drug_df = pd.read_csv('data/IC50.csv')

In [4]:
# Per-drug score table: one row per distinct drug_id. The accuracy, mse and
# auc columns start out as NaN and are filled in by the model-fitting loop.
result_df = pd.DataFrame({'drug_id': drug_df['drug_id'].unique()}).assign(
    accuracy=np.nan,
    mse=np.nan,
    auc=np.nan,
)

# Part 1. Regression & Classification
For each drug separately, quantize `log_IC50` into 3 equal-width bins: `sensitive`, `intermediate`, `resistant`. Call this column `sensitivity`.

I used `Elastic Net` to regress `log_IC50` and `K-Nearest Neighbors` to classify `sensitivity`.

In [5]:
# Binning is done drug by drug, so the three bin edges are derived from each
# drug's own log_IC50 range rather than from the whole table.
sensitivity_labels = ['sensitive', 'intermediate', 'resistant']
for drug_id in drug_df['drug_id'].unique():
    is_this_drug = drug_df['drug_id'] == drug_id
    drug_df.loc[is_this_drug, 'sensitivity'] = pd.cut(
        drug_df.loc[is_this_drug, 'log_IC50'],
        bins=3,
        labels=sensitivity_labels,
    )

In [6]:
# Show the first rows and five random rows to check the new column.
display(drug_df.head(), drug_df.sample(n=5))

Unnamed: 0,drug_id,cell_line_id,log_IC50,sensitivity
0,1,683665,2.44,resistant
1,1,684055,3.34,resistant
2,1,684057,3.57,resistant
3,1,684059,3.19,resistant
4,1,684062,2.46,resistant


Unnamed: 0,drug_id,cell_line_id,log_IC50,sensitivity
116391,299,909747,3.63,resistant
185815,1072,946356,3.84,resistant
68574,222,907311,0.16,intermediate
199758,1218,909720,2.04,resistant
114798,295,1298539,1.91,intermediate


In [7]:
# Placeholders that the model-fitting loop fills in drug by drug; rows that
# are never selected for a model keep NaN.
drug_df['predicted_log_IC50'] = np.nan
# The predicted labels are strings, so this column is created with object
# dtype. Writing strings into a float64 column only works through silent
# upcasting, which recent pandas versions deprecate.
drug_df['predicted_sensitivity'] = pd.Series(np.nan, index=drug_df.index,
                                             dtype=object)

## Fit Models

In [8]:
# Feature table keyed by cell line. If a cell line occurs more than once,
# only its first row is kept. The table does not depend on the drug, so it is
# built once instead of on every iteration.
cell_line_features = (
    cell_line_df
    .drop_duplicates(subset=['cell_line_id'])
    .set_index('cell_line_id')
)

for drug_id in drug_df['drug_id'].unique():
    # Rows of drug_df for this drug whose cell line has features available.
    select_drug_df = ((drug_df['drug_id'] == drug_id) &
                      (drug_df['cell_line_id'].isin(cell_line_features.index)))
    # Row of result_df that receives this drug's scores.
    result_row = result_df['drug_id'] == drug_id

    # Look the features up by cell_line_id, in the order of the drug's rows,
    # so that row i of X always describes the cell line measured in row i of
    # y. Selecting X and y independently would only pair them correctly when
    # both tables happen to share the same ordering and the drug has no
    # repeated cell line.
    measured_cell_lines = drug_df.loc[select_drug_df, 'cell_line_id']
    X = cell_line_features.loc[measured_cell_lines.to_numpy()]

    # regression: out-of-fold log_IC50 predictions from a default ElasticNet
    y = drug_df.loc[select_drug_df, 'log_IC50']

    elastic_net = ElasticNet()
    y_pred = cross_val_predict(elastic_net, X, y, cv=5)
    drug_df.loc[select_drug_df, 'predicted_log_IC50'] = y_pred

    mse = mean_squared_error(y, y_pred)
    result_df.loc[result_row, 'mse'] = mse

    # classification: the string labels are integer-encoded for the model and
    # decoded again before being written back to drug_df
    sensitivity = drug_df.loc[select_drug_df, 'sensitivity']
    label_encoder = preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(sensitivity)

    knn = KNeighborsClassifier()
    # Hard labels feed the accuracy; class probabilities feed the AUC.
    y_pred = cross_val_predict(knn, X, y, cv=5)
    y_pred_proba = cross_val_predict(knn, X, y, cv=5,
                                     method='predict_proba')

    predicted_sensitivity = label_encoder.inverse_transform(y_pred)
    drug_df.loc[select_drug_df, 'predicted_sensitivity'] = predicted_sensitivity

    accuracy = accuracy_score(y, y_pred)
    result_df.loc[result_row, 'accuracy'] = accuracy

    auc = roc_auc_score(y, y_pred_proba, multi_class='ovo')
    result_df.loc[result_row, 'auc'] = auc

    print('Drug id: {}, MSE: {}, accuracy: {}, AUC: {}'.format(
        drug_id, mse, accuracy, auc))

Drug id: 1, MSE: 1.2434051506334174, accuracy: 0.5849056603773585, AUC: 0.589435836264589
Drug id: 3, MSE: 4.19385051817043, accuracy: 0.5234159779614325, AUC: 0.6317323829430367
Drug id: 5, MSE: 2.3277872671269546, accuracy: 0.6240601503759399, AUC: 0.7234948143064135
Drug id: 6, MSE: 0.8329708434006754, accuracy: 0.7125307125307125, AUC: 0.5429447135815061
Drug id: 9, MSE: 2.1374331692525126, accuracy: 0.5536159600997507, AUC: 0.5124224955337695
Drug id: 11, MSE: 3.3678002010541412, accuracy: 0.505, AUC: 0.5732153024253179
Drug id: 17, MSE: 0.8193400540655413, accuracy: 0.47103274559193953, AUC: 0.5953066840896492
Drug id: 29, MSE: 2.323111876283228, accuracy: 0.56575682382134, AUC: 0.7141350929059529
Drug id: 30, MSE: 1.6631254978630383, accuracy: 0.5420792079207921, AUC: 0.6836857984745309
Drug id: 32, MSE: 3.823082444544791, accuracy: 0.4873096446700508, AUC: 0.6326832821103854
Drug id: 34, MSE: 0.9748457098913061, accuracy: 0.7444717444717445, AUC: 0.6306809700226629
Drug id: 35,



Drug id: 186, MSE: 1.1820366032635583, accuracy: 0.6604651162790698, AUC: 0.6435956989149995
Drug id: 190, MSE: 5.065923143225398, accuracy: 0.4756944444444444, AUC: 0.5779890782747377
Drug id: 192, MSE: 0.5525295461928069, accuracy: 0.7520091848450058, AUC: 0.5640419039496908




Drug id: 193, MSE: 0.4918448207137059, accuracy: 0.8873239436619719, AUC: 0.5568364052900135
Drug id: 194, MSE: 2.4241078117378705, accuracy: 0.648491879350348, AUC: 0.6305937630954891
Drug id: 196, MSE: 3.124878976913285, accuracy: 0.6681270536692223, AUC: 0.6658915290268751
Drug id: 197, MSE: 0.7501786904180424, accuracy: 0.6036866359447005, AUC: 0.5536585777827084
Drug id: 199, MSE: 1.6364528287913862, accuracy: 0.6582278481012658, AUC: 0.554338852832036
Drug id: 200, MSE: 1.3919744355682466, accuracy: 0.6437571592210768, AUC: 0.5566340086059531
Drug id: 201, MSE: 3.3289054817939565, accuracy: 0.5149082568807339, AUC: 0.5664783130695529
Drug id: 202, MSE: 0.49377544009208924, accuracy: 0.7442660550458715, AUC: 0.5892729428703928
Drug id: 203, MSE: 1.0837987380551863, accuracy: 0.7045454545454546, AUC: 0.6864706173168349
Drug id: 204, MSE: 2.696675463941562, accuracy: 0.7333333333333333, AUC: 0.5769755855444911
Drug id: 205, MSE: 0.43781390823531996, accuracy: 0.7280606717226435, AUC



Drug id: 206, MSE: 0.7109354034720119, accuracy: 0.9089924160346695, AUC: 0.6988437147474008
Drug id: 207, MSE: 1.3422705089003597, accuracy: 0.6846950517836594, AUC: 0.5721817048465015
Drug id: 208, MSE: 2.6049390536350745, accuracy: 0.49945828819068255, AUC: 0.6434107066470726
Drug id: 211, MSE: 1.2149091497382398, accuracy: 0.7124324324324325, AUC: 0.6888591706657432
Drug id: 219, MSE: 2.9825329941023693, accuracy: 0.6076086956521739, AUC: 0.6974990477213465
Drug id: 221, MSE: 1.090716875126866, accuracy: 0.6345945945945946, AUC: 0.6869795721675116
Drug id: 222, MSE: 2.274600205460046, accuracy: 0.6225596529284165, AUC: 0.7593759067191886
Drug id: 223, MSE: 2.4155446518959645, accuracy: 0.5872156013001083, AUC: 0.702646229856691
Drug id: 224, MSE: 2.6597284895756883, accuracy: 0.6060935799782372, AUC: 0.6639541727283981
Drug id: 225, MSE: 2.518264767935114, accuracy: 0.5833333333333334, AUC: 0.6765684240162013
Drug id: 226, MSE: 2.9913101978999848, accuracy: 0.6283482142857143, AUC:



Drug id: 263, MSE: 0.6991983402320175, accuracy: 0.6590662323561346, AUC: 0.6489297669302073
Drug id: 265, MSE: 1.2119968301486044, accuracy: 0.5468409586056645, AUC: 0.7104253014691612
Drug id: 266, MSE: 0.29185088585316493, accuracy: 0.6634093376764386, AUC: 0.6111711736879354
Drug id: 268, MSE: 4.938611252726009, accuracy: 0.7390326209223848, AUC: 0.5905534570912488
Drug id: 269, MSE: 1.9319449476542858, accuracy: 0.5622270742358079, AUC: 0.6010669116802917




Drug id: 271, MSE: 0.7081455746460352, accuracy: 0.900871459694989, AUC: 0.77882062514679
Drug id: 272, MSE: 2.398431819863534, accuracy: 0.5997807017543859, AUC: 0.6096301365746746
Drug id: 273, MSE: 2.5720817898214947, accuracy: 0.7190265486725663, AUC: 0.6247563474106232
Drug id: 274, MSE: 3.0937377846164256, accuracy: 0.6529017857142857, AUC: 0.631022517579125
Drug id: 275, MSE: 1.9393195448320344, accuracy: 0.5527747551686616, AUC: 0.7173985476439827
Drug id: 276, MSE: 2.516474834028849, accuracy: 0.7281659388646288, AUC: 0.6812070053367522
Drug id: 277, MSE: 0.840576262742508, accuracy: 0.9642470205850487, AUC: 0.7787291436845009
Drug id: 279, MSE: 1.0509824881176648, accuracy: 0.5722041259500543, AUC: 0.7138455676139722
Drug id: 281, MSE: 1.1350850860237243, accuracy: 0.8673913043478261, AUC: 0.8006859990172593
Drug id: 282, MSE: 2.4948418486628006, accuracy: 0.5802603036876356, AUC: 0.6504720953689048
Drug id: 283, MSE: 2.829333019566411, accuracy: 0.5677139761646804, AUC: 0.62



Drug id: 298, MSE: 0.8616402758165468, accuracy: 0.8167028199566161, AUC: 0.6630409103508099
Drug id: 299, MSE: 4.142697529687634, accuracy: 0.559432933478735, AUC: 0.6649971247501676
Drug id: 300, MSE: 3.1463374806177185, accuracy: 0.5603917301414582, AUC: 0.6890372000165188
Drug id: 301, MSE: 2.8842613726713906, accuracy: 0.5384615384615384, AUC: 0.7065030112152898
Drug id: 302, MSE: 4.309938211769341, accuracy: 0.5415754923413567, AUC: 0.6766026091976821
Drug id: 303, MSE: 2.0951151830277186, accuracy: 0.5808903365906624, AUC: 0.7102475223937489
Drug id: 304, MSE: 1.4018267630999581, accuracy: 0.6141304347826086, AUC: 0.6379517963204404
Drug id: 305, MSE: 2.033607718755732, accuracy: 0.5602605863192183, AUC: 0.7127996496926657
Drug id: 306, MSE: 1.839523069791504, accuracy: 0.6836403033586133, AUC: 0.7043770046802958
Drug id: 308, MSE: 1.8637276213619984, accuracy: 0.8714596949891068, AUC: 0.6875165512805962
Drug id: 309, MSE: 1.4723994064721764, accuracy: 0.6351791530944625, AUC: 0



Drug id: 1029, MSE: 0.8360246249336912, accuracy: 0.9095182138660399, AUC: 0.5144927536231885
Drug id: 1030, MSE: 0.890590068936905, accuracy: 0.6129411764705882, AUC: 0.5568952948729259
Drug id: 1031, MSE: 2.8610302482166983, accuracy: 0.6345475910693302, AUC: 0.6504718155888867
Drug id: 1032, MSE: 1.6568051694144854, accuracy: 0.8143360752056404, AUC: 0.7125234870723552
Drug id: 1033, MSE: 0.6824335287628017, accuracy: 0.5938967136150235, AUC: 0.6424962020509454
Drug id: 1036, MSE: 1.4706378597765508, accuracy: 0.6294117647058823, AUC: 0.7458161202811663
Drug id: 1037, MSE: 1.4662138163591663, accuracy: 0.7403055229142186, AUC: 0.6470196734600692
Drug id: 1038, MSE: 0.8225351308839343, accuracy: 0.6367924528301887, AUC: 0.6031861650009799
Drug id: 1039, MSE: 0.5466482152113546, accuracy: 0.6140979689366786, AUC: 0.5708171189278541
Drug id: 1042, MSE: 0.7899940916616204, accuracy: 0.5410225921521997, AUC: 0.5109834135092176
Drug id: 1043, MSE: 0.5997150233074898, accuracy: 0.699646643



Drug id: 1143, MSE: 1.5238724126436267, accuracy: 0.5967741935483871, AUC: 0.560621064830831
Drug id: 1149, MSE: 1.2923477846512008, accuracy: 0.8364030335861322, AUC: 0.6194752855870946
Drug id: 1158, MSE: 0.8580297667778376, accuracy: 0.613682092555332, AUC: 0.6024926656854205
Drug id: 1161, MSE: 1.2278539049067494, accuracy: 0.5915492957746479, AUC: 0.6024805603752972
Drug id: 1164, MSE: 1.0689802941156532, accuracy: 0.4899598393574297, AUC: 0.5805140278824489
Drug id: 1166, MSE: 1.5244194268900757, accuracy: 0.5551102204408818, AUC: 0.5888319507798115
Drug id: 1170, MSE: 1.1392731276715293, accuracy: 0.7780244173140954, AUC: 0.5599514350284404
Drug id: 1175, MSE: 0.9605379231372597, accuracy: 0.5746753246753247, AUC: 0.6167156252348321
Drug id: 1192, MSE: 1.7990342927585983, accuracy: 0.6416309012875536, AUC: 0.6453218096167735
Drug id: 1194, MSE: 0.7711881418311776, accuracy: 0.6148068669527897, AUC: 0.6029592142432527
Drug id: 1199, MSE: 0.555940380072758, accuracy: 0.61051502145



Drug id: 1230, MSE: 0.5179806265857105, accuracy: 0.8038585209003215, AUC: 0.5213738785014725




Drug id: 1236, MSE: 1.7338424809659532, accuracy: 0.6556149732620321, AUC: 0.5623409635465019
Drug id: 1239, MSE: 1.522642775947801, accuracy: 0.6373902132998746, AUC: 0.5871126604241247
Drug id: 1241, MSE: 1.1233231134958082, accuracy: 0.6147986942328618, AUC: 0.6454272874153494
Drug id: 1242, MSE: 1.618182971472853, accuracy: 0.8402173913043478, AUC: 0.7302012671446764
Drug id: 1243, MSE: 0.852131470269239, accuracy: 0.5890557939914163, AUC: 0.6296529408343987
Drug id: 1248, MSE: 7.180333758012962, accuracy: 0.5580110497237569, AUC: 0.6900571329605203
Drug id: 1259, MSE: 2.717849185165862, accuracy: 0.6082698585418934, AUC: 0.6736471649916705
Drug id: 1261, MSE: 1.896544260355452, accuracy: 0.5984848484848485, AUC: 0.60777178895691
Drug id: 1262, MSE: 0.26901036599634526, accuracy: 0.7868852459016393, AUC: 0.6842629600463574
Drug id: 1264, MSE: 0.32176674021224205, accuracy: 0.6652078774617067, AUC: 0.5935987824508868
Drug id: 1268, MSE: 0.7881451449898345, accuracy: 0.64680390032502

In [9]:
# Persist the per-row predictions and the per-drug scores, without the index.
for frame, csv_path in ((drug_df, 'data/drug_df.csv'),
                        (result_df, 'data/result_df.csv')):
    frame.to_csv(csv_path, index=False)

See `results_analysis.ipynb` for the analysis report.

In [10]:
# Spot-check five random rows, including the two prediction columns.
drug_df.sample(n=5)

Unnamed: 0,drug_id,cell_line_id,log_IC50,sensitivity,predicted_log_IC50,predicted_sensitivity
77550,235,949155,0.68,intermediate,1.285906,intermediate
190556,1143,905991,2.95,intermediate,3.284626,resistant
48795,185,909784,3.57,resistant,2.135765,resistant
173501,1049,1299067,1.98,intermediate,2.540685,resistant
72323,226,905948,4.29,resistant,2.572855,resistant


In [11]:
# The ten drugs with the lowest MSE, followed by the ten with the highest.
mse_by_drug = result_df[['drug_id', 'mse']]
display(mse_by_drug.nsmallest(10, 'mse'), mse_by_drug.nlargest(10, 'mse'))

Unnamed: 0,drug_id,mse
249,1262,0.26901
120,266,0.291851
47,150,0.301788
250,1264,0.321767
33,91,0.381167
163,341,0.402388
261,1502,0.425869
88,205,0.437814
78,193,0.491845
85,202,0.493775


Unnamed: 0,drug_id,mse
246,1248,7.180334
43,135,6.31571
76,190,5.065923
121,268,4.938611
166,346,4.889768
16,51,4.356716
147,302,4.309938
164,344,4.27024
1,3,4.193851
144,299,4.142698


In [None]:
# The ten drugs with the lowest MSE, followed by the ten with the highest.
# NOTE(review): this unexecuted cell repeats the MSE ranking shown by the cell
# above -- presumably a leftover copy. Confirm whether it was meant to rank
# by 'accuracy' or 'auc' instead, or delete it.
for rank_rows in (result_df.nsmallest, result_df.nlargest):
    display(rank_rows(10, 'mse')[['drug_id', 'mse']])