In [53]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import display

from sklearn import preprocessing
from sklearn.linear_model import ElasticNet
from sklearn.multiclass import OneVsOneClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, mean_squared_error, \
roc_auc_score

In [2]:
# Cell-line table; later cells read its `cell_line_id` column and use the
# remaining columns as model features.
cell_line_df = pd.read_csv(filepath_or_buffer='data/cell_line.csv')

In [34]:
# Drug-response table; later cells read its `drug_id`, `cell_line_id` and
# `log_IC50` columns.
drug_df = pd.read_csv(filepath_or_buffer='data/IC50.csv')

In [35]:
# Per-drug metrics table: one row per distinct drug_id, with the accuracy,
# mse and auc columns starting as NaN until they are filled in.
result_df = (
    pd.DataFrame({'drug_id': drug_df['drug_id'].unique()})
    .assign(accuracy=np.nan, mse=np.nan, auc=np.nan)
)

# Part 1. Regression & Classification
For each drug separately, quantize `log_IC50` into 3 equal-width bins: `sensitive, intermediate, resistant`. Call this column `sensitivity`.

I used `Elastic Net` to regress `log_IC50` and `K-Nearest Neighbors` to classify `sensitivity`.

In [36]:
# Quantization: within each drug, split the observed log_IC50 range into
# three equal-width bins and store the bin label in `sensitivity`.
bin_labels = ['sensitive', 'intermediate', 'resistant']
for current_drug in drug_df['drug_id'].unique():
    rows_for_drug = drug_df['drug_id'] == current_drug
    drug_df.loc[rows_for_drug, 'sensitivity'] = pd.cut(
        drug_df.loc[rows_for_drug, 'log_IC50'],
        3,
        labels=bin_labels,
    )

In [37]:
# Show the first rows, then five randomly sampled rows, of the labelled table.
display(drug_df.head(), drug_df.sample(n=5))

Unnamed: 0,drug_id,cell_line_id,log_IC50,sensitivity
0,1,683665,2.44,resistant
1,1,684055,3.34,resistant
2,1,684057,3.57,resistant
3,1,684059,3.19,resistant
4,1,684062,2.46,resistant


Unnamed: 0,drug_id,cell_line_id,log_IC50,sensitivity
185956,1072,1290768,4.56,resistant
203727,1239,971773,0.11,sensitive
47677,184,907175,3.48,resistant
69966,223,1240208,2.24,resistant
189223,1133,906838,2.46,intermediate


In [38]:
# Placeholders for the cross-validated predictions written by the fitting loop.
drug_df['predicted_log_IC50'] = np.nan
# The predicted labels are strings, so create this column with object dtype.
# A float64 NaN column would have to be upcast on the first string write,
# which recent pandas versions warn about and are slated to reject.
drug_df['predicted_sensitivity'] = pd.Series(np.nan, index=drug_df.index,
                                             dtype=object)

## Fit Models

In [None]:
# Per-drug 5-fold cross-validation: ElasticNet regression on `log_IC50` and
# KNN classification on the quantized `sensitivity` label. Predictions are
# written back into drug_df and the metrics into result_df.

# One feature row per cell line, keyed by cell_line_id. If a cell line is
# duplicated, the first occurrence is kept. This does not depend on the drug,
# so it is built once instead of on every iteration.
cell_line_features = (cell_line_df
                      .drop_duplicates(subset=['cell_line_id'])
                      .set_index('cell_line_id'))

for drug_id in drug_df['drug_id'].unique():
    # measurements of this drug on cell lines for which features exist
    select_drug_df = ((drug_df['drug_id'] == drug_id) &
                      (drug_df['cell_line_id'].isin(cell_line_features.index)))
    select_result = result_df['drug_id'] == drug_id

    # Look the features up by cell_line_id, in the order of the drug's
    # measurements, so that row i of X and element i of y describe the same
    # cell line. Taking rows in cell_line_df order would pair them correctly
    # only if both tables happened to be sorted identically.
    measured_cell_lines = drug_df.loc[select_drug_df, 'cell_line_id']
    X = cell_line_features.loc[measured_cell_lines.to_numpy()]

    # regression
    y = drug_df.loc[select_drug_df, 'log_IC50']
    assert len(X) == len(y), 'features and targets must be row-aligned'

    elastic_net = ElasticNet()
    y_pred = cross_val_predict(elastic_net, X, y, cv=5)
    drug_df.loc[select_drug_df, 'predicted_log_IC50'] = y_pred

    mse = mean_squared_error(y, y_pred)
    result_df.loc[select_result, 'mse'] = mse

    # classification
    sensitivity = drug_df.loc[select_drug_df, 'sensitivity']
    label_encoder = preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(sensitivity)

    knn = KNeighborsClassifier()
    y_pred = cross_val_predict(knn, X, y, cv=5)
    y_pred_proba = cross_val_predict(knn, X, y, cv=5,
                                     method='predict_proba')

    predicted_sensitivity = label_encoder.inverse_transform(y_pred)
    drug_df.loc[select_drug_df, 'predicted_sensitivity'] = predicted_sensitivity

    accuracy = accuracy_score(y, y_pred)
    result_df.loc[select_result, 'accuracy'] = accuracy

    # Equal-width binning (or the feature filter above) can leave a drug with
    # fewer than three classes; one-vs-one AUC only applies to the multiclass
    # case, so pick the scoring form that matches the classes present.
    n_classes = len(label_encoder.classes_)
    if n_classes > 2:
        auc = roc_auc_score(y, y_pred_proba, multi_class='ovo')
    elif n_classes == 2:
        # binary AUC takes the score of the greater label as a 1-D array
        auc = roc_auc_score(y, y_pred_proba[:, 1])
    else:
        # AUC is undefined when only one class is present
        auc = np.nan
    result_df.loc[select_result, 'auc'] = auc

    print('Drug id: {}, MSE: {}, accuracy: {}, AUC: {}'.format(
        drug_id, mse, accuracy, auc))

Drug id: 1, MSE: 1.2434051506334174, accuracy: 0.5849056603773585, AUC: 0.589435836264589
Drug id: 3, MSE: 4.19385051817043, accuracy: 0.5234159779614325, AUC: 0.6317323829430367
Drug id: 5, MSE: 2.3277872671269546, accuracy: 0.6240601503759399, AUC: 0.7234948143064135
Drug id: 6, MSE: 0.8329708434006754, accuracy: 0.7125307125307125, AUC: 0.5429447135815061
Drug id: 9, MSE: 2.1374331692525126, accuracy: 0.5536159600997507, AUC: 0.5124224955337695
Drug id: 11, MSE: 3.3678002010541412, accuracy: 0.505, AUC: 0.5732153024253179
Drug id: 17, MSE: 0.8193400540655413, accuracy: 0.47103274559193953, AUC: 0.5953066840896492


In [None]:
# Write both tables to CSV, leaving out the row index.
for frame, csv_path in ((drug_df, 'data/drug_df.csv'),
                        (result_df, 'data/result_df.csv')):
    frame.to_csv(csv_path, index=False)

In [None]:
# Spot-check five randomly sampled rows of drug_df.
drug_df.sample(5)

In [None]:
# The 10 drugs with the lowest MSE, followed by the 10 with the highest.
mse_by_drug = result_df[['drug_id', 'mse']]
display(mse_by_drug.nsmallest(10, 'mse'),
        mse_by_drug.nlargest(10, 'mse'))