In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import display

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNet
from sklearn.multiclass import OneVsOneClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, mean_squared_error, \
roc_auc_score

In [9]:
# Cell-line table: one `cell_line_id` column plus the columns used as model
# features by the fitting loop further down.
cell_line_df = pd.read_csv(filepath_or_buffer='data/cell_line.csv')

In [10]:
# Drug-response table: one row per (drug_id, cell_line_id) with the measured
# `log_IC50` and its quantized `sensitivity` label.
drug_df = pd.read_csv(filepath_or_buffer='data/drug_df_quantized.csv')

In [11]:
# Per-drug score table: one row per distinct drug, with the accuracy, mse and
# auc columns starting out as NaN until the fitting loop fills them in.
result_df = (
    pd.DataFrame({'drug_id': drug_df['drug_id'].unique()})
    .assign(accuracy=np.nan, mse=np.nan, auc=np.nan)
)

In [12]:
# Quick look at the response table: the first rows and a random handful.
display(drug_df.head(), drug_df.sample(n=5))

Unnamed: 0,drug_id,cell_line_id,log_IC50,sensitivity
0,1,683665,2.44,resistant
1,1,684055,3.34,resistant
2,1,684057,3.57,resistant
3,1,684059,3.19,resistant
4,1,684062,2.46,resistant


Unnamed: 0,drug_id,cell_line_id,log_IC50,sensitivity
35157,165,1298531,6.09,intermediate
84670,255,1333014,4.87,resistant
64160,207,1290907,1.7,intermediate
102849,279,930301,4.16,intermediate
188636,1129,946355,3.36,intermediate


# Part 1. Regression & Classification
Quantize `log_IC50` into 3 bins: `sensitive, intermediate, resistant`. Call this column `sensitivity`.

Reduce the dimensionality of the cell-line features with `PCA`, keeping as many components as are needed to explain 80% of the variance (`n_components=0.8`).

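I used `Elastic Net` for the regression task (predicting `log_IC50`) and `K-Nearest Neighbors` for the classification task (predicting `sensitivity`).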

In [7]:
# Placeholder columns for the cross-validated predictions written by the
# per-drug fitting loop.
drug_df['predicted_log_IC50'] = np.nan
# The predicted labels are strings, so create this column with object dtype.
# A float64 NaN column would have to be silently upcast on the first string
# assignment, which pandas has deprecated.
drug_df['predicted_sensitivity'] = pd.Series(
    np.nan, index=drug_df.index, dtype=object)

## Fit Models

In [None]:
# Feature lookup, built once because it does not depend on the drug:
# one row per cell line, indexed by `cell_line_id`.
# If a cell line is duplicated, keep the first one.
features_df = (cell_line_df
               .drop_duplicates(subset=['cell_line_id'])
               .set_index('cell_line_id'))

for drug_id in drug_df['drug_id'].unique():
    # Rows of this drug whose cell line has features available.
    select_drug_df = ((drug_df['drug_id'] == drug_id) &
                      (drug_df['cell_line_id'].isin(features_df.index)))
    select_result = result_df['drug_id'] == drug_id

    # Fetch the features by cell_line_id *in the order of the drug rows*, so
    # that row i of X and element i of y always describe the same cell line.
    # Filtering the two tables independently only lines up when both happen
    # to be sorted identically, and breaks on repeated (drug, cell line) rows.
    cell_line_ids = drug_df.loc[select_drug_df, 'cell_line_id'].to_numpy()
    X = features_df.loc[cell_line_ids]

    # PCA, keep n_components that explain 80% of the variance
    # NOTE(review): PCA is fitted on every sample of the drug before the
    # cross-validation split, so held-out folds influence the projection.
    # Consider moving PCA into a Pipeline with the estimator.
    pca = PCA(n_components=0.8)
    X = pca.fit_transform(X)

    # regression: out-of-fold predictions of log_IC50
    y_reg = drug_df.loc[select_drug_df, 'log_IC50']

    elastic_net = ElasticNet()
    y_reg_pred = cross_val_predict(elastic_net, X, y_reg, cv=5)
    drug_df.loc[select_drug_df, 'predicted_log_IC50'] = y_reg_pred

    mse = mean_squared_error(y_reg, y_reg_pred)
    result_df.loc[select_result, 'mse'] = mse

    # classification: out-of-fold predictions of the sensitivity label
    sensitivity = drug_df.loc[select_drug_df, 'sensitivity']
    label_encoder = preprocessing.LabelEncoder()
    y_cls = label_encoder.fit_transform(sensitivity)

    knn = KNeighborsClassifier()
    y_cls_pred = cross_val_predict(knn, X, y_cls, cv=5)
    # Class probabilities are needed separately for the multi-class AUC.
    y_cls_proba = cross_val_predict(knn, X, y_cls, cv=5,
                                    method='predict_proba')

    # Store the predictions as the original string labels.
    predicted_sensitivity = label_encoder.inverse_transform(y_cls_pred)
    drug_df.loc[select_drug_df, 'predicted_sensitivity'] = predicted_sensitivity

    accuracy = accuracy_score(y_cls, y_cls_pred)
    result_df.loc[select_result, 'accuracy'] = accuracy

    auc = roc_auc_score(y_cls, y_cls_proba, multi_class='ovo')
    result_df.loc[select_result, 'auc'] = auc

    print('Drug id: {}, MSE: {}, accuracy: {}, AUC: {}'.format(
        drug_id, mse, accuracy, auc))

Drug id: 1, MSE: 1.1486096282775498, accuracy: 0.601078167115903, AUC: 0.5733642678674391
Drug id: 3, MSE: 4.485933707441746, accuracy: 0.5234159779614325, AUC: 0.6354409647387855
Drug id: 5, MSE: 2.273761038102191, accuracy: 0.6090225563909775, AUC: 0.7250340787767865
Drug id: 6, MSE: 0.8051275969950313, accuracy: 0.7076167076167076, AUC: 0.5345550739418664
Drug id: 9, MSE: 2.057611450853563, accuracy: 0.5760598503740648, AUC: 0.5053938554990415
Drug id: 11, MSE: 3.5238672371596995, accuracy: 0.5, AUC: 0.576338684930494
Drug id: 17, MSE: 0.7874540705920942, accuracy: 0.45591939546599497, AUC: 0.5987245844842661
Drug id: 29, MSE: 2.1051437528663177, accuracy: 0.5781637717121588, AUC: 0.7322242806758211
Drug id: 30, MSE: 1.6404685703367836, accuracy: 0.551980198019802, AUC: 0.6860652906427555
Drug id: 32, MSE: 3.8167783890852798, accuracy: 0.4746192893401015, AUC: 0.6214900008537158
Drug id: 34, MSE: 0.8933498979194839, accuracy: 0.7321867321867321, AUC: 0.6357868881379852
Drug id: 35, 



Drug id: 186, MSE: 1.090374165073956, accuracy: 0.6686046511627907, AUC: 0.637772287159344
Drug id: 190, MSE: 4.731712081209839, accuracy: 0.47337962962962965, AUC: 0.5752567673401968
Drug id: 192, MSE: 0.5171289545949117, accuracy: 0.7508610792192881, AUC: 0.5723677883001653




Drug id: 193, MSE: 0.4569985769511103, accuracy: 0.8905742145178764, AUC: 0.5660387378944081
Drug id: 194, MSE: 2.2514350376460732, accuracy: 0.6624129930394431, AUC: 0.6388298052723498
Drug id: 196, MSE: 2.9819189309065677, accuracy: 0.6637458926615553, AUC: 0.6731949515353582
Drug id: 197, MSE: 0.7160250584696403, accuracy: 0.6048387096774194, AUC: 0.5636377320769476
Drug id: 199, MSE: 1.534167791555643, accuracy: 0.6570771001150748, AUC: 0.589149238035382
Drug id: 200, MSE: 1.2923501725913882, accuracy: 0.6506300114547537, AUC: 0.5593288291079553
Drug id: 201, MSE: 3.1347906356052833, accuracy: 0.5458715596330275, AUC: 0.5771217115280999
Drug id: 202, MSE: 0.4783751741320024, accuracy: 0.75, AUC: 0.5848029671648373
Drug id: 203, MSE: 0.9839087140814653, accuracy: 0.70995670995671, AUC: 0.7053327118192604
Drug id: 204, MSE: 2.589950229746651, accuracy: 0.7333333333333333, AUC: 0.5780939902194845


In [None]:
# Persist the per-row predictions and the per-drug metrics; the row index is
# left out of both files.
drug_df.to_csv(path_or_buf='data/drug_df_pca.csv', index=False)
result_df.to_csv(path_or_buf='data/result_df_pca.csv', index=False)

In [None]:
# Spot-check five random rows, now including the prediction columns.
drug_df.sample(5)

In [None]:
# The ten drugs with the lowest MSE, followed by the ten with the highest.
display(
    result_df[['drug_id', 'mse']].nsmallest(10, 'mse'),
    result_df[['drug_id', 'mse']].nlargest(10, 'mse'),
)