In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [23]:
from sklearn.linear_model import ElasticNet
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, roc_auc_score

In [3]:
# Load the cell-line table; later cells key it by its 'cell_line_id' column.
# The path is relative, so it resolves against the kernel's working directory.
cell_line_df = pd.read_csv('data/cell_line.csv')

In [5]:
# Load the drug-response table; later cells read its 'drug_id', 'cell_line_id'
# and 'log_IC50' columns. The path is relative to the kernel's working directory.
drug_df = pd.read_csv('data/IC50.csv')

# Part 1. Predict Drug Response

## Regression

In [39]:
# Build the design matrix X and the target y for drug 1 such that row i of X
# holds the features of the cell line whose response is y.iloc[i].

# include only cell lines that appear in both df
idx1 = drug_df.loc[drug_df['drug_id'] == 1, 'cell_line_id']
idx2 = cell_line_df.loc[cell_line_df['cell_line_id'].isin(idx1),
                        'cell_line_id']
final_idx = set(idx1).intersection(set(idx2))

# if duplicated cell lines, keep the first one. drop_duplicates returns a new
# frame, so its result has to be used (calling it bare discards the result).
features = (cell_line_df
            .loc[cell_line_df['cell_line_id'].isin(final_idx)]
            .drop_duplicates(subset=['cell_line_id'])
            .set_index('cell_line_id'))

drug_rows = drug_df.loc[(drug_df['drug_id'] == 1) &
                        (drug_df['cell_line_id'].isin(final_idx))]
y = drug_rows['log_IC50']

# Look the features up by cell_line_id in y's row order. Filtering the two
# frames independently would leave X in cell_line_df order and y in drug_df
# order, which are not guaranteed to agree.
X = features.loc[drug_rows['cell_line_id']]
# Share y's index so X and y can also be matched by label.
X.index = y.index
assert len(X) == len(y)

In [35]:
# Create an all-NaN placeholder column for the model predictions, then preview
# the first five rows belonging to drug 1.
drug_df['predicted_log_IC50'] = np.nan
drug_df.loc[drug_df['drug_id'] == 1].head(5)

Unnamed: 0,drug_id,cell_line_id,log_IC50,predicted_log_IC50,mean_squared_error
0,1,683665,2.44,,
1,1,684055,3.34,,
2,1,684057,3.57,,
3,1,684059,3.19,,
4,1,684062,2.46,,


In [None]:
# NOTE: this cell needs `y_pred`, so the cross_val_predict cell must have been
# run first.
# The mask selects the drug-1 rows whose cell line is in final_idx, i.e. the
# same drug_df rows (in the same order) that y was taken from, so y_pred lines
# up with them one-to-one.
# The whole expression is wrapped in parentheses so it can span two lines; a
# trailing `&` outside any bracket is a SyntaxError.
criteria = ((drug_df['drug_id'] == 1) &
            (drug_df['cell_line_id'].isin(final_idx)))
drug_df.loc[criteria, 'predicted_log_IC50'] = y_pred

In [19]:
# Out-of-fold predictions for every row of X from a default ElasticNet,
# using 5 cross-validation folds.
elastic_net = ElasticNet()
y_pred = cross_val_predict(estimator=elastic_net, X=X, y=y, cv=5)

In [22]:
# Mean squared error between the observed log_IC50 values and the
# cross-validated predictions.
mean_squared_error(y_true=y, y_pred=y_pred)

1.2434051506334174

In [25]:
def get_mean_squared_error_for_drug_idx(drug_id):
    """Return the 5-fold cross-validated MSE of an ElasticNet for one drug.

    Reads the notebook-level frames ``drug_df`` (columns 'drug_id',
    'cell_line_id', 'log_IC50') and ``cell_line_df`` (column 'cell_line_id'
    plus the feature columns). Neither frame is modified.

    Parameters
    ----------
    drug_id
        Value matched against ``drug_df['drug_id']``.

    Returns
    -------
    float
        Mean squared error between the observed 'log_IC50' values and the
        out-of-fold ElasticNet predictions.
    """
    measured_ids = drug_df.loc[drug_df['drug_id'] == drug_id, 'cell_line_id']

    # if duplicated cell lines, keep the first one
    features = (cell_line_df
                .loc[cell_line_df['cell_line_id'].isin(measured_ids)]
                .drop_duplicates(subset=['cell_line_id'])
                .set_index('cell_line_id'))

    # include only cell lines that appear in both df
    drug_rows = drug_df.loc[(drug_df['drug_id'] == drug_id) &
                            (drug_df['cell_line_id'].isin(features.index))]
    y = drug_rows['log_IC50']

    # Look the features up by cell_line_id in y's row order. Filtering the two
    # frames independently leaves X in cell_line_df order and y in drug_df
    # order, which pairs features with the wrong response whenever the two
    # orders differ, and gives mismatched lengths when a cell line is measured
    # more than once for the drug.
    X = features.loc[drug_rows['cell_line_id']]
    X.index = y.index

    elastic_net = ElasticNet()
    y_pred = cross_val_predict(elastic_net, X, y, cv=5)
    return mean_squared_error(y, y_pred)

In [26]:
# Cross-validated MSE for drug 1 via the helper; the recorded value equals the
# one obtained from the step-by-step cells above.
get_mean_squared_error_for_drug_idx(1)

1.2434051506334174

In [28]:
# Same evaluation for the drug whose drug_id is 1026.
get_mean_squared_error_for_drug_idx(1026)

2.1610472600051938

## Classification

In [41]:
# Discretise the continuous log_IC50 target into three bins, labelled from the
# lowest bin to the highest. The result is only displayed, not stored.
pd.cut(
    y,
    bins=3,
    labels=['sensitive', 'intermediate', 'resistant'],
)

0         resistant
1         resistant
2         resistant
3         resistant
4         resistant
5      intermediate
6         resistant
7         resistant
8         resistant
9         resistant
10        resistant
11        resistant
12        sensitive
13        resistant
14     intermediate
15        resistant
16        resistant
18        resistant
19        resistant
20        resistant
21        resistant
22        resistant
23     intermediate
24        resistant
26        resistant
27        resistant
28        resistant
29        resistant
30     intermediate
31        resistant
           ...     
353       resistant
354    intermediate
355    intermediate
356    intermediate
357       resistant
358       resistant
359       resistant
360    intermediate
361       resistant
364       resistant
367       resistant
368    intermediate
369    intermediate
370    intermediate
371    intermediate
373       resistant
374       resistant
375       resistant
376    intermediate
