Drug Response Prediction (IC50)

In [None]:
#Loading Libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
import os
from sklearn.utils import shuffle

In [None]:
# Load the input table and split it into the feature matrix and the target.
dataset = pd.read_csv("Input.csv")

# "pIC50" is the regression target; every other column is used as a feature.
y = dataset["pIC50"]
X = dataset.drop(columns=["pIC50"])

# Report the dimensions of both pieces as a quick sanity check.
print(X.shape, y.shape, sep="\n")

In [None]:
#Viewing Features
%matplotlib inline
import matplotlib.pyplot as plt
X.hist(bins=50, figsize=(45,15))
plt.show()

In [None]:
# Splitting data: hold out 20% of the rows as the test set. The fixed
# random_state keeps the partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Feature selection using mutual information
from functools import partial

from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import mutual_info_regression

# mutual_info_regression accepts a random_state; pin it so that the scores,
# and therefore the set of selected columns, are the same on every run.
seeded_mutual_info = partial(mutual_info_regression, random_state=42)

# Keep the features whose mutual-information score with the target is in the
# top 40 percent. The selector is fitted on the training split only.
selected_top_columns = SelectPercentile(seeded_mutual_info, percentile=40)
selected_top_columns.fit(X_train, y_train)

# Per-feature scores taken from the fitted selector, so the ranking shown here
# and the selection below come from one and the same computation.
mutual_info = pd.Series(selected_top_columns.scores_, index=X_train.columns)
mutual_info_ranked = mutual_info.sort_values(ascending=False)

# Select columns with the Index of labels directly. Wrapping the Index in an
# extra list (X_train[[selected]]) is not a valid column selection.
selected = X_train.columns[selected_top_columns.get_support()]
X_train_new = X_train[selected]
X_test_new = X_test[selected]

In [None]:
# Scaling data
from sklearn.preprocessing import StandardScaler

scale = StandardScaler()

# Learn the per-feature mean and standard deviation from the training split.
X_train_scaled = scale.fit_transform(X_train_new)

# Apply the training statistics to the test split. Calling fit_transform here
# would refit the scaler on the test data, so train and test would be
# standardized with different parameters and test information would leak in.
X_test_scaled = scale.transform(X_test_new)

print(X_train_scaled.shape)
print(X_test_scaled.shape)

In [None]:
#Training Different Regressors using Cross Validation
from sklearn.ensemble import RandomForestRegressor

# Seed the forest: without a fixed random_state the two cross-validation runs
# below fit different forests, so the RMSE and R^2 values would describe
# different models and would change on every execution.
forest_reg = RandomForestRegressor(random_state=42)

# 5-fold cross-validation on the selected, standardized training features.
# The "neg_mean_squared_error" scorer returns the negated MSE per fold.
scores_RMSE = cross_val_score(forest_reg, X_train_scaled, y_train, scoring="neg_mean_squared_error", cv=5)
scores_r2 = cross_val_score(forest_reg, X_train_scaled, y_train, scoring="r2", cv=5)

# Flip the sign back to get the MSE, then take the root for the RMSE.
mse = -scores_RMSE
tree_rmse = np.sqrt(mse)

print('rmse: ', tree_rmse)
print('r2: ', scores_r2)


In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Extra-trees ensemble of 150 trees with a fixed seed.
extra_trees_model = ExtraTreesRegressor(n_estimators=150, random_state=42)

# 5-fold cross-validation on the scaled training features. The first scorer
# yields the negated MSE per fold, the second the R^2 per fold.
scores_RMSE = cross_val_score(
    extra_trees_model, X_train_scaled, y_train, cv=5, scoring="neg_mean_squared_error"
)
scores_r2 = cross_val_score(
    extra_trees_model, X_train_scaled, y_train, cv=5, scoring="r2"
)

# Undo the negation to recover the MSE, then take the square root.
mse = -scores_RMSE
tree_rmse = np.sqrt(mse)

print('rmse: ', tree_rmse)
print('r2: ', scores_r2)

In [None]:
from sklearn.linear_model import BayesianRidge

bayesian_ridge_model = BayesianRidge()

# Score on the selected, standardized training features (X_train_scaled), the
# same matrix used for the ensemble models and for the final test evaluation.
# Scoring on the raw X_train would bypass feature selection and scaling and
# make the cross-validation results incomparable between models.
# NOTE(review): this cell uses cv=3 while the ensemble cells use cv=5 - confirm
# whether the fold count should be aligned as well.
scores_RMSE = cross_val_score(bayesian_ridge_model, X_train_scaled, y_train, scoring="neg_mean_squared_error", cv=3)
scores_r2 = cross_val_score(bayesian_ridge_model, X_train_scaled, y_train, scoring="r2", cv=3)

# The scorer returns the negated MSE; negate and take the root for the RMSE.
tree_rmse = np.sqrt(-scores_RMSE)
print(tree_rmse)
print(scores_r2)

In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

# Score on the selected, standardized training features (X_train_scaled) so
# this baseline is evaluated on the same inputs as the other models and as the
# final test evaluation, instead of on the raw, unselected X_train.
# NOTE(review): this cell uses cv=3 while the ensemble cells use cv=5 - confirm
# whether the fold count should be aligned as well.
scores_RMSE = cross_val_score(lin_reg, X_train_scaled, y_train, scoring="neg_mean_squared_error", cv=3)
scores_r2 = cross_val_score(lin_reg, X_train_scaled, y_train, scoring="r2", cv=3)

# The scorer returns the negated MSE; negate and take the root for the RMSE.
tree_rmse = np.sqrt(-scores_RMSE)
print(tree_rmse)
print(scores_r2)

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with regularization strength alpha=1.0.
ridge_model = Ridge(alpha=1.0)

# 3-fold cross-validation on the scaled training features: negated MSE and
# R^2, one value per fold.
scores_RMSE = cross_val_score(
    ridge_model, X_train_scaled, y_train, cv=3, scoring="neg_mean_squared_error"
)
scores_r2 = cross_val_score(
    ridge_model, X_train_scaled, y_train, cv=3, scoring="r2"
)

# Negate the scores to get the MSE, then take the root for the RMSE.
tree_rmse = np.sqrt(-scores_RMSE)

print(tree_rmse)
print(scores_r2)

In [None]:
from sklearn.svm import SVR

# Support vector regressor with an RBF kernel.
svr_model = SVR(kernel='rbf')

# 5-fold cross-validation on the scaled training features: negated MSE and
# R^2, one value per fold.
scores_RMSE = cross_val_score(
    svr_model, X_train_scaled, y_train, cv=5, scoring="neg_mean_squared_error"
)
scores_r2 = cross_val_score(
    svr_model, X_train_scaled, y_train, cv=5, scoring="r2"
)

# Negate the scores to get the MSE, then take the root for the RMSE.
tree_rmse = np.sqrt(-scores_RMSE)

print(tree_rmse)
print(scores_r2)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree so both cross-validation runs below fit the same models and
# the printed scores are reproducible.
tree_reg = DecisionTreeRegressor(random_state=42)

# Score on the selected, standardized training features (X_train_scaled) so
# the tree is evaluated on the same inputs as the other models and as the
# final test evaluation, instead of on the raw, unselected X_train.
# NOTE(review): this cell uses cv=3 while the ensemble cells use cv=5 - confirm
# whether the fold count should be aligned as well.
scores_RMSE = cross_val_score(tree_reg, X_train_scaled, y_train, scoring="neg_mean_squared_error", cv=3)
scores_r2 = cross_val_score(tree_reg, X_train_scaled, y_train, scoring="r2", cv=3)

# The scorer returns the negated MSE; negate and take the root for the RMSE.
tree_rmse = np.sqrt(-scores_RMSE)
print(tree_rmse)
print(scores_r2)

In [None]:
# Test the best model: refit the extra-trees regressor on the full training
# split and evaluate it once on the held-out test split.
extra_trees_model.fit(X_train_scaled, y_train)
y_pred = extra_trees_model.predict(X_test_scaled)

# Root mean squared error on the test split.
final_mse = mean_squared_error(y_test, y_pred)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

# Coefficient of determination on the test split.
r2 = r2_score(y_test, y_pred)
print(r2)

# Show a small slice of true vs. predicted values side by side.
# Series.ravel() is deprecated in recent pandas; to_numpy() returns the same
# one-dimensional array.
print(y_test.to_numpy()[50:60])
print(y_pred[50:60])