In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [None]:
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv("/content/drive/MyDrive/ML project/drug.csv")
df.shape

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


(882, 798)

In [None]:
X1=df.iloc[:,1:-1].values
Y1=df.iloc[:,-1].values
#Splitting the data into train and test
X_train,X_test,Y_train,Y_test = train_test_split(X1,Y1,test_size = 0.15, random_state=42)
#Scaling using Standard scaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test) 
#No.of training and testing records
print(len(X_train),len(X_test))


749 133


In [None]:
print(df['Inhibitor'].value_counts())
print(f'X_train shape : {X_train.shape}\nX_test shape :{X_test.shape}')

1    474
0    408
Name: Inhibitor, dtype: int64
X_train shape : (749, 796)
X_test shape :(133, 796)


## SMOTE - Dataset balancing

In [None]:
#Feature engineering using SMOTE
import imblearn
from imblearn.over_sampling import SMOTE
import collections
from collections import Counter
#changing the dataset to balance out the inhibitors and non-inhibitors
oversample = SMOTE()
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X1, Y1)
#Train and test split
X_train,X_test,Y_train,Y_test = train_test_split(X_res,y_res,test_size = 0.15, random_state=42)
#Standardization and Normalization
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#show the shape of train test and count of the y_trains
print(len(X_train),len(X_test))
print(df['Inhibitor'].value_counts())
print(f'X_train shape : {X_train.shape}\nX_test shape :{X_test.shape}')
print('Resampled dataset shape %s' % Counter(y_res))

805 143
1    474
0    408
Name: Inhibitor, dtype: int64
X_train shape : (805, 796)
X_test shape :(143, 796)
Resampled dataset shape Counter({0: 474, 1: 474})


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

#PCA

Remove Constant, Quasi Constant and Duplicate Features

In [None]:
#PCA
#remove constant and quasi constant features
constant_filter = VarianceThreshold(threshold=0.01)
constant_filter.fit(X_train)
X_train_filter = constant_filter.transform(X_train)
X_test_filter = constant_filter.transform(X_test)

In [None]:
X_train_filter.shape, X_test_filter.shape

((805, 795), (143, 795))

In [None]:
#remove duplicate features
X_train_T = X_train_filter.T
X_test_T = X_test_filter.T
X_train_T = pd.DataFrame(X_train_T)
X_test_T = pd.DataFrame(X_test_T)

In [None]:
X_train_T.duplicated().sum()
duplicated_features = X_train_T.duplicated()
features_to_keep = [not index for index in duplicated_features]
X_train_unique = X_train_T[features_to_keep].T
X_test_unique = X_test_T[features_to_keep].T

In [None]:
scaler = StandardScaler().fit(X_train_unique)
X_train_unique = scaler.transform(X_train_unique)
X_test_unique = scaler.transform(X_test_unique)

In [None]:
X_train_unique = pd.DataFrame(X_train_unique)
X_test_unique = pd.DataFrame(X_test_unique)

In [None]:
X_train_unique.shape, X_test_unique.shape

((805, 787), (143, 787))

Removal of correlated Feature

In [None]:
corrmat = X_train_unique.corr()
#find correlated features
def get_correlation(data, threshold):
    corr_col = set()
    corrmat = data.corr()
    for i in range(len(corrmat.columns)):
        for j in range(i):
            if abs(corrmat.iloc[i, j]) > threshold:
                colname = corrmat.columns[i]
                corr_col.add(colname)
    return corr_col

corr_features = get_correlation(X_train_unique, 0.70)
print('correlated features: ', len(set(corr_features)) )

correlated features:  632


In [None]:
X_train_uncorr = X_train_unique.drop(labels=corr_features, axis = 1)
X_test_uncorr = X_test_unique.drop(labels = corr_features, axis = 1)

In [None]:
X_train_uncorr.shape, X_test_uncorr.shape

((805, 155), (143, 155))

Feature Dimension Reduction by LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [None]:
lda = LDA(n_components=1)
X_train_lda = lda.fit_transform(X_train_uncorr, Y_train)
X_test_lda = lda.transform(X_test_uncorr)

In [None]:
X_train_lda.shape, X_test_lda.shape

((805, 1), (143, 1))

In [None]:
def run_randomForest(X_train, X_test, y_train, y_test):
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(y_pred)
    print('Accuracy on test set: ')
    print(accuracy_score(y_test, y_pred))

In [None]:
%%time
run_randomForest(X_train_lda, X_test_lda, Y_train, Y_test )

[0 0 1 0 1 0 0 1 1 1 0 1 0 1 1 1 0 0 1 0 0 1 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1
 0 1 0 1 1 1 1 1 1 0 1 0 1 0 1 0 0 1 1 1 1 1 0 1 1 0 1 0 1 1 0 1 0 1 0 1 1
 1 0 0 1 0 1 0 1 1 1 1 1 0 0 0 1 1 0 1 1 1 0 1 0 0 1 0 0 0 1 1 1 0 1 0 1 0
 1 0 0 0 1 1 1 1 0 1 1 0 0 1 0 0 1 1 1 0 0 0 1 1 0 0 0 1 1 0 1 1]
Accuracy on test set: 
0.8041958041958042
CPU times: user 238 ms, sys: 38.3 ms, total: 277 ms
Wall time: 217 ms


In [None]:
%%time
run_randomForest(X_train_uncorr, X_test_uncorr, Y_train, Y_test)

[0 0 1 0 1 0 0 1 1 1 0 1 0 1 0 1 0 0 1 0 0 1 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1
 0 1 0 1 1 1 1 0 1 0 1 0 1 0 1 0 0 1 1 1 1 1 0 1 1 0 1 0 1 1 0 1 0 1 0 0 1
 1 0 0 1 1 1 0 1 1 0 1 1 0 1 0 1 0 0 1 1 1 0 1 0 0 1 0 0 0 1 1 1 1 1 0 1 0
 1 0 0 1 1 1 0 1 0 1 1 0 0 0 0 1 1 1 1 1 0 0 1 1 0 0 0 0 0 0 1 1]
Accuracy on test set: 
0.8531468531468531
CPU times: user 604 ms, sys: 22.9 ms, total: 627 ms
Wall time: 393 ms


In [None]:
%%time
run_randomForest(X_train, X_test, Y_train, Y_test)

[0 0 1 0 1 0 0 1 1 1 0 1 1 1 0 1 0 0 1 0 0 1 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1
 1 1 0 0 1 1 1 0 1 0 1 0 1 1 1 0 0 1 1 1 1 1 0 1 1 0 1 0 1 1 0 1 0 1 1 0 1
 1 0 0 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 0 1 0 0 1 0 0 0 1 0 1 1 1 0 1 0
 1 0 0 1 1 1 0 1 0 1 1 0 0 0 0 1 1 1 1 0 0 0 1 1 0 0 0 0 0 0 1 1]
Accuracy on test set: 
0.8251748251748252
CPU times: user 1.36 s, sys: 15.4 ms, total: 1.38 s
Wall time: 750 ms


Feature Reduction by PCA?

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components=2, random_state=42)
pca.fit(X_train_uncorr)

PCA(n_components=2, random_state=42)

In [None]:
X_train_pca = pca.transform(X_train_uncorr)
X_test_pca = pca.transform(X_test_uncorr)
X_train_pca.shape, X_test_pca.shape

((805, 2), (143, 2))

In [None]:
%%time
run_randomForest(X_train_pca, X_test_pca, Y_train, Y_test)

[1 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 1 0 1 0 0 1 0 1 0 1 1 0 1 1 1 0 1 1 1 0 1
 0 1 0 0 1 1 1 0 1 0 1 0 0 0 1 0 0 1 1 1 0 0 0 0 1 0 1 0 1 1 0 1 0 1 0 0 1
 0 0 0 1 1 1 0 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 0 0 1 0 0 0 1 1 1 1 1 0 1 1
 0 1 0 0 1 1 0 1 0 1 1 0 0 0 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 1 1 1]
Accuracy on test set: 
0.7552447552447552
CPU times: user 284 ms, sys: 28.1 ms, total: 313 ms
Wall time: 231 ms


In [None]:
%%time
run_randomForest(X_train, X_test, Y_train, Y_test)

[0 0 1 0 1 0 0 1 1 1 0 1 1 1 0 1 0 0 1 0 0 1 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1
 1 1 0 0 1 1 1 0 1 0 1 0 1 1 1 0 0 1 1 1 1 1 0 1 1 0 1 0 1 1 0 1 0 1 1 0 1
 1 0 0 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 0 1 0 0 1 0 0 0 1 0 1 1 1 0 1 0
 1 0 0 1 1 1 0 1 0 1 1 0 0 0 0 1 1 1 1 0 0 0 1 1 0 0 0 0 0 0 1 1]
Accuracy on test set: 
0.8251748251748252
CPU times: user 1.4 s, sys: 16 ms, total: 1.41 s
Wall time: 1.09 s


In [None]:
X_train_uncorr.shape

(805, 155)

In [None]:
for component in range(1,27):
    pca = PCA(n_components=component, random_state=42)
    pca.fit(X_train_uncorr)
    X_train_pca = pca.transform(X_train_uncorr)
    X_test_pca = pca.transform(X_test_uncorr)
    print('Selected Components: ', component)
    run_randomForest(X_train_pca, X_test_pca, Y_train, Y_test)
    print()

Selected Components:  1
[0 1 1 0 0 1 1 1 1 0 0 0 1 1 1 0 1 1 1 0 0 1 0 1 1 1 1 0 1 0 1 0 1 1 1 1 0
 0 1 0 1 1 0 0 0 1 1 1 1 1 0 1 0 0 1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 1 1
 1 0 0 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 0 0 1 0 1 0 1 1 0 1 1
 1 0 0 0 1 1 0 1 0 1 1 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 1 0 1]
Accuracy on test set: 
0.7132867132867133

Selected Components:  2
[1 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 1 0 1 0 0 1 0 1 0 1 1 0 1 1 1 0 1 1 1 0 1
 0 1 0 0 1 1 1 0 1 0 1 0 0 0 1 0 0 1 1 1 0 0 0 0 1 0 1 0 1 1 0 1 0 1 0 0 1
 0 0 0 1 1 1 0 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 0 0 1 0 0 0 1 1 1 1 1 0 1 1
 0 1 0 0 1 1 0 1 0 1 1 0 0 0 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 1 1 1]
Accuracy on test set: 
0.7552447552447552

Selected Components:  3
[0 0 0 0 1 1 0 1 1 1 0 1 1 1 0 1 0 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 1 1 1 1
 0 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 0 1 0 1 1 0 1 0 1 0 0 1
 1 0 0 1 1 0 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 0 0 1 1 1 1 1 1 0 1 1
 1 1 0 0 1 1 0 1 0 1 1 0 0 0 0 1 0 

In [None]:
from sklearn.ensemble import RandomForestClassifier
estimator =RandomForestClassifier(n_estimators=250)
from sklearn.feature_selection import RFE
sel_ = RFE(estimator,n_features_to_select=80)
sel_.fit(X_train_pca, Y_train)
print(sel_.get_support())
print(sel_.estimator_.feature_importances_)


[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True]
[0.06042107 0.04108953 0.03146382 0.06961194 0.05426307 0.07165647
 0.03902463 0.03323665 0.04556266 0.04234315 0.03136363 0.03200784
 0.03257371 0.02935081 0.03040285 0.03365696 0.03114237 0.03604264
 0.03978952 0.02733866 0.02390494 0.03099009 0.03174453 0.03596275
 0.0296205  0.03543522]


In [None]:
test=pd.read_csv('/content/drive/MyDrive/ML project/Output_descriptor_47000_removed.csv')
names=test['Name']
print(test.shape)
test=test.iloc[:,1:]
for col in test:
  mean=test[col].mean()
  test[col].fillna(value=mean, inplace=True)

(57514, 797)


In [None]:
#feature dimension reduction by LDA
%%time
LDA=run_randomForest(X_train_lda, X_test_lda, Y_train,Y_test)
#feature dimension reduction - uncorrelate
%%time
uncorrLDA=run_randomForest(X_train_uncorr, X_test_uncorr, Y_train)

[0 0 1 0 1 0 0 1 1 1 0 1 0 1 1 1 0 0 1 0 0 1 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1
 0 1 0 1 1 1 1 1 1 0 1 0 1 0 1 0 0 1 1 1 1 1 0 1 1 0 1 0 1 1 0 1 0 1 0 1 1
 1 0 0 1 0 1 0 1 1 1 1 1 0 0 0 1 1 0 1 1 1 0 1 0 0 1 0 0 0 1 1 1 0 1 0 1 0
 1 0 0 0 1 1 1 1 0 1 1 0 0 1 0 0 1 1 1 0 0 0 1 1 0 0 0 1 1 0 1 1]
Accuracy on test set: 
0.8041958041958042


UsageError: Line magic function `%%time` not found.


In [None]:
def run_randomForest(X_train, X_test, Y_train, Y_test):
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(Y_test)
    result=clf.predict(test)
    print(y_pred)
    print('Accuracy on test set: ')
    print(accuracy_score(y_test, y_pred))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
accuracies = []
sensitivities=[]
specificities=[]
precisions=[]
f1s=[]

def classification_report(model,X_test,Y_test):
  pred=model.predict(X_test)
  #Confusion matrix
  tn, fp, fn, tp = confusion_matrix(Y_test,pred,labels=(0,1)).ravel()
  sensitivity=tp/(tp+fn)#recall
  specificity=tn/(tn+fp)
  precision=tp/(fp+tp)
  f1=(2*precision*sensitivity)/(precision+sensitivity)
  return sensitivity,specificity,precision,f1
  


accuracy=sel_.score(X_test, Y_test)
accuracies.append(accuracy)
sensitivity,specificity,precision,f1=classification_report(sel_,X_test,Y_test)
sensitivities.append(sensitivity)
specificities.append(specificity)
precisions.append(precision)
f1s.append(f1)   

print("accuracies for 5 fold",*accuracies)
print("Standard Deviation of predicted accuracies:",np.std(accuracies))
print("accuracy of rf",np.mean(accuracies))
print("sensitivity of rf",np.mean(sensitivities))
print("specificity of rf",np.mean(specificities))
print("precision of rf",np.mean(precisions))
print("f1 of rf",np.mean(f1s))

accuracies for 5 fold 0.8251748251748252
Standard Deviation of predicted accuracies: 0.0
accuracy of rf 0.8251748251748252
sensitivity of rf 0.9014084507042254
specificity of rf 0.75
precision of rf 0.7804878048780488
f1 of rf 0.8366013071895425


In [None]:
test_result

array([0, 0, 1, ..., 0, 1, 0])

In [None]:
df=pd.DataFrame({'Name':names, 'Inhibitor_Class':test_result})
df.to_csv("descriptors.csv", index=True)

In [None]:
df

Unnamed: 0,Name,Inhibitor_Class
0,ï»¿,0
1,ligprep_3.maegz:2,0
2,ligprep_3.maegz:3,1
3,ligprep_3.maegz:4,1
4,ligprep_3.maegz:5,0
...,...,...
57509,ligprep_3.maegz:47447,1
57510,ligprep_3.maegz:47448,0
57511,ligprep_3.maegz:47449,0
57512,ligprep_3.maegz:47450,1


## Prediction with PCA and randomforest

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

df = pd.read_csv("/content/drive/MyDrive/ML project/drug.csv")
df.shape

In [None]:
X1=df.iloc[:,1:-1].values
Y1=df.iloc[:,-1].values
#Feature engineering using SMOTE
import imblearn
from imblearn.over_sampling import SMOTE
import collections
from collections import Counter
#changing the dataset to balance out the inhibitors and non-inhibitors
oversample = SMOTE()
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X1, Y1)
#Train and test split
#X_train,X_test,Y_train,Y_test = train_test_split(X_res,y_res,test_size = 0.15, random_state=42)
#Standardization and Normalization
X_train=X_res
Y_train=y_res
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
#X_test = sc.transform(X_test)
#show the shape of train test and count of the y_trains
print(len(X_train))
print(df['Inhibitor'].value_counts())
print(f'X_train shape : {X_train.shape}')
print('Resampled dataset shape %s' % Counter(y_res))

In [None]:
X_test=pd.read_csv('/content/drive/MyDrive/ML project/Output_descriptor_47000_removed.csv')
names=X_test['Name']
print(X_test.shape)
X_test=X_test.iloc[:,1:]
for col in X_test:
  mean=X_test[col].mean()
  X_test[col].fillna(value=mean, inplace=True)
print(X_test.shape)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
#PCA
#remove constant and quasi constant features
constant_filter = VarianceThreshold(threshold=0.01)
constant_filter.fit(X_train)
X_train_filter = constant_filter.transform(X_train)
X_test_filter = constant_filter.transform(X_test)
X_train_filter.shape, X_test_filter.shape

In [None]:
#remove duplicate features
X_train_T = X_train_filter.T
X_test_T = X_test_filter.T
X_train_T = pd.DataFrame(X_train_T)
X_test_T = pd.DataFrame(X_test_T)


X_train_T.duplicated().sum()
duplicated_features = X_train_T.duplicated()
features_to_keep = [not index for index in duplicated_features]
X_train_unique = X_train_T[features_to_keep].T
X_test_unique = X_test_T[features_to_keep].T

scaler = StandardScaler().fit(X_train_unique)
X_train_unique = scaler.transform(X_train_unique)
X_test_unique = scaler.transform(X_test_unique)


X_train_unique = pd.DataFrame(X_train_unique)
X_test_unique = pd.DataFrame(X_test_unique)


X_train_unique.shape, X_test_unique.shape

In [None]:
corrmat = X_train_unique.corr()
#find correlated features
def get_correlation(data, threshold):
    corr_col = set()
    corrmat = data.corr()
    for i in range(len(corrmat.columns)):
        for j in range(i):
            if abs(corrmat.iloc[i, j]) > threshold:
                colname = corrmat.columns[i]
                corr_col.add(colname)
    return corr_col

corr_features = get_correlation(X_train_unique, 0.70)
print('correlated features: ', len(set(corr_features)) )
X_train_uncorr = X_train_unique.drop(labels=corr_features, axis = 1)
X_test_uncorr = X_test_unique.drop(labels = corr_features, axis = 1)
X_train_uncorr.shape, X_test_uncorr.shape

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=1)
X_train_lda = lda.fit_transform(X_train_uncorr, Y_train)
X_test_lda = lda.transform(X_test_uncorr)
X_train_lda.shape, X_test_lda.shape

def run_randomForest(X_train, X_test, y_train):
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_pred
    #print('Accuracy on test set: ')
    #print(accuracy_score(y_test, y_pred))

In [None]:
%%time
result_lda=run_randomForest(X_train_uncorr, X_test_uncorr, Y_train)

In [None]:
from sklearn.decomposition import PCA
for component in range(1,27):
    pca = PCA(n_components=component, random_state=42)
    pca.fit(X_train_uncorr)
    X_train_pca = pca.transform(X_train_uncorr)
    X_test_pca = pca.transform(X_test_uncorr)
    #print('Selected Components: ', component)
    if component == 26:
      result_pca=run_randomForest(X_train_pca, X_test_pca, Y_train)
      #print(result_pca)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
def classification_report(model,X_test,Y_test):
  pred=model.predict(X_test)
  #Confusion matrix
  tn, fp, fn, tp = confusion_matrix(Y_test,pred,labels=(0,1)).ravel()
  sensitivity=tp/(tp+fn)#recall
  specificity=tn/(tn+fp)
  precision=tp/(fp+tp)
  f1=(2*precision*sensitivity)/(precision+sensitivity)
  return sensitivity,specificity,precision,f1
  
accuracies = []
sensitivities=[]
specificities=[]
precisions=[]
f1s=[]

accuracy=sel_.score(X_test, Y_test)
accuracies.append(accuracy)
sensitivity,specificity,precision,f1=classification_report(sel_,X_test,Y_test)
sensitivities.append(sensitivity)
specificities.append(specificity)
precisions.append(precision)
f1s.append(f1)   

print("accuracies for 5 fold",*accuracies)
print("Standard Deviation of predicted accuracies:",np.std(accuracies))
print("accuracy of rf",np.mean(accuracies))
print("sensitivity of rf",np.mean(sensitivities))
print("specificity of rf",np.mean(specificities))
print("precision of rf",np.mean(precisions))
print("f1 of rf",np.mean(f1s))

In [None]:
df=pd.DataFrame({'Name':names, 'LDA':result_lda, 'PCA':result_pca})
df.to_csv("result.csv", index=True)