# Context of the notebook 
Tasks performed: 
- Choose which algorithm is the best to perform feature selection 
- Training and predictions are performed on the 3 datasets after feature selection was done

In [1]:
# Basic python libraries
import os 
import patoolib
from glob import glob 
from multiprocessing.pool import Pool
import warnings
import struct
import pickle
warnings.filterwarnings('ignore')

# Biological libraries 
from biopandas.pdb import PandasPdb
from biopandas.mol2 import PandasMol2
import mol2vec
import prot2vec

# Regular DS libraries 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

# AI libraries 
import sklearn 

In [2]:
dat1 = "C:/Users/redha.cherif_artefac/GitHub_perso/Research_project/Input/training_data/train_dat1_baseline_301022.parquet"
dat2 = "C:/Users/redha.cherif_artefac/GitHub_perso/Research_project/Input/training_data/train_dat2_baseline_301022.parquet"
dat3 = "C:/Users/redha.cherif_artefac/GitHub_perso/Research_project/Input/training_data/train_dat3_baseline_301022.parquet"
test = "C:/Users/redha.cherif_artefac/GitHub_perso/Research_project/Input/test_data/test_set_f_301022.parquet"

train_dat1_baseline_301022 = pd.read_parquet(dat1)
train_dat2_baseline_301022 = pd.read_parquet(dat2)
train_dat3_baseline_301022 = pd.read_parquet(dat3)
test_set_f_301022 = pd.read_parquet(test)

In [3]:
from sklearn.metrics import mean_squared_error # to measure the capacity of the algo to predict from the train 
def rmse(y_actual_test,y_predicted_test):
    return np.sqrt(mean_squared_error(y_actual_test,y_predicted_test))

In [5]:
# explore the algorithm wrapped by RFE: https://machinelearningmastery.com/rfe-feature-selection-in-python/ 
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from matplotlib import pyplot

# get the dataset
def get_dataset(dataset):
    X = dataset.drop(['pdb_code','pKd_or_pKi'],axis=1)
    y = dataset['pKd_or_pKi']
    return X, y

# get a list of models to evaluate
def get_models():
	models = dict()
	# lr
	rfe = RFE(estimator=LinearRegression(), n_features_to_select=5)
	model = DecisionTreeRegressor()
	models['lr'] = Pipeline(steps=[('s',rfe),('m',model)])
	# cart
	rfe = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=5)
	model = DecisionTreeRegressor()
	models['cart'] = Pipeline(steps=[('s',rfe),('m',model)])
	# rf
	rfe = RFE(estimator=RandomForestRegressor(), n_features_to_select=5)
	model = DecisionTreeRegressor()
	models['rf'] = Pipeline(steps=[('s',rfe),('m',model)])
	# gbm
	rfe = RFE(estimator=GradientBoostingRegressor(), n_features_to_select=5)
	model = DecisionTreeRegressor()
	models['gbm'] = Pipeline(steps=[('s',rfe),('m',model)])
	return models

# evaluate a given model using cross-validation
def evaluate_model(model, X, y, test):
	model_fit = model.fit(X,y)
	preds = model_fit.predict(test.drop(['pdb_code','pKd_or_pKi'],axis=1))
	scores = rmse(test['pKd_or_pKi'],preds)
	#cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)
	#scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
	return scores

In [276]:
# define dataset
X, y = get_dataset(train_dat1_baseline_301022)
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
	scores = evaluate_model(model, X, y, test_set_f_301022)
	results.append(scores)
	names.append(name)
	print(f"{name},{round(scores,3)}")

lr,2.219
cart,2.808
rf,2.612
gbm,2.63


**We can see that linear regression is the best algorithm to perform feature selection as it has the lowest RMSE.**

In [6]:
# automatically select the number of features for RFE
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import RFECV

# define dataset
datasets = [train_dat1_baseline_301022, train_dat2_baseline_301022, train_dat3_baseline_301022]
for i in range(len(datasets)):
    X, y = get_dataset(datasets[i])
    # create pipeline
    rfe = RFECV(estimator=LinearRegression())
    model = LinearRegression()
    pipeline = Pipeline(steps=[('s',rfe),('m',model)])
    # evaluate model
    cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)
    n_scores = cross_val_score(pipeline, X, y, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1, error_score='raise')
    # report performance
    print(f"Performance for dataset {i+1}")
    print(f"Score : {round(mean(n_scores),3)}, std: {round(std(n_scores),3)}")
    print()

Performance for dataset 1
Score : -2.16, std: 1.171

Performance for dataset 2
Score : -1.691, std: 0.065

Performance for dataset 3
Score : -2.115, std: 0.878



In [15]:
datasets = [train_dat1_baseline_301022, train_dat2_baseline_301022, train_dat3_baseline_301022]
selected_feats_dats = []
test_sets = []

for i in range(len(datasets)):
    X, y = get_dataset(datasets[i])
    # fit RFE
    rfe.fit(X, y)
    # Nb of features selected 
    print(f'Feature selection for dataset {i+1}:')
    print(f"Total nb of feature: {rfe.n_features_in_}")
    print(f"Nb of selected features: {rfe.n_features_}")
    print()
    # summarize all features
    #for i in range(X.shape[1]):
    #	print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

    # Get columns to keep and create new dataframe with those only
    selected_feats = rfe.get_support(indices=True)
    selected_feats = np.insert(selected_feats,0,0)
    selected_feats = np.insert(selected_feats,len(selected_feats),datasets[i].shape[1]-1)
    selected_feats = np.unique(selected_feats)
    selected_feats_dat = datasets[i].iloc[:,selected_feats]
    test_set = test_set_f_301022.iloc[:,selected_feats]
    selected_feats_dats.append(selected_feats_dat)
    test_sets.append(test_set)

Feature selection for dataset 1:
Total nb of feature: 132
Nb of selected features: 37

Feature selection for dataset 2:
Total nb of feature: 132
Nb of selected features: 78

Feature selection for dataset 3:
Total nb of feature: 132
Nb of selected features: 54



## Training

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error # to measure the capacity of the algo to predict from the train 
from scipy.stats import pearsonr #to measure the correlation between the predicted constants and the ones in the train set

In [21]:
def rmse(y_actual_test,y_predicted_test):
    return np.sqrt(mean_squared_error(y_actual_test,y_predicted_test))

def pearson(y_actual_train,y_predicted_train):
    return pearsonr(y_actual_train,y_predicted_train)

In [22]:
linReg1 = LinearRegression()
linReg2 = LinearRegression()
linReg3 = LinearRegression()

In [23]:
trainings_sets = [(selected_feats_dats[0].drop(['pdb_code','pKd_or_pKi'],axis=1),selected_feats_dats[0]['pKd_or_pKi']),(selected_feats_dats[1].drop(['pdb_code','pKd_or_pKi'],axis=1),selected_feats_dats[1]['pKd_or_pKi']),(selected_feats_dats[2].drop(['pdb_code','pKd_or_pKi'],axis=1),selected_feats_dats[2]['pKd_or_pKi'])]
linReg_fit1 = linReg1.fit(trainings_sets[0][0],trainings_sets[0][1])
linReg_fit2 = linReg2.fit(trainings_sets[1][0],trainings_sets[1][1])
linReg_fit3 = linReg3.fit(trainings_sets[2][0],trainings_sets[2][1])

In [24]:
dat1_preds = linReg_fit1.predict(test_sets[0].drop(['pdb_code','pKd_or_pKi'],axis=1))
dat2_preds = linReg_fit2.predict(test_sets[1].drop(['pdb_code','pKd_or_pKi'],axis=1))
dat3_preds = linReg_fit3.predict(test_sets[2].drop(['pdb_code','pKd_or_pKi'],axis=1))

In [25]:
dat1_rmse = rmse(test_sets[0]['pKd_or_pKi'],dat1_preds) 
dat2_rmse = rmse(test_sets[1]['pKd_or_pKi'],dat2_preds) 
dat3_rmse = rmse(test_sets[2]['pKd_or_pKi'],dat3_preds)
print(dat1_rmse)
print(dat2_rmse)
print(dat3_rmse)

4.335959980615138
20.835570990735324
19.05233989106812


In [26]:
dat1_preds_train = linReg_fit1.predict(trainings_sets[0][0])
dat2_preds_train = linReg_fit2.predict(trainings_sets[1][0])
dat3_preds_train = linReg_fit3.predict(trainings_sets[2][0])

dat1_pearson = pearson(trainings_sets[0][1],dat1_preds_train) 
dat2_pearson = pearson(trainings_sets[1][1],dat2_preds_train) 
dat3_pearson = pearson(trainings_sets[2][1],dat3_preds_train)
print(dat1_pearson)
print(dat2_pearson)
print(dat3_pearson)

(0.4610059561731793, 6.711643848970794e-119)
(0.5084079821666755, 2.9801876191512593e-156)
(0.49421772808297776, 2.589267817197755e-149)


In [27]:
dat1_preds = pd.DataFrame(dat1_preds,columns=['score'])
dat1_preds["#code"] = test_sets[0]['pdb_code']
dat1_preds = dat1_preds[['#code','score']] 

In [28]:
dat2_preds = pd.DataFrame(dat2_preds,columns=['score'])
dat2_preds["#code"] = test_sets[1]['pdb_code']
dat2_preds = dat2_preds[['#code','score']] 

In [29]:
dat3_preds = pd.DataFrame(dat3_preds,columns=['score'])
dat3_preds["#code"] = test_sets[2]['pdb_code']
dat3_preds = dat3_preds[['#code','score']] 

In [34]:
dat_preds_mean = pd.DataFrame(dat3_preds,columns=['score'])
dat_preds_mean["#code"] = test_sets[0]['pdb_code']
dat_preds_mean = dat_preds_mean[['#code','score']] 

dat_preds_mean['score'] = dat1_preds['score'] + dat2_preds['score'] + dat3_preds['score']
dat_preds_mean['score'] = dat_preds_mean['score'] / 3

In [35]:
dat1_preds.to_csv('feats_sel_1_preds_dat1_011122.dat',index=False)
dat2_preds.to_csv('feats_sel_1_preds_dat2_011122.dat',index=False)
dat3_preds.to_csv('feats_sel_1_preds_dat3_011122.dat',index=False)
dat_preds_mean.to_csv('feats_sel_1_preds_mean_011122.dat',index=False)

In [42]:
pickle.dump(linReg_fit3, open('linReg_dat3_011122.pkl', 'wb'))

In [51]:
test_sets[2].to_parquet('test_dat3_feats_sel1_011122.parquet')

In [48]:
test_sets[0]

Unnamed: 0,pdb_code,lig_I_atom_count,lig_C.1_atom_count,lig_C.2_atom_mean,lig_C.3_atom_mean,lig_H_atom_mean,lig_N.4_atom_mean,lig_N.am_atom_mean,lig_O.co2_atom_mean,lig_C.ar_atom_mean,...,lig_nb_mean_double_diff,lig_nb_triple_bonds,lig_mean_triple_bonds,lig_nb_mean_triple_diff,lig_nb_ar_bonds,lig_mean_ar_bonds,lig_nb_mean_ar_diff,lig_nb_am_bonds,lig_mean_am_bonds,pKd_or_pKi
0,1a30,0.0,0.0,0.102041,0.204082,0.469388,0.020408,0.040816,0.122449,0.000000,...,1.958333,0.0,0.0,0.0,6.0,0.125000,5.875000,2.0,0.041667,4.30
1,1bcu,0.0,0.0,0.000000,0.000000,0.407407,0.000000,0.000000,0.000000,0.481481,...,0.000000,0.0,0.0,0.0,16.0,0.551724,15.448276,0.0,0.000000,3.28
2,1bzc,0.0,0.0,0.069767,0.093023,0.325581,0.000000,0.046512,0.116279,0.232558,...,1.954545,0.0,0.0,0.0,16.0,0.363636,15.636364,2.0,0.045455,4.92
3,1c5z,0.0,0.0,0.000000,0.000000,0.500000,0.000000,0.000000,0.000000,0.333333,...,0.000000,0.0,0.0,0.0,8.0,0.444444,7.555556,0.0,0.000000,4.01
4,1e66,0.0,0.0,0.050000,0.175000,0.475000,0.000000,0.000000,0.000000,0.225000,...,0.976744,0.0,0.0,0.0,11.0,0.255814,10.744186,0.0,0.000000,9.89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,5aba,0.0,0.0,0.000000,0.224490,0.530612,0.040816,0.000000,0.000000,0.122449,...,0.000000,0.0,0.0,0.0,6.0,0.117647,5.882353,0.0,0.000000,2.98
281,5c28,0.0,0.0,0.000000,0.181818,0.454545,0.000000,0.000000,0.000000,0.181818,...,0.000000,0.0,0.0,0.0,6.0,0.260870,5.739130,0.0,0.000000,5.66
282,5c2h,0.0,0.0,0.054545,0.127273,0.436364,0.000000,0.000000,0.000000,0.236364,...,1.965517,0.0,0.0,0.0,17.0,0.293103,16.706897,0.0,0.000000,11.09
283,5dwr,0.0,0.0,0.017857,0.125000,0.428571,0.017857,0.017857,0.000000,0.285714,...,0.983051,0.0,0.0,0.0,18.0,0.305085,17.694915,1.0,0.016949,11.22
