In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
!wget -c https://repo.continuum.io/archive/Anaconda3-5.1.0-Linux-x86_64.sh
!chmod +x Anaconda3-5.1.0-Linux-x86_64.sh
!bash ./Anaconda3-5.1.0-Linux-x86_64.sh -b -f -p /usr/local

In [0]:
import sys
sys.path.append('/usr/local/lib/python3.6/site-packages/')
!conda install -y -c rdkit rdkit;

In [0]:
!pip install git+https://github.com/samoturk/mol2vec;

In [0]:
import pandas as pd
import numpy as np
from gensim.models import word2vec
import rdkit
from rdkit import Chem
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

In [0]:
data = pd.read_csv("/content/drive/My Drive/SMAI_Final_Assignment/Q3/train.csv")
y = data['Binding Affinity']
data.drop(columns='Binding Affinity',inplace=True)

In [0]:
data['molecules'] = data['SMILES sequence'].apply(lambda x: Chem.MolFromSmiles(x))
model = word2vec.Word2Vec.load('/content/drive/My Drive/SMAI_Final_Assignment/Q3/model_300dim.pkl')

In [0]:
data['sentence'] = data.apply(lambda x: MolSentence(mol2alt_sentence(x['molecules'], 1)), axis=1)

data['mol2vec'] = [DfVec(x) for x in sentences2vec(data['sentence'], model, unseen='UNK')]
train_data = [x.vec for x in data['mol2vec']] 
X = np.array(train_data)
# print(X)
# print(X.shape)

In [0]:
y = y.values
# print(y)
# print(y.shape)

# Testing different models by doing train-validation split

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.80)

# Linear Regression

In [11]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
reg = model.fit(X_train,y_train)
y_pred = model.predict(X_test)
r2_lr = r2_score(y_test, y_pred)
mse_lr = mean_squared_error(y_test, y_pred)
mae_lr = mean_absolute_error(y_test, y_pred)
print("r2_score: ", r2_lr)
print("MSE: ", mse_lr)
print("MAE: ", mae_lr)

r2_score:  0.5750069991639465
MSE:  5.4877698298267115
MAE:  1.7480879979765147


# Ridge CV linear model

In [12]:
from sklearn import linear_model
reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
reg.fit(X_train,y_train)
y_pred = reg.predict(X_test)
r2_ridge = r2_score(y_test, y_pred)
mse_ridge = mean_squared_error(y_test, y_pred)
mae_ridge = mean_absolute_error(y_test, y_pred)
print("r2_score: ", r2_ridge)
print("MSE: ", mse_ridge)
print("MAE: ", mae_ridge)

r2_score:  0.5810688490461846
MSE:  5.409495512764446
MAE:  1.7328159191306627


# Bayesian Ridge model

In [13]:
reg = linear_model.BayesianRidge()
reg.fit(X_train,y_train)
y_pred = reg.predict(X_test)
r2_score(y_test, y_pred)
r2_bayessian = r2_score(y_test, y_pred)
mse_bayessian = mean_squared_error(y_test, y_pred)
mae_bayessian = mean_absolute_error(y_test, y_pred)
print("r2_score: ", r2_bayessian)
print("MSE: ", mse_bayessian)
print("MAE: ", mae_bayessian)

r2_score:  0.5817218347453195
MSE:  5.401063761625083
MAE:  1.7307113600260875


# Support Vector Regression (SVR)

## Testing using different values of parameters for SVR

In [14]:
from sklearn.svm import SVR
clf = SVR(C = 10.0, epsilon = 1.0)
clf.fit(X_train,y_train)
y_pred_clf = clf.predict(X_test)
r2_svr1 = r2_score(y_test, y_pred_clf)
mse_svr1 = mean_squared_error(y_test, y_pred_clf)
mae_svr1 = mean_absolute_error(y_test, y_pred_clf)
print("r2_score: ", r2_svr1)
print("MSE: ", mse_svr1)
print("MAE: ", mae_svr1)

r2_score:  0.5970496623104895
MSE:  5.203141467602705
MAE:  1.644531601370342


In [15]:
from sklearn.svm import SVR
clf = SVR(C = 20.0, epsilon = 1.0)
clf.fit(X_train,y_train)
y_pred_clf = clf.predict(X_test)
r2_svr2 = r2_score(y_test, y_pred_clf)
mse_svr2 = mean_squared_error(y_test, y_pred_clf)
mae_svr2 = mean_absolute_error(y_test, y_pred_clf)
print("r2_score: ", r2_svr2)
print("MSE: ", mse_svr2)
print("MAE: ", mae_svr2)

r2_score:  0.6080960035569034
MSE:  5.060504346278851
MAE:  1.6252358000906777


In [16]:
from sklearn.svm import SVR
clf = SVR(C = 50.0, epsilon = 0.5)
clf.fit(X_train,y_train)
y_pred_clf = clf.predict(X_test)
r2_svr3 = r2_score(y_test, y_pred_clf)
mse_svr3 = mean_squared_error(y_test, y_pred_clf)
mae_svr3 = mean_absolute_error(y_test, y_pred_clf)
print("r2_score: ", r2_svr3)
print("MSE: ", mse_svr3)
print("MAE: ", mae_svr3)

r2_score:  0.6197342917123341
MSE:  4.910223644044793
MAE:  1.5995100610516946


In [17]:
from sklearn.svm import SVR
clf = SVR(C = 50.0, epsilon = 1.0)
clf.fit(X_train,y_train)
y_pred_clf = clf.predict(X_test)
r2_svr4 = r2_score(y_test, y_pred_clf)
mse_svr4 = mean_squared_error(y_test, y_pred_clf)
mae_svr4= mean_absolute_error(y_test, y_pred_clf)
print("r2_score: ", r2_svr4)
print("MSE: ", mse_svr4)
print("MAE: ", mae_svr4)

r2_score:  0.6175475691365911
MSE:  4.938459944769171
MAE:  1.6110817360703773


In [18]:
from sklearn.svm import SVR
clf = SVR(C = 100.0, epsilon = 1.0)
clf.fit(X_train,y_train)
y_pred_clf = clf.predict(X_test)
r2_svr5 = r2_score(y_test, y_pred_clf)
mse_svr5 = mean_squared_error(y_test, y_pred_clf)
mae_svr5 = mean_absolute_error(y_test, y_pred_clf)
print("r2_score: ", r2_svr5)
print("MSE: ", mse_svr5)
print("MAE: ", mae_svr5)

r2_score:  0.621279926294635
MSE:  4.89026546399947
MAE:  1.6141310969788072


In [19]:
from sklearn.svm import SVR
clf = SVR(C = 100.0, epsilon = 1.5)
clf.fit(X_train,y_train)
y_pred_clf = clf.predict(X_test)
r2_svr6 = r2_score(y_test, y_pred_clf)
mse_svr6 = mean_squared_error(y_test, y_pred_clf)
mae_svr6 = mean_absolute_error(y_test, y_pred_clf)
print("r2_score: ", r2_svr6)
print("MSE: ", mse_svr6)
print("MAE: ", mae_svr6)

r2_score:  0.6183957191297136
MSE:  4.927508112776013
MAE:  1.6350381198637078


In [20]:
from sklearn.svm import SVR
clf = SVR(C = 120.0, epsilon = 1.5)
clf.fit(X_train,y_train)
y_pred_clf = clf.predict(X_test)
r2_svr7 = r2_score(y_test, y_pred_clf)
mse_svr7 = mean_squared_error(y_test, y_pred_clf)
mae_svr7 = mean_absolute_error(y_test, y_pred_clf)
print("r2_score: ", r2_svr7)
print("MSE: ", mse_svr7)
print("MAE: ", mae_svr7)

r2_score:  0.6179782699359777
MSE:  4.932898472349846
MAE:  1.6396363778654057


In [21]:
from sklearn.svm import SVR
clf = SVR(C = 150.0, epsilon = 1.0)
clf.fit(X_train,y_train)
y_pred_clf = clf.predict(X_test)
r2_svr8 = r2_score(y_test, y_pred_clf)
mse_svr8 = mean_squared_error(y_test, y_pred_clf)
mae_svr8 = mean_absolute_error(y_test, y_pred_clf)
print("r2_score: ", r2_svr8)
print("MSE: ", mse_svr8)
print("MAE: ", mae_svr8)

r2_score:  0.6202793434294804
MSE:  4.903185602563717
MAE:  1.6314457809594343


In [22]:
from sklearn.svm import SVR
clf = SVR(C = 150.0, epsilon = 1.5)
clf.fit(X_train,y_train)
y_pred_clf = clf.predict(X_test)
r2_svr9 = r2_score(y_test, y_pred_clf)
mse_svr9 = mean_squared_error(y_test, y_pred_clf)
mae_svr9 = mean_absolute_error(y_test, y_pred_clf)
print("r2_score: ", r2_svr9)
print("MSE: ", mse_svr9)
print("MAE: ", mae_svr9)

r2_score:  0.6168225397920979
MSE:  4.947821967566304
MAE:  1.6466994694563037


# Comparison of the different models

In [23]:
lr = ["Linear Regression model", r2_lr, mse_lr, mae_lr]
ridge = ["Ridge CV model", r2_ridge, mse_ridge, mae_ridge]
bayessian = ["Bayessian Ridge model", r2_bayessian, mse_bayessian, mae_bayessian]
svr1 = ["SVR model", r2_svr1, mse_svr1, mae_svr1]
data = [lr, ridge, bayessian, svr1]
df1 = pd.DataFrame(data, columns = ['Model', 'r2 score', 'MSE', 'MAE'])
df1

Unnamed: 0,Model,r2 score,MSE,MAE
0,Linear Regression model,0.575007,5.48777,1.748088
1,Ridge CV model,0.581069,5.409496,1.732816
2,Bayessian Ridge model,0.581722,5.401064,1.730711
3,SVR model,0.59705,5.203141,1.644532


# Comparison of SVR models by taking different parameters

In [24]:
svr1 = ["C = 10, epsilon = 1.0", r2_svr1, mse_svr1, mae_svr1]
svr2 = ["C = 20, epsilon = 1.0", r2_svr2, mse_svr2, mae_svr2]
svr3 = ["C = 50, epsilon = 0.5", r2_svr3, mse_svr3, mae_svr3]
svr4 = ["C = 50, epsilon = 1.0", r2_svr4, mse_svr4, mae_svr4]
svr5 = ["C = 100, epsilon = 1.0", r2_svr5, mse_svr5, mae_svr5]
svr6 = ["C = 100, epsilon = 1.5", r2_svr6, mse_svr6, mae_svr6]
svr7 = ["C = 120, epsilon = 1.5", r2_svr7, mse_svr7, mae_svr7]
svr8 = ["C = 150, epsilon = 1.0", r2_svr8, mse_svr8, mae_svr8]
svr9 = ["C = 150, epsilon = 1.5", r2_svr9, mse_svr9, mae_svr9]

data = [svr1, svr2, svr3, svr4, svr5, svr6, svr7, svr8, svr9]
df1 = pd.DataFrame(data, columns = ['SVR Model', 'r2 score', 'MSE', 'MAE'])
df1

Unnamed: 0,SVR Model,r2 score,MSE,MAE
0,"C = 10, epsilon = 1.0",0.59705,5.203141,1.644532
1,"C = 20, epsilon = 1.0",0.608096,5.060504,1.625236
2,"C = 50, epsilon = 0.5",0.619734,4.910224,1.59951
3,"C = 50, epsilon = 1.0",0.617548,4.93846,1.611082
4,"C = 100, epsilon = 1.0",0.62128,4.890265,1.614131
5,"C = 100, epsilon = 1.5",0.618396,4.927508,1.635038
6,"C = 120, epsilon = 1.5",0.617978,4.932898,1.639636
7,"C = 150, epsilon = 1.0",0.620279,4.903186,1.631446
8,"C = 150, epsilon = 1.5",0.616823,4.947822,1.646699


# Test data

In [0]:
data1 = pd.read_csv("/content/drive/My Drive/SMAI_Final_Assignment/Q3/final_test.csv")
data2 = data1
data1.drop(columns='Binding Affinity',inplace=True)
# X_test = data1
# X_test
# data1

In [0]:
data1['molecules'] = data1['SMILES sequence'].apply(lambda x: Chem.MolFromSmiles(x))
model = word2vec.Word2Vec.load('/content/drive/My Drive/SMAI_Final_Assignment/Q3/model_300dim.pkl')
data1['sentence'] = data1.apply(lambda x: MolSentence(mol2alt_sentence(x['molecules'], 1)), axis=1)

data1['mol2vec'] = [DfVec(x) for x in sentences2vec(data1['sentence'], model, unseen='UNK')]
X_test = np.array([x.vec for x in data1['mol2vec']])

In [0]:
from sklearn.svm import SVR
clf = SVR(C = 100.0, epsilon = 1.0)
clf.fit(X,y)
y_pred_clf = clf.predict(X_test)

# Saving the predictions in csv file

In [0]:
data2 = pd.read_csv("/content/drive/My Drive/SMAI_Final_Assignment/Q3/final_test.csv")
data2.drop(columns='Binding Affinity',inplace=True)
# data2

In [0]:
dfObj = pd.DataFrame(y_pred_clf)
data2['Binding Affinity'] = y_pred_clf

In [0]:
import csv
data2.to_csv("submission.csv",index=None)

In [0]:
!cp submission.csv "drive/My Drive/smai"