In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from gensim.models import word2vec
from mol2vec.features import mol2alt_sentence, MolSentence
# Pre-trained Word2Vec embedding used by `sentences2vec` below.
# NOTE(review): the file name suggests 300-dimensional vectors -- confirm.
# NOTE(review): gensim's `load` deserialises the file (pickle-based), so only
# point it at model files from a trusted source.
model = word2vec.Word2Vec.load('model_300dim.pkl')

from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.metrics import mean_squared_error, make_scorer, r2_score

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, Parallel
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.neighbors import KNeighborsRegressor
from sklearn import linear_model as lin
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import BaggingRegressor
from catboost import CatBoostRegressor

from tqdm import trange, tqdm

In [57]:
# Read the two spreadsheets: `dfs` is the small dataset, `dfb` the big one.
# NOTE(review): the file names suggest ~1400 and ~35000 rows -- confirm.
dfs, dfb = (pd.read_excel(path) for path in ('1400.xlsx', '35000.xlsx'))

# Preview the first rows of the small dataset.
dfs.head()

Unnamed: 0,Title,"IC50, mmg/ml","CC50-MDCK, mmg/ml",SI,Molecular weight,Hydrogen bond acceptors,Hydrogen bond donors,Polar SA,SMILES,Pictures
0,1007-Ya-213,2.7,500.0,185.185185,195.307,2,1,32.59,OCC\N=C(\[C@]12C)C[C@@H](C1(C)C)CC2,50.0
1,1007-Ya-213,0.7,447.0,638.571429,195.307,2,1,32.59,OCC\N=C(\[C@]12C)C[C@@H](C1(C)C)CC2,51.0
2,1008-Ya-187,9.9,144.0,14.545455,250.431,1,0,15.6,CCN(CC)CC\N=C(\[C@@]12C)C[C@H](C1(C)C)CC2,52.0
3,1009-As-106,8.3,500.0,60.240964,222.377,1,0,15.6,CN(C)CC\N=C(\[C@@]12C)C[C@H](C1(C)C)CC2,53.0
4,1010-Ya-208,39.4,143.0,3.629442,239.361,2,0,29.54,CN(C)CC(=O)O[C@H]1C[C@H](CC2)C(C)(C)[C@@]12C,54.0


In [2]:
def sentences2vec(sentences, model, unseen=None):
    """Embed every sentence as the sum of the vectors of its words.

    Parameters
    ----------
    sentences : iterable of iterables of str
        Each inner iterable is one sentence (a sequence of vocabulary keys).
    model : object exposing a gensim-style ``wv`` keyed-vectors attribute
        Provides ``index_to_key``, ``get_vector``, ``vector_size`` and
        ``vectors``.
    unseen : str, optional
        Vocabulary key whose vector replaces every out-of-vocabulary word.
        When falsy, out-of-vocabulary words are skipped instead.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_sentences, vector_size)``. A sentence that
        contributes no vectors (empty, or fully out-of-vocabulary with
        ``unseen`` unset) becomes a row of zeros so the result stays
        rectangular.

    Raises
    ------
    KeyError
        If ``unseen`` is given but is not itself a key of the model.
    """
    wv = model.wv
    # Build the vocabulary set once; membership tests are then O(1) per word.
    vocab = set(wv.index_to_key)
    # `get_vector` replaces the deprecated `word_vec` accessor.
    unseen_vec = wv.get_vector(unseen) if unseen else None

    vec = []
    for sentence in sentences:
        if unseen:
            word_vecs = [wv.get_vector(word) if word in vocab else unseen_vec
                         for word in sentence]
        else:
            word_vecs = [wv.get_vector(word) for word in sentence
                         if word in vocab]

        if word_vecs:
            vec.append(sum(word_vecs))
        else:
            # sum([]) would yield the int 0 and make the output ragged.
            vec.append(np.zeros(wv.vector_size, dtype=wv.vectors.dtype))

    if not vec:
        # No sentences at all: still return a 2-D array with the right width.
        return np.empty((0, wv.vector_size), dtype=wv.vectors.dtype)
    return np.array(vec)

In [4]:
# Turn each SMILES string into an RDKit molecule object.
dfs['mol'] = dfs['SMILES'].apply(Chem.MolFromSmiles)
dfb['mol'] = dfb['SMILES'].apply(Chem.MolFromSmiles)

# Radius passed to mol2alt_sentence when building the molecule "sentences".
RADIUS = 1

# Small dataset: molecule -> MolSentence -> summed embedding matrix X.
dfs['sentence'] = dfs['mol'].apply(
    lambda mol: MolSentence(mol2alt_sentence(mol, RADIUS))
)
X = sentences2vec(dfs['sentence'], model, unseen='UNK')

# Big dataset: same pipeline, producing X2.
dfb['sentence'] = dfb['mol'].apply(
    lambda mol: MolSentence(mol2alt_sentence(mol, RADIUS))
)
X2 = sentences2vec(dfb['sentence'], model, unseen='UNK')

[00:10:10] Conflicting single bond directions around double bond at index 55.
[00:10:10]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:10:10] Conflicting single bond directions around double bond at index 55.
[00:10:10]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:10:16] Conflicting single bond directions around double bond at index 7.
[00:10:16]   BondStereo set to STEREONONE and single bond directions set to NONE.
  unseen_vec = model.wv.word_vec(unseen)
  vec.append(sum([model.wv.word_vec(y) if y in set(sentence) & keys
  unseen_vec = model.wv.word_vec(unseen)
  vec.append(sum([model.wv.word_vec(y) if y in set(sentence) & keys


In [58]:
# Regression targets: the IC50 column of the small (Y) and big (Y2) dataset.
Y = dfs.loc[:, 'IC50, mmg/ml']
Y2 = dfb.loc[:, 'IC50']

In [60]:
# Best scores for IC50: average of two CatBoost models (seeds 0 and 1),
# trained on each fold of the small dataset plus the filtered big dataset.
x_, y_ = X[Y<310], Y[Y<310].values

# The big-dataset rows are identical for every fold and both models, so the
# mask and the filtered arrays are built once instead of four times per fold.
big_mask = (Y2<310).values
x_big, y_big = X2[big_mask], Y2[big_mask].values

tr_scores = []
scores = []
# NOTE(review): the fraction is computed from the unfiltered X but applied to
# the filtered x_, so a fold holds fewer than 844 training rows -- confirm
# whether an absolute train_size=844 was intended.
splitter = ShuffleSplit(5, train_size=844/X.shape[0], random_state=42)
for tr, vl in tqdm(splitter.split(x_), total=splitter.get_n_splits(), leave=False):
    # One stacked training set per fold, shared by both ensemble members.
    x_fit = np.vstack((x_[tr], x_big))
    y_fit = np.concatenate((y_[tr], y_big), 0)
    cat0 = CatBoostRegressor(verbose=0, random_seed=0).fit(x_fit, y_fit)
    cat1 = CatBoostRegressor(verbose=0, random_seed=1).fit(x_fit, y_fit)

    # R^2 of the averaged prediction; the train score covers only the
    # small-dataset part of the training data.
    tr_scores.append(r2_score(y_[tr], np.stack((
        cat0.predict(x_[tr]),
        cat1.predict(x_[tr])
    )).mean(axis=0)))
    scores.append(r2_score(y_[vl], np.stack((
        cat0.predict(x_[vl]),
        cat1.predict(x_[vl])
    )).mean(axis=0)))

print(*tr_scores)
print(*scores)
print(np.mean(scores))

                      

0.9200878383282312 0.9144158836108776 0.9017079254227198 0.9156014198694417 0.9027504504257565
0.7669829854617771 0.7794918856966079 0.803640679701646 0.7937784229704656 0.8175915306699788
0.7922971009000951




In [61]:
# Rebind Y to the CC50 column of the small dataset; from here on Y no longer
# holds the IC50 target.
Y = dfs.loc[:, 'CC50-MDCK, mmg/ml']

In [73]:
# Best scores for CC50: average of two depth-3 CatBoost models (seeds 0 and 1),
# trained on the small dataset only.
in_range = Y < 600
x_ = X[in_range]
y_ = Y[in_range].values

tr_scores, scores = [], []
# NOTE(review): the train fraction is derived from the unfiltered X but the
# split runs on the filtered x_ -- confirm this is intended.
cv = ShuffleSplit(5, train_size=844/X.shape[0], random_state=42)
for tr, vl in tqdm(cv.split(x_), leave=False):
    x_tr, y_tr = x_[tr], y_[tr]
    x_vl, y_vl = x_[vl], y_[vl]

    cat0 = CatBoostRegressor(verbose=0, random_state=0, depth=3).fit(x_tr, y_tr)
    cat1 = CatBoostRegressor(verbose=0, random_state=1, depth=3).fit(x_tr, y_tr)

    # Ensemble prediction = element-wise mean of the two models' outputs.
    tr_pred = np.mean((cat0.predict(x_tr), cat1.predict(x_tr)), axis=0)
    vl_pred = np.mean((cat0.predict(x_vl), cat1.predict(x_vl)), axis=0)

    tr_scores.append(r2_score(y_tr, tr_pred))
    scores.append(r2_score(y_vl, vl_pred))

print(*tr_scores)
print(*scores)
print(np.mean(scores))

                      

0.910811943894764 0.8953204680313808 0.8923404665438956 0.9051742630208762 0.8983650565901389
0.39684402818211706 0.4665628927796095 0.49026221396202774 0.366034274558133 0.4725362207170042
0.43844792603977834


