In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from gensim.models import word2vec
from mol2vec.features import mol2alt_sentence, MolSentence
model = word2vec.Word2Vec.load('model_300dim.pkl')

from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.metrics import mean_squared_error, make_scorer, r2_score

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, Parallel
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.neighbors import KNeighborsRegressor
from sklearn import linear_model as lin
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import BaggingRegressor
from catboost import CatBoostRegressor

from tqdm import trange, tqdm

In [2]:
def sentences2vec(sentences, model, unseen=None):
    """Embed every sentence as the sum of the vectors of its words.

    Parameters
    ----------
    sentences : iterable of iterables of str
        Tokenised sentences (here: mol2vec substructure identifiers).
    model : Word2Vec-like object
        Must expose ``model.wv.index_to_key``, ``model.wv.get_vector`` and
        ``model.wv.vector_size``.
    unseen : str, optional
        Vocabulary key whose vector replaces every out-of-vocabulary word.
        When falsy, out-of-vocabulary words are skipped instead.

    Returns
    -------
    numpy.ndarray
        One row per sentence. A sentence that contributes no vectors (empty,
        or fully out-of-vocabulary with ``unseen`` unset) becomes a zero row.
    """
    wv = model.wv
    vocab = set(wv.index_to_key)
    # `get_vector` is used instead of `word_vec`, which emitted warnings on
    # every call in this notebook.
    unseen_vec = wv.get_vector(unseen) if unseen else None

    rows = []
    for sentence in sentences:
        # Membership is tested against the vocabulary set directly; rebuilding
        # `set(sentence) & vocab` for every word was quadratic and redundant.
        if unseen:
            word_vecs = [wv.get_vector(word) if word in vocab else unseen_vec
                         for word in sentence]
        else:
            word_vecs = [wv.get_vector(word) for word in sentence if word in vocab]

        if word_vecs:
            rows.append(sum(word_vecs))
        else:
            # sum([]) is the int 0, which would make the stacked result ragged.
            rows.append(np.zeros(wv.vector_size, dtype=np.float32))
    return np.array(rows)

In [3]:
# Load both spreadsheets from the working directory.
# Only `dfs` is referenced by the cells visible in this notebook.
dfs = pd.read_excel('1400.xlsx')
dfb = pd.read_excel('35000.xlsx')
# Preview the first rows of `dfs`.
dfs.head()

Unnamed: 0,Title,"IC50, mmg/ml","CC50-MDCK, mmg/ml",SI,Molecular weight,Hydrogen bond acceptors,Hydrogen bond donors,Polar SA,SMILES,Pictures
0,1007-Ya-213,2.7,500.0,185.185185,195.307,2,1,32.59,OCC\N=C(\[C@]12C)C[C@@H](C1(C)C)CC2,50.0
1,1007-Ya-213,0.7,447.0,638.571429,195.307,2,1,32.59,OCC\N=C(\[C@]12C)C[C@@H](C1(C)C)CC2,51.0
2,1008-Ya-187,9.9,144.0,14.545455,250.431,1,0,15.6,CCN(CC)CC\N=C(\[C@@]12C)C[C@H](C1(C)C)CC2,52.0
3,1009-As-106,8.3,500.0,60.240964,222.377,1,0,15.6,CN(C)CC\N=C(\[C@@]12C)C[C@H](C1(C)C)CC2,53.0
4,1010-Ya-208,39.4,143.0,3.629442,239.361,2,0,29.54,CN(C)CC(=O)O[C@H]1C[C@H](CC2)C(C)(C)[C@@]12C,54.0


In [4]:
# Parse every SMILES string into an RDKit molecule.
dfs['mol'] = dfs['SMILES'].apply(Chem.MolFromSmiles)

# MolFromSmiles returns None (it does not raise) for SMILES it cannot parse.
# Fail here and name the offending rows instead of erroring inside mol2vec.
bad_smiles = dfs.loc[dfs['mol'].isna(), 'SMILES']
if not bad_smiles.empty:
    raise ValueError(f'Unparseable SMILES at rows {bad_smiles.index.tolist()}')

# Extract descriptors (currently disabled)
# dfs['tpsa'] = dfs['mol'].apply(lambda x: Descriptors.TPSA(x))
# dfs['mol_w'] = dfs['mol'].apply(lambda x: Descriptors.ExactMolWt(x))
# dfs['num_valence_electrons'] = dfs['mol'].apply(lambda x: Descriptors.NumValenceElectrons(x))
# dfs['num_heteroatoms'] = dfs['mol'].apply(lambda x: Descriptors.NumHeteroatoms(x))

# Turn each molecule into a mol2vec "sentence" of substructure identifiers
# (radius RADIUS), then embed it as the sum of its word vectors. Identifiers
# missing from the Word2Vec vocabulary are replaced by the 'UNK' vector.
RADIUS = 1
dfs['sentence'] = dfs['mol'].apply(
    lambda mol: MolSentence(mol2alt_sentence(mol, RADIUS))
)
X = sentences2vec(dfs['sentence'], model, unseen='UNK')

[00:13:36] Conflicting single bond directions around double bond at index 55.
[00:13:36]   BondStereo set to STEREONONE and single bond directions set to NONE.
  unseen_vec = model.wv.word_vec(unseen)
  vec.append(sum([model.wv.word_vec(y) if y in set(sentence) & keys


In [5]:
# Regression target: the 'IC50, mmg/ml' column of `dfs`.
Y = dfs['IC50, mmg/ml']
# Fit the target standardiser on rows with IC50 < 310 only; the experiment
# cells in this notebook apply the same cutoff.
# NOTE(review): the reason for the 310 cutoff is not visible here —
# presumably outlier removal; confirm.
scaler = StandardScaler().fit(Y[Y<310].values.reshape(-1,1))

# Эксперименты

In [60]:
# Metric for comparison with the baseline: MSE of CatBoost on the
# standardised target, over 5 shuffled splits.
# NOTE(review): train_size is 844 divided by the unfiltered row count while
# the split runs on the filtered rows, so each fold trains on fewer than 844
# rows whenever the Y < 310 filter drops anything — confirm this is intended.
keep = Y < 310
y_scaled = scaler.transform(Y[keep].values.reshape(-1, 1))
scores = cross_val_score(
    estimator=CatBoostRegressor(verbose=0),
    X=X[keep],
    y=y_scaled,
    scoring=make_scorer(mean_squared_error),
    cv=ShuffleSplit(n_splits=5, train_size=844 / X.shape[0], random_state=42),
)
print(*scores)
print(scores.mean())

0.7527590130162616 0.7346272793834088 0.7756177678498603 0.8508485473439311 0.8396880551564234
0.7907081325499771


In [59]:
# Model determinism check: CatBoost without an explicit seed, evaluated with
# the estimator's default scorer on the raw (unscaled) target.
keep = Y < 310
scores = cross_val_score(
    estimator=CatBoostRegressor(verbose=0),
    X=X[keep],
    y=Y[keep],
    cv=ShuffleSplit(n_splits=5, train_size=844 / X.shape[0], random_state=42),
)
print(*scores)
print(scores.mean())

0.14839987032706736 0.23149252773124784 0.1645566540139063 0.18522514798746736 0.2706365614812847
0.20006215230819474


In [85]:
# With larger substructures (radius 2) the scores get slightly worse on average.
# The radius-2 features live in their own names: overwriting the global `X` /
# `RADIUS` would make every other experiment cell silently switch to radius-2
# features on a top-to-bottom run.
RADIUS_WIDE = 2
X_r2 = sentences2vec(
    dfs['mol'].apply(lambda mol: MolSentence(mol2alt_sentence(mol, RADIUS_WIDE))),
    model,
    unseen='UNK',
)
scores=cross_val_score(
    CatBoostRegressor(verbose=0),
    X_r2[Y<310], Y[Y<310],
    cv=ShuffleSplit(5, train_size=844/X_r2.shape[0], random_state=42)
)
print(*scores)
print(scores.mean())

  unseen_vec = model.wv.word_vec(unseen)
  vec.append(sum([model.wv.word_vec(y) if y in set(sentence) & keys


0.12380771004400482 0.23607717446589116 0.1566860092178055 0.17503544900610124 0.24714859306168313
0.18775098715909716


In [97]:
# Overfitting is present: compare train R^2 with validation R^2 per split.
# NOTE(review): train_size is 844 over the unfiltered row count, but the split
# is applied to the filtered rows — confirm the resulting fold size is intended.
keep = Y < 310
x_ = X[keep]
y_ = Y[keep].values
splitter = ShuffleSplit(n_splits=5, train_size=844 / X.shape[0], random_state=42)

tr_scores, scores = [], []
for tr, vl in tqdm(splitter.split(x_), leave=False):
    cat = CatBoostRegressor(verbose=0, random_seed=0).fit(x_[tr], y_[tr])
    tr_scores += [cat.score(x_[tr], y_[tr])]
    scores += [cat.score(x_[vl], y_[vl])]

# Train scores, validation scores, then the mean validation score.
print(*tr_scores)
print(*scores)
print(np.mean(scores))

                      

0.9699013162199536 0.9576584989395325 0.9585744202381216 0.9664182077241886 0.9574434172409289
0.14839987032706736 0.23149252773124784 0.1645566540139063 0.18522514798746736 0.2706365614812847
0.20006215230819474




In [130]:
# k-nearest-neighbours regressor (k=9): train vs validation R^2 per split.
keep = Y < 310
x_ = X[keep]
y_ = Y[keep].values
splitter = ShuffleSplit(n_splits=5, train_size=844 / X.shape[0], random_state=42)

tr_scores, scores = [], []
for tr, vl in tqdm(splitter.split(x_), leave=False):
    # The variable keeps the name `cat` used by the other experiment cells.
    cat = KNeighborsRegressor(n_neighbors=9).fit(x_[tr], y_[tr])
    tr_scores += [cat.score(x_[tr], y_[tr])]
    scores += [cat.score(x_[vl], y_[vl])]

print(*tr_scores)
print(*scores)
print(np.mean(scores))

                      

0.35095021836864204 0.32654181470931165 0.36081349733327195 0.361737566840167 0.3213555241395937
0.20488595677579546 0.2544350494078351 0.16593775693751234 0.15627937357948085 0.21803731988553054
0.19991509131723087




In [145]:
# Distance-weighted k-nearest-neighbours regressor (k=15): train vs validation
# R^2 per split.
keep = Y < 310
x_ = X[keep]
y_ = Y[keep].values
splitter = ShuffleSplit(n_splits=5, train_size=844 / X.shape[0], random_state=42)

tr_scores, scores = [], []
for tr, vl in tqdm(splitter.split(x_), leave=False):
    # The variable keeps the name `cat` used by the other experiment cells.
    cat = KNeighborsRegressor(n_neighbors=15, weights='distance').fit(x_[tr], y_[tr])
    tr_scores += [cat.score(x_[tr], y_[tr])]
    scores += [cat.score(x_[vl], y_[vl])]

print(*tr_scores)
print(*scores)
print(np.mean(scores))

                      

0.9793474756858217 0.9681860655540724 0.9688656235773144 0.9759661683721067 0.9692737385017425
0.1693461535313039 0.23356838998889096 0.17354977760705081 0.15822437904803444 0.2399334695105878
0.19492443393717357




In [161]:
# k-nearest-neighbours regressor (k=10) with the cosine metric: train vs
# validation R^2 per split.
keep = Y < 310
x_ = X[keep]
y_ = Y[keep].values
splitter = ShuffleSplit(n_splits=5, train_size=844 / X.shape[0], random_state=42)

tr_scores, scores = [], []
for tr, vl in tqdm(splitter.split(x_), leave=False):
    # The variable keeps the name `cat` used by the other experiment cells.
    cat = KNeighborsRegressor(n_neighbors=10, metric='cosine').fit(x_[tr], y_[tr])
    tr_scores += [cat.score(x_[tr], y_[tr])]
    scores += [cat.score(x_[vl], y_[vl])]

print(*tr_scores)
print(*scores)
print(np.mean(scores))

                      

0.33373681893769425 0.3139887972535953 0.35655491842121567 0.3665608824337916 0.30243763745309615
0.16892845394679068 0.23563479346563232 0.18201623903992215 0.18122234860119846 0.21625418059822132
0.196811203130353




In [132]:
# Blend: unweighted mean of CatBoost and 9-NN predictions, scored with R^2.
keep = Y < 310
x_ = X[keep]
y_ = Y[keep].values
splitter = ShuffleSplit(n_splits=5, train_size=844 / X.shape[0], random_state=42)

tr_scores, scores = [], []
for tr, vl in tqdm(splitter.split(x_), leave=False):
    cat0 = CatBoostRegressor(verbose=0, random_seed=0).fit(x_[tr], y_[tr])
    cat1 = KNeighborsRegressor(n_neighbors=9).fit(x_[tr], y_[tr])

    # Average the members' predictions row-wise on each part of the split.
    tr_pred = np.stack([member.predict(x_[tr]) for member in (cat0, cat1)]).mean(axis=0)
    tr_scores.append(r2_score(y_[tr], tr_pred))
    vl_pred = np.stack([member.predict(x_[vl]) for member in (cat0, cat1)]).mean(axis=0)
    scores.append(r2_score(y_[vl], vl_pred))

print(*tr_scores)
print(*scores)
print(np.mean(scores))

                      

0.794106857774281 0.7769264797826585 0.7868504824021574 0.7938527525896671 0.7751499724968648
0.2450629175103337 0.2886849031826818 0.22490734467838736 0.22260916585854085 0.28701675831941875
0.25365621790987253




In [135]:
# Blend of CatBoost and 9-NN (unweighted mean of predictions), scored with MSE
# on the standardised target.
keep = Y < 310
x_ = X[keep]
y_ = scaler.transform(Y[keep].values.reshape(-1, 1))[:, 0]
splitter = ShuffleSplit(n_splits=5, train_size=844 / X.shape[0], random_state=42)

tr_scores, scores = [], []
for tr, vl in tqdm(splitter.split(x_), leave=False):
    cat0 = CatBoostRegressor(verbose=0, random_seed=0).fit(x_[tr], y_[tr])
    cat1 = KNeighborsRegressor(n_neighbors=9).fit(x_[tr], y_[tr])

    # Average the members' predictions row-wise on each part of the split.
    tr_pred = np.stack([member.predict(x_[tr]) for member in (cat0, cat1)]).mean(axis=0)
    tr_scores.append(mean_squared_error(y_[tr], tr_pred))
    vl_pred = np.stack([member.predict(x_[vl]) for member in (cat0, cat1)]).mean(axis=0)
    scores.append(mean_squared_error(y_[vl], vl_pred))

print(*tr_scores)
print(*scores)
print(np.mean(scores))

                      

0.22317784867397267 0.23035028055548395 0.22405909369547086 0.19925859433444185 0.199070984563035
0.667315172801947 0.6799562662284634 0.719588749152046 0.8118093700205321 0.8208301627032477
0.7398999441812473




In [31]:
import warnings

# Silence warnings raised from sklearn modules for the rest of the session.
warnings.filterwarnings('ignore', module='sklearn')

# Lasso regression (alpha=2): train vs validation R^2 per split.
keep = Y < 310
x_ = X[keep]
y_ = Y[keep].values
splitter = ShuffleSplit(n_splits=5, train_size=844 / X.shape[0], random_state=42)

tr_scores, scores = [], []
for tr, vl in tqdm(splitter.split(x_), leave=False):
    # The variable keeps the name `cat` used by the other experiment cells.
    cat = lin.Lasso(alpha=2).fit(x_[tr], y_[tr])
    tr_scores += [cat.score(x_[tr], y_[tr])]
    scores += [cat.score(x_[vl], y_[vl])]

print(*tr_scores)
print(*scores)
print(np.mean(scores))

0it [00:00, ?it/s]

                      

0.3719609899584321 0.3265582381806946 0.34015715356099085 0.3294053283578676 0.2969074866389756
0.0760145861021908 0.1830600630705449 0.1573983936820743 0.1682876450221934 0.19514925099109992
0.15598198777362066


In [49]:
# The learned meta-model ("smart head") gets infected by CatBoost's overfitting.
x_, y_ = X[Y<310][:,], Y[Y<310].values
tr_scores = []
scores = []
for tr, vl in tqdm(ShuffleSplit(5, train_size=844/X.shape[0], random_state=42).split(x_), leave=False):
    # Deliberately named `stack`, not `model`: `model` is the Word2Vec model
    # loaded in the import cell and passed to sentences2vec, so rebinding it
    # here would break any later featurisation call.
    stack = StackingRegressor(
        estimators=[
            ('cat', CatBoostRegressor(verbose=0, random_seed=0)),
            ('knn', KNeighborsRegressor(9)),
            ('lin', lin.Lasso(2)),
        ],
        final_estimator=lin.LinearRegression(),
        cv=2,
    ).fit(x_[tr], y_[tr])

    tr_scores.append(r2_score(y_[tr], stack.predict(x_[tr])))
    scores.append(r2_score(y_[vl], stack.predict(x_[vl])))

print(*tr_scores)
print(*scores)
print(np.mean(scores))

                      

0.7308801982669424 0.6005946382234156 0.6786620709798579 0.8063414271040305 0.7711212361985645
0.2289482095957004 0.2536151157639944 0.23992316876813868 0.2418974976623579 0.24911117019965467
0.2426990323979692




In [53]:
# Bagging regressor (30 estimators): train vs validation R^2 per split.
x_, y_ = X[Y<310][:,:], Y[Y<310].values
tr_scores = []
scores = []
for tr, vl in tqdm(ShuffleSplit(5, train_size=844/X.shape[0], random_state=42).split(x_), leave=False):
    # Seeded so the scores are reproducible, like the CatBoost models
    # (random_seed=0) and the splitter (random_state=42) in this notebook.
    cat = BaggingRegressor(n_estimators=30, random_state=0)
    cat.fit(x_[tr], y_[tr])
    tr_scores.append(cat.score(x_[tr], y_[tr]))
    scores.append(cat.score(x_[vl], y_[vl]))
print(*tr_scores)
print(*scores)
print(np.mean(scores))

                      

0.8642782866685619 0.858980319067788 0.8539375009022928 0.883216439603434 0.8445181427648951
0.10105757295882145 0.23055254351818766 0.20652607631519915 0.17407955621510363 0.227532269023776
0.18794960360621757




In [54]:
# Blend: unweighted mean of CatBoost, 9-NN, Lasso and Bagging predictions,
# scored with R^2.
x_, y_ = X[Y<310][:,], Y[Y<310].values
tr_scores = []
scores = []
for tr, vl in tqdm(ShuffleSplit(5, train_size=844/X.shape[0], random_state=42).split(x_), leave=False):
    cat0 = CatBoostRegressor(verbose=0, random_seed=0).fit(x_[tr], y_[tr])
    cat1 = KNeighborsRegressor(9).fit(x_[tr], y_[tr])
    cat2 = lin.Lasso(2).fit(x_[tr], y_[tr])
    # Seeded so the blend's scores are reproducible, like the other members.
    cat3 = BaggingRegressor(n_estimators=30, random_state=0).fit(x_[tr], y_[tr])

    tr_scores.append(r2_score(y_[tr], np.stack((
        cat0.predict(x_[tr]),
        cat1.predict(x_[tr]),
        cat2.predict(x_[tr]),
        cat3.predict(x_[tr])
    )).mean(axis=0)))
    scores.append(r2_score(y_[vl], np.stack((
        cat0.predict(x_[vl]),
        cat1.predict(x_[vl]),
        cat2.predict(x_[vl]),
        cat3.predict(x_[vl])
    )).mean(axis=0)))

print(*tr_scores)
print(*scores)
print(np.mean(scores))

                      

0.7610285772475057 0.7354940577641109 0.7460820687980421 0.7520816092320881 0.7312390573799125
0.22385155677003343 0.28371837488812934 0.2592438662887604 0.24266079804200213 0.2782758718525641
0.2575500935682979




# Лучшая модель

In [32]:
# Best model: unweighted mean of CatBoost, 9-NN and Lasso predictions,
# scored with R^2 on each split.
keep = Y < 310
x_ = X[keep]
y_ = Y[keep].values
splitter = ShuffleSplit(n_splits=5, train_size=844 / X.shape[0], random_state=42)

tr_scores, scores = [], []
for tr, vl in tqdm(splitter.split(x_), leave=False):
    cat0 = CatBoostRegressor(verbose=0, random_seed=0).fit(x_[tr], y_[tr])
    cat1 = KNeighborsRegressor(n_neighbors=9).fit(x_[tr], y_[tr])
    cat2 = lin.Lasso(alpha=2).fit(x_[tr], y_[tr])

    # Average the members' predictions row-wise on each part of the split.
    tr_pred = np.stack([member.predict(x_[tr]) for member in (cat0, cat1, cat2)]).mean(axis=0)
    tr_scores.append(r2_score(y_[tr], tr_pred))
    vl_pred = np.stack([member.predict(x_[vl]) for member in (cat0, cat1, cat2)]).mean(axis=0)
    scores.append(r2_score(y_[vl], vl_pred))

print(*tr_scores)
print(*scores)
print(np.mean(scores))

                      

0.7008415983602987 0.6723914268675493 0.6893128648940572 0.6920824006661079 0.6656280633307394
0.23545554748513797 0.28646157233239944 0.24285475122502387 0.24210238827945485 0.2842303692744417
0.25822092571929156


