In [1]:
!pip install catboost
!pip install rdkit

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3
Collecting rdkit
  Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.5


In [2]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from rdkit import Chem
from rdkit.Chem import Descriptors, PandasTools, AllChem
import xgboost as xgb

In [3]:
# Загрузите данные
df = pd.read_csv('/content/drive/MyDrive/NTO/НТО_ИИ_ФИНАЛ_23/Информация о связывании медицинского радионуклида различными молекулами.csv')

# Добавьте молекулярные дескрипторы
PandasTools.AddMoleculeColumnToFrame(df, 'smiles')
df['MolWt'] = df['ROMol'].apply(lambda mol: Descriptors.ExactMolWt(mol))
df['LogP'] = df['ROMol'].apply(lambda mol: Descriptors.MolLogP(mol))
df['HBA'] = df['ROMol'].apply(lambda mol: Descriptors.NumHAcceptors(mol))
# df['HBD'] = df['ROMol'].apply(lambda mol: Descriptors.NumHDonors(mol))


# Создайте молекулярные отпечатки
fp_list = []
for mol in df['ROMol']:
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=512)
    fp_list.append(fp)

# Преобразуйте молекулярные отпечатки в DataFrame
fp_df = pd.DataFrame(data=[[bit for bit in line] for line in fp_list], columns=[f'FP_{i}' for i in range(512)])

# Удалите ненужные столбцы
df.drop(['ROMol', 'id', 'smiles'], axis=1, inplace=True)

# Объедините данные с молекулярными отпечатками
df = pd.concat([df, fp_df], axis=1)
df

Unnamed: 0,lgK,MolWt,LogP,HBA,FP_0,FP_1,FP_2,FP_3,FP_4,FP_5,...,FP_502,FP_503,FP_504,FP_505,FP_506,FP_507,FP_508,FP_509,FP_510,FP_511
0,3.41,104.047344,-0.1581,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3.25,132.078644,0.6221,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3.22,130.062994,0.3761,2,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2.78,144.078644,0.7662,2,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2.82,152.047344,0.8046,2,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,1.64,167.021858,1.2930,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
243,3.13,104.047344,-0.1581,2,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
244,2.76,118.062994,0.2320,2,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
245,2.99,118.062994,0.0879,2,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('lgK', axis=1), df['lgK'], test_size=0.3, random_state=42)

In [5]:
model = CatBoostRegressor(iterations=1000, learning_rate=0.2, depth=6, random_state=42)
model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=True)

# Предскажите значения для тестовой выборки
y_pred = model.predict(X_test)

# Вычислите среднеквадратичное отклонение (RMSE)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse}')


0:	learn: 5.9788222	test: 4.3319411	best: 4.3319411 (0)	total: 60.7ms	remaining: 1m
1:	learn: 5.4653360	test: 3.9639874	best: 3.9639874 (1)	total: 75.2ms	remaining: 37.5s
2:	learn: 5.0931836	test: 3.8152736	best: 3.8152736 (2)	total: 97.9ms	remaining: 32.5s
3:	learn: 4.6755522	test: 3.4634268	best: 3.4634268 (3)	total: 106ms	remaining: 26.4s
4:	learn: 4.2930960	test: 3.2645645	best: 3.2645645 (4)	total: 118ms	remaining: 23.4s
5:	learn: 4.0394083	test: 3.2103821	best: 3.2103821 (5)	total: 133ms	remaining: 22s
6:	learn: 3.8195023	test: 3.1716404	best: 3.1716404 (6)	total: 167ms	remaining: 23.8s
7:	learn: 3.6619387	test: 3.1639912	best: 3.1639912 (7)	total: 174ms	remaining: 21.6s
8:	learn: 3.5124940	test: 3.1039002	best: 3.1039002 (8)	total: 186ms	remaining: 20.4s
9:	learn: 3.3482664	test: 3.1222085	best: 3.1039002 (8)	total: 198ms	remaining: 19.6s
10:	learn: 3.2216824	test: 3.0925173	best: 3.0925173 (10)	total: 205ms	remaining: 18.4s
11:	learn: 3.1227114	test: 3.0877707	best: 3.0877707 (

In [37]:
model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=False)

<catboost.core.CatBoostRegressor at 0x79b6bdd50ca0>

In [38]:
# Предскажите значения для тестовой выборки
y_pred = model.predict(X_test)

# Вычислите среднеквадратичное отклонение (RMSE)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse}')


RMSE: 2.2119600373016226


In [6]:
# Загрузите список столбцов
columns = ['MolWt', 'LogP', 'HBA']
for i in range(1024):
    columns.append(f'FP_{i}')

# Определите функцию для предсказания lgK по строке smiles
def predict_lgk(smiles):
    # Создайте молекулярные дескрипторы
    mol = Chem.MolFromSmiles(smiles)
    MolWt = Descriptors.ExactMolWt(mol)
    LogP = Descriptors.MolLogP(mol)
    HBA = Descriptors.NumHAcceptors(mol)

    # Создайте молекулярные отпечатки
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)

    # Создайте DataFrame с дескрипторами и отпечатками
    data = pd.DataFrame([[MolWt, LogP, HBA] + [bit for bit in fp]], columns=columns)

    # Предскажите lgK
    lgk = model.predict(data)[0]

    return lgk

In [12]:
bdf = pd.read_csv('/content/drive/MyDrive/NTO/НТО_ИИ_ФИНАЛ_23/iupac_high-confidence_v1_0.csv')
bdf = bdf.drop_duplicates(subset='SMILES')
bdf

Unnamed: 0,entry_#,SMILES,pka_type,pka_value,T,remarks,method,assessment,ref,ref_remarks,entry_remarks,original_IUPAC_names,name_contributors,num_name_contributors,original_IUPAC_nicknames,source,unique_ID,pressure,acidity_label
0,0001,CC(=N)N,pKAH1,12.4,25,"C=0.04, f+/- taken equal f+/-(KCl)",E3bh,Uncertain,S29,,,"methane, amidino-","['OPSIN_name1', 'chemaxon_name1']",2.0,,perrin,perrin0001,,AH
2,0002,CN,pKAH1,10.657,25,I=0.03 to 1.5,E1b,Reliable,H31,,"- Other measurements: refs. B72, C20, H16, M56...","methane, amino-","['OPSIN_name1', 'chemaxon_name1']",2.0,,perrin,perrin0002,,AH
22,0003,NCC(N)=O,pKAH1,7.95,25,I=0.01,E3bg,Approximate,D11,,"- Other measurements: C19, L34","methane, C-amino-C-carbamoyl-","['OPSIN_name1', 'chemaxon_name1']",2.0,,perrin,perrin0003,,AH
26,0005,N#CCN,pKAH1,5.34,25,C=0.01,E3bg,Approximate,S75,,,"methane, aminocyano-","['OPSIN_name1', 'chemaxon_name1']",2.0,,perrin,perrin0005,,AH
28,0006,CCOC(OCC)[SiH2]CN,pKB,4.80,room_temp.,,E3bg,Uncertain,N28,,,"methane, amino(diethoxymethylsilyl)-","['OPSIN_name1', 'chemaxon_name1']",2.0,,perrin,perrin0006,,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21130,6492,O=C1CCC(=O)N(O)CCCCCNC(=O)CCC(=O)N(O)CCCCCNC(=...,pK1,8.65,20,I=0.1(NaNO3),E3bg,Approximate,A57,,,Desferri-ferrioxamin,"['cirpy_nickname1', 'pubchem_nickname1', 'chem...",3.0,Nocardamin,serjeant,serjeant6492,,A
21133,6494,Cc1c(C2(c3cc(C(C)C)c(O)c(Br)c3C)OS(=O)(=O)c3cc...,pK1,-0.66,not_stated,C=0.00016,E3d,Very uncertain,G60,,,"4,4'-(3H-2,1-Benzoxathiol-3-ylidene)bis(2-chro...","['cirpy_nickname1', 'pubchem_nickname1', 'chem...",3.0,Bromothymol blue,serjeant,serjeant6494,,A
21135,6497,O=C1c2ccccc2C(=O)C1c1ccc(C2C(=O)c3ccccc3C2=O)c...,pK1,3.68,20+0.5,,O5,,S14,,,"2,2'-(1,4-Naphthalenediyl)bis(1,3-indandione)","['OPSIN_name1', 'chemaxon_name1']",2.0,,serjeant,serjeant6497,,A
21136,6499,O=C1c2ccccc2C(=O)C1c1ccc(-c2ccc(C3C(=O)c4ccccc...,pK1,4.11,20+0.5,,O5,,S142,,,"2,2'-(4,4'-Biphenyldiyl)bis(1,3-indandione)","['OPSIN_name1', 'chemaxon_name1']",2.0,,serjeant,serjeant6499,,A


In [29]:
sdf = pd.read_csv('/content/drive/MyDrive/NTO/НТО_ИИ_ФИНАЛ_23/train.csv')
lgKlist = []
c = 0

for i in sdf['CCCC(=O)OCC(Cc1cncn1C)C(CC)C(=O)OCc1ccccc1']:
  c += 1
  lgKlist.append(predict_lgk(i))
  print(c, ' / ', len(sdf['CCCC(=O)OCC(Cc1cncn1C)C(CC)C(=O)OCc1ccccc1']))
sdf['lgK'] = lgKlist
sdf

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
5000  /  9999
5001  /  9999
5002  /  9999
5003  /  9999
5004  /  9999
5005  /  9999
5006  /  9999
5007  /  9999
5008  /  9999
5009  /  9999
5010  /  9999
5011  /  9999
5012  /  9999
5013  /  9999
5014  /  9999
5015  /  9999
5016  /  9999
5017  /  9999
5018  /  9999
5019  /  9999
5020  /  9999
5021  /  9999
5022  /  9999
5023  /  9999
5024  /  9999
5025  /  9999
5026  /  9999
5027  /  9999
5028  /  9999
5029  /  9999
5030  /  9999
5031  /  9999
5032  /  9999
5033  /  9999
5034  /  9999
5035  /  9999
5036  /  9999
5037  /  9999
5038  /  9999
5039  /  9999
5040  /  9999
5041  /  9999
5042  /  9999
5043  /  9999
5044  /  9999
5045  /  9999
5046  /  9999
5047  /  9999
5048  /  9999
5049  /  9999
5050  /  9999
5051  /  9999
5052  /  9999
5053  /  9999
5054  /  9999
5055  /  9999
5056  /  9999
5057  /  9999
5058  /  9999
5059  /  9999
5060  /  9999
5061  /  9999
5062  /  9999
5063  /  9999
5064  /  9999
5065  / 

Unnamed: 0,CCCC(=O)OCC(Cc1cncn1C)C(CC)C(=O)OCc1ccccc1,3.78,lgK
0,CCOc1ccccc1O,1.68,4.185007
1,O=[N+]([O-])c1ccc(Oc2ccc(Cl)cc2Cl)cc1,4.64,7.395511
2,Cc1cccc(C)n1,1.68,3.113526
3,CC(=O)/C=C/C1C(C)=CCCC1(C)C,3.85,2.545316
4,C=CCC1(C(C)(C)C)C(=O)NC(=O)NC1=O,1.64,4.283425
...,...,...,...
9994,CNC1CCc2c(OC)cccc2C1C,2.42,6.032828
9995,Nc1ncc(Cc2cccc(Cl)c2Cl)c(N)n1,2.81,6.702618
9996,c1ccc(N2CCCCC2)cc1,2.98,2.966062
9997,CCCCCCN(SN(C)C(=O)O/N=C(\C)SC)C(=O)N(C)C,3.30,12.820523


In [31]:
sdf_sorted = sdf.sort_values(by='lgK', ascending=False)
sdf_top_100 = sdf_sorted.head(100)
mean_logK = sdf_top_100['lgK'].mean()

print("Среднее значение logK для первых ста строк:", mean_logK)

Среднее значение logK для первых ста строк: 14.230071847836832


In [33]:
sdf_top_100

Unnamed: 0,CCCC(=O)OCC(Cc1cncn1C)C(CC)C(=O)OCc1ccccc1,3.78,lgK
1461,CCOC(=O)NC1=C(N2CC2)C(=O)C(NC(=O)OCC)=C(N2CC2)...,-0.02,18.236986
5539,O=C1C(NCC(O)CO)=C(N2CC2)C(=O)C(NCC(O)CO)=C1N1CC1,-1.97,17.077983
9655,Nc1ncnc2c1nc([Se]Cc1ccccc1)n2C1OC2COP(=O)(O)OC...,-1.18,16.583410
180,Nc1ncnc2c1nc([Se]Cc1ccccc1)n2C1OC2COP(=O)(OCc3...,1.67,16.222221
1414,OCC1OC(OC2(CO)OC(CO)C(O)C2O)C(O)C(O)C1O,-3.70,16.150964
...,...,...,...
570,CCOC(=O)Nc1ccc2cc3ccc4c(c3nc2c1)COCN4C(=O)OCC,3.12,13.217804
7975,CC12CCC(=O)C=C1CCC1C2C(O)CC2(C)C1CCC2(O)C(=O)C...,3.26,13.188939
7449,CCOCc1ccccc1NC(=O)OCCN1CCCCC1,2.72,13.177389
384,CCCCCOc1ccccc1NC(=O)OCCN1CCOCC1,3.70,13.174576


In [35]:
sdf_top_100['CCCC(=O)OCC(Cc1cncn1C)C(CC)C(=O)OCc1ccccc1'].to_csv('predict_sdf_top_100.csv', index=False, header=False)

In [50]:
# Пример использования
smiles = 'O=C(O)c1cc([N+](=O)[O-])cc([N+](=O)[O-])c1OO'
lgk = predict_lgk(smiles)
print(f'Predicted lgK for {smiles}: {lgk}')

Predicted lgK for O=C(O)c1cc([N+](=O)[O-])cc([N+](=O)[O-])c1OO: 6.826264167432743


In [62]:
df100f = ['O=C(O)N1CCN(CP(=O)(O)O)CCN(C(=O)O)CCN(C(=O)O)CC1'] + list(pd.read_csv('/content/drive/MyDrive/NTO/НТО_ИИ_ФИНАЛ_23/first_101_rows.csv')['O=C(O)N1CCN(CP(=O)(O)O)CCN(C(=O)O)CCN(C(=O)O)CC1'])
df100f

100

In [None]:
new_df100f = []
for i in df100f:
  if i.count('O') + i.count('')
  i + 'O'