In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

In [17]:
SEED = 42
PATH = '../data/'

In [18]:
smiles_train_df = pd.read_csv(PATH + "smiles_train_set.csv")
smiles_test_df = pd.read_csv(PATH + "smiles_test_set_public.csv")

train_df = pd.read_csv(PATH + "train_data.csv")
test_df = pd.read_csv(PATH + "test_set_public.csv")

In [19]:
smiles_train_df.smiles

0                                CCCCC
1                           CCCC(C)CCC
2                       CCC(C(OC)=O)CC
3                            CCCCC(C)C
4                    CC(C)(C)CC(C)(C)C
                     ...              
1382    CCCCCCCCC(CCCCCC)CC(C)CCCCCCCC
1383                        CCCC(C)CCC
1384                    CCC(C(OC)=O)CC
1385                           CCCCCCC
1386                         CCCCC(C)C
Name: smiles, Length: 1387, dtype: object

# Предобработка для кэтбуста

In [21]:
smiles_train_df_ft = smiles_train_df.copy()
smiles_test_df_ft = smiles_test_df.copy()

In [32]:
polymers_test_df = pd.read_csv('test_polymer_counts.csv')
polymers_train_df = pd.read_csv('train_polymer_counts.csv')

oil_df_train = train_df[['oil_type','blend_id']]
oil_df_test = test_df[['oil_type','blend_id']]

In [33]:
all_smiles = list(set(smiles_train_df.smiles.unique()))+list(set(smiles_test_df.smiles.unique()))
len(all_smiles)

109

In [34]:
from rdkit import Chem
from rdkit.Chem import Descriptors

# Пример SMILES
wts = {}
logs = {}
tps = {}
for smiles in all_smiles:
    try:
        molecule = Chem.MolFromSmiles(smiles)
        wts[smiles] = Descriptors.MolWt(molecule)
        logs[smiles] = Descriptors.MolLogP(molecule)
        tps[smiles] = Descriptors.TPSA(molecule)
    except:
        wts[smiles] = None
        logs[smiles] = None
        tps[smiles] = None

[19:54:03] SMILES Parse Error: syntax error while parsing: O=S(C1=CC=C([C18H21])C=C1)(O)=O
[19:54:03] SMILES Parse Error: Failed parsing SMILES 'O=S(C1=CC=C([C18H21])C=C1)(O)=O' for input: 'O=S(C1=CC=C([C18H21])C=C1)(O)=O'
[19:54:03] Explicit valence for atom # 5 C, 5, is greater than permitted
[19:54:03] Explicit valence for atom # 6 C, 5, is greater than permitted
[19:54:03] Explicit valence for atom # 5 C, 5, is greater than permitted
[19:54:03] Explicit valence for atom # 6 C, 5, is greater than permitted


In [35]:
tt = list()
for a in all_smiles:
    for i in a:
        tt.append(i)
all_symb = set(tt)

In [36]:
all_symb

{'(',
 ')',
 '.',
 '/',
 '1',
 '2',
 '3',
 '8',
 '=',
 'B',
 'C',
 'H',
 'N',
 'O',
 'P',
 'S',
 'Z',
 '[',
 '\\',
 ']',
 'a',
 'l',
 'n'}

In [37]:
oil_df_train.drop_duplicates(subset=['blend_id'],inplace=True)
oil_df_test.drop_duplicates(subset=['blend_id'],inplace=True)

In [38]:
from rdkit import Chem
from rdkit.Chem import Descriptors

# Функция для вычисления дескрипторов молекулы на основе SMILES
def MolWt(smiles,blend,df):
    if (smiles in df.loc[df.blend_id == blend].smiles.unique()):
        return wts[smiles]
    else:
        return -1
def LogP(smiles,blend,df):
    if (smiles in df.loc[df.blend_id == blend].smiles.unique()):
        return logs[smiles]
    else:
        return -1

def TPSA(smiles,blend,df):
    if (smiles in df.loc[df.blend_id == blend].smiles.unique()):
        return tps[smiles]
    else:
        return -1

# Пример SMILES
for i,smiles in enumerate(all_smiles):
    smiles_train_df_ft[smiles+' MolWt'] = smiles_train_df_ft.blend_id.map(lambda x: MolWt(smiles,x,smiles_train_df_ft))
    smiles_train_df_ft[smiles+' LogP'] = smiles_train_df_ft.blend_id.map(lambda x: LogP(smiles,x,smiles_train_df_ft))
    smiles_train_df_ft[smiles+' TPSA'] = smiles_train_df_ft.blend_id.map(lambda x: TPSA(smiles,x,smiles_train_df_ft))
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108


In [39]:
w_oil_df_train = pd.read_csv('train_w_oil_prop.csv')
w_oil_df_test = pd.read_csv('test_w_oil_prop.csv')

In [40]:
def make1(blend):
    i = 0
    for a in smiles_train_df.loc[smiles_train_df.blend_id == blend].smiles.unique():
        i+= a.count(symb)
    return i

for symb in all_symb:
    smiles_train_df_ft[symb] = smiles_train_df_ft.blend_id.map(lambda blend: make1(blend))
    

smiles_train_df_ft = smiles_train_df_ft.drop(['smiles'],axis=1)
smiles_train_df_ft = smiles_train_df_ft.dropna(subset=['oil_property_param_value'])

smiles_train_df_ft['target'] = smiles_train_df_ft.oil_property_param_value
smiles_train_df_ft = smiles_train_df_ft.drop(['oil_property_param_value'],axis=1)
smiles_train_df_ft = smiles_train_df_ft.drop_duplicates(subset='blend_id')

smiles_train_df_ft = smiles_train_df_ft.merge(oil_df_train, how='left', on='blend_id')
smiles_train_df_ft = smiles_train_df_ft.merge(w_oil_df_train, how='left', on='blend_id')
smiles_train_df_ft = smiles_train_df_ft.merge(polymers_train_df, how='left', on='blend_id')

smiles_train_df_ft = smiles_train_df_ft.drop(['Unnamed: 0'], axis=1)

In [41]:
X = smiles_train_df_ft.drop(['target','blend_id'],axis=1)
y = smiles_train_df_ft.target
X = X.drop(['ad7e6027-00b8-4c27-918c-d1561f949ad8'],axis = 1)

features = [col for col in X.columns]
num_feat = [col for col in features if col not in ['oil_type']]
cat_feat = ['oil_type']
X[cat_feat] = X[cat_feat].fillna('no_data')

In [42]:
for i,smiles in enumerate(all_smiles):
    smiles_test_df_ft[smiles+' MolWt'] = smiles_test_df_ft.blend_id.map(lambda x: MolWt(smiles,x,smiles_test_df_ft))
    smiles_test_df_ft[smiles+' LogP'] = smiles_test_df_ft.blend_id.map(lambda x: LogP(smiles,x,smiles_test_df_ft))
    smiles_test_df_ft[smiles+' TPSA'] = smiles_test_df_ft.blend_id.map(lambda x: TPSA(smiles,x,smiles_test_df_ft))
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108


In [43]:
def make_test(blend,smiles):
    if (smiles in smiles_test_df.loc[smiles_test_df.blend_id == blend].smiles.unique()):
        return 1
    return 0

def make1_test(blend):
    i = 0
    for a in smiles_test_df.loc[smiles_test_df.blend_id == blend].smiles.unique():
        i+= a.count(symb)
    return i


for symb in all_symb:
    smiles_test_df_ft[symb] = smiles_test_df_ft.blend_id.map(lambda blend: make1_test(blend))
    
smiles_test_df_ft = smiles_test_df_ft.drop(['smiles'],axis=1)

smiles_test_df_ft = smiles_test_df_ft.drop_duplicates(subset='blend_id')
smiles_test_df_ft = smiles_test_df_ft.merge(w_oil_df_test, how='left', on='blend_id')
smiles_test_df_ft = smiles_test_df_ft.merge(oil_df_test, how='left', on='blend_id')
smiles_test_df_ft = smiles_test_df_ft.merge(polymers_test_df, how='left', on='blend_id')

smiles_test_df_ft = smiles_test_df_ft.drop(['Unnamed: 0'],axis=1)

In [44]:
X_test = smiles_test_df_ft.drop(['blend_id'],axis=1)
X_test = X_test.drop(['ad7e6027-00b8-4c27-918c-d1561f949ad8'],axis = 1)

X_test[cat_feat] = X_test[cat_feat].fillna('no_data')

In [45]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=SEED
)

In [15]:
X_rdy=data.drop(["oil_type","target","blend_id"],axis=1)
y=data["target"]
X_rdy

Unnamed: 0.1,Unnamed: 0,CCCCCCC(C(C)CCCCCCC)CC(C)CCCCCC MolWt,CCCCCCC(C(C)CCCCCCC)CC(C)CCCCCC LogP,CCCCCCC(C(C)CCCCCCC)CC(C)CCCCCC TPSA,CCCCCCCCC(CCCCCC)CC(C)CCCCCCCC MolWt,CCCCCCCCC(CCCCCC)CC(C)CCCCCCCC LogP,CCCCCCCCC(CCCCCC)CC(C)CCCCCCCC TPSA,CCCCCCCCN(C1=CC=CC=C1)C2=CC=CC=C2CCCC MolWt,CCCCCCCCN(C1=CC=CC=C1)C2=CC=CC=C2CCCC LogP,CCCCCCCCN(C1=CC=CC=C1)C2=CC=CC=C2CCCC TPSA,...,22370b44-a93d-4636-b272-5a375c84777b,26c8d0fa-b75b-48cc-b1a2-6af86ffe194a,3c0a6ca1-dd8e-4b51-8609-6e2c675a6f61,5ea08f8a-e2a4-42c1-b84d-1db5771d802a,823f364f-3ccc-4dd5-8fe7-2f23ec37c13b,cd4c898e-82d1-484a-b56a-2fe0e9c2ac0f,d734cbad-e7e1-4919-90e9-028f45a87219,yes_count,no_count,unknown_count
0,0,-1.000,-1.0000,-1.0,-1.000,-1.0000,-1.0,-1.0,-1.0,-1.0,...,,,,,,,,88,270,144
1,1,-1.000,-1.0000,-1.0,-1.000,-1.0000,-1.0,-1.0,-1.0,-1.0,...,,,,,,,,88,256,88
2,2,-1.000,-1.0000,-1.0,-1.000,-1.0000,-1.0,-1.0,-1.0,-1.0,...,,,,,,,,44,216,132
3,3,352.691,9.5662,0.0,-1.000,-1.0000,-1.0,-1.0,-1.0,-1.0,...,,,,,,,,256,172,172
4,4,-1.000,-1.0000,-1.0,-1.000,-1.0000,-1.0,-1.0,-1.0,-1.0,...,,,,,,,,396,1233,405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333,333,-1.000,-1.0000,-1.0,-1.000,-1.0000,-1.0,-1.0,-1.0,-1.0,...,,,,,,,,132,192,66
334,334,-1.000,-1.0000,-1.0,-1.000,-1.0000,-1.0,-1.0,-1.0,-1.0,...,,,,,,,,132,282,132
335,335,-1.000,-1.0000,-1.0,366.718,10.1004,0.0,-1.0,-1.0,-1.0,...,,,,,,,,130,186,186
336,336,-1.000,-1.0000,-1.0,366.718,10.1004,0.0,-1.0,-1.0,-1.0,...,,,,,,,,172,176,176


In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


In [27]:
from catboost import CatBoostRegressor
from sklearn.cluster import MiniBatchKMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from imblearn.over_sampling import SMOTE
simple = CatBoostRegressor(iterations = 10000, devices="gpu",verbose=5000)
pipeline=make_pipeline(SimpleImputer(),StandardScaler())

X_r=pipeline.fit_transform(X_rdy, y)

In [28]:
X_r.shape

(338, 319)

In [34]:
X_r=pipeline.fit_transform(X_rdy)

In [4]:
data=pd.read_csv("train_data1.csv")

In [45]:
y_train.shape,X_train.shape

((34, 319), (304, 319))

In [42]:
X_val.shape,y_val.shape

((304,), (34,))

In [43]:
y.shape

(338,)

In [46]:
X_train,X_val,y_train,y_val=train_test_split(X_r,y,test_size=0.1)


In [65]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
model=Sequential([
    Dense(338,activation="relu"),
    Dense(100,activation="relu"),
    Dense(1,activation=None)


    
])
model.compile(optimizer=Adam(learning_rate=0.1), loss='mean_squared_error', metrics=['mean_absolute_error'])

In [66]:
model.fit(x=X_r,
    y=y,
    batch_size=30,
          validation_split=0.1,
    epochs=30)

Epoch 1/30
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - loss: 12745345024.0000 - mean_absolute_error: 68617.5625 - val_loss: 4469802496.0000 - val_mean_absolute_error: 49477.0352
Epoch 2/30
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 5876570112.0000 - mean_absolute_error: 48346.5664 - val_loss: 6858472960.0000 - val_mean_absolute_error: 51386.4023
Epoch 3/30
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 6254894080.0000 - mean_absolute_error: 42723.7109 - val_loss: 1986848384.0000 - val_mean_absolute_error: 30780.8535
Epoch 4/30
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1841511808.0000 - mean_absolute_error: 25140.4297 - val_loss: 2214219520.0000 - val_mean_absolute_error: 26948.5098
Epoch 5/30
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1742404224.0000 - mean_absolute_error: 25168.8418 - val_loss: 2091074688.0

<keras.src.callbacks.history.History at 0x26afbade090>

In [68]:
from sklearn.metrics import mean_absolute_error
preds = model.predict(X_val)
mean_absolute_error(preds,y_val)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


10009.428601433248

In [None]:
pd.DataFrame({'col': features,
                  'importance': simple.get_feature_importance()}).sort_values('importance', ascending=False)

In [None]:
import optuna


def objective(trial):
    param = {
        'verbose': False,
        'iterations': 100,
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1),
        "sampling_frequency": trial.suggest_categorical("sampling_frequency", ["PerTree","PerTreeLevel" ]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 14),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
    }
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)


    model = CatBoostRegressor(**param, cat_features=cat_feat)
    return abs(cross_val_score(model, X, y, cv=5,scoring="neg_mean_absolute_error").mean())

In [None]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

In [None]:
best_params = study.best_params

In [None]:
best_model = CatBoostRegressor(**best_params,iterations = 100, cat_features = cat_feat, verbose=50)
best_model.fit(X_train,y_train)
best_preds = best_model.predict(X_val)
mean_absolute_error(best_preds,y_val)

In [None]:
best_model = CatBoostRegressor(**best_params,iterations = 4000, cat_features = cat_feat, verbose=500)
best_model.fit(X,y)

In [81]:
X_test2

Unnamed: 0,CCCCC MolWt,CCCCC LogP,CCCCC TPSA,O=C(OCCCCCCCCCC)CCCCCC1=CC=CC=C1 MolWt,O=C(OCCCCCCCCCC)CCCCCC1=CC=CC=C1 LogP,O=C(OCCCCCCCCCC)CCCCCC1=CC=CC=C1 TPSA,CCCCCCC(C(C)CCCCCCC)CC(C)CCCCCC MolWt,CCCCCCC(C(C)CCCCCCC)CC(C)CCCCCC LogP,CCCCCCC(C(C)CCCCCCC)CC(C)CCCCCC TPSA,COC(C(C)(CC)C)=O MolWt,...,22370b44-a93d-4636-b272-5a375c84777b,26c8d0fa-b75b-48cc-b1a2-6af86ffe194a,3c0a6ca1-dd8e-4b51-8609-6e2c675a6f61,5ea08f8a-e2a4-42c1-b84d-1db5771d802a,823f364f-3ccc-4dd5-8fe7-2f23ec37c13b,cd4c898e-82d1-484a-b56a-2fe0e9c2ac0f,d734cbad-e7e1-4919-90e9-028f45a87219,yes_count,no_count,unknown_count
0,72.151,2.1965,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,130.187,...,,,,,,,,140,360,150
1,-1.000,-1.0000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000,...,,,,,,,,0,348,96
2,-1.000,-1.0000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000,...,,,,,,,,315,287,140
3,72.151,2.1965,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000,...,,,,,,,,70,485,275
4,-1.000,-1.0000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000,...,,,,,,,,0,464,128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,-1.000,-1.0000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000,...,,,,,,,,224,928,256
134,-1.000,-1.0000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000,...,,,,,,,,1482,1300,1300
135,-1.000,-1.0000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000,...,,,,,,,,700,1064,476
136,-1.000,-1.0000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000,...,,,,,,,,2109,1591,1591


In [None]:
X_testr=pipeline.transform(X_test2)
test_preds = pipeline.predict(X_testr)

In [None]:
preds_df = pd.DataFrame({'blend_id': smiles_test_df_ft.blend_id,'preds':test_preds})
preds_df.to_csv('catcat.csv', encoding = 'UTF-8', index=False)

In [None]:
from sklearn.model_selection import cross_val_score
best_model = CatBoostRegressor(**best_params,iterations = 2500, cat_features = cat_feat, early_stopping_rounds=200, verbose=50)
abs(cross_val_score(best_model, X, y, cv=5,scoring="neg_mean_absolute_error").mean())  # cv=5 определяет 5-кратную кросс-валидацию

In [None]:
abs(np.array([-24593.88450765, -33400.87463158, -21770.91822504, -24473.41020017,
       -16041.64026754]).mean())