In [5]:
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Input, Dense, LeakyReLU, BatchNormalization, Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder

<h2>Preprocessing</h2>

In [6]:
df = pd.read_csv("../data/train_data_gan.csv")
df = df.drop(["Unnamed: 0", "blend_id"], axis=1)

In [49]:
df.min()

CCCCCCC(C(C)CCCCCCC)CC(C)CCCCCC MolWt    -1.0
CCCCCCC(C(C)CCCCCCC)CC(C)CCCCCC LogP     -1.0
CCCCCCC(C(C)CCCCCCC)CC(C)CCCCCC TPSA     -1.0
CCCCCCCCC(CCCCCC)CC(C)CCCCCCCC MolWt     -1.0
CCCCCCCCC(CCCCCC)CC(C)CCCCCCCC LogP      -1.0
                                         ... 
d734cbad-e7e1-4919-90e9-028f45a87219      NaN
yes_count                                 0.0
no_count                                 64.0
unknown_count                             3.0
oil_property_param_value                -43.0
Length: 329, dtype: float64

Add target value

In [7]:
df["oil_property_param_value"] = pd.read_csv("file-2 2/smiles_train_set.csv").drop("smiles", axis=1)\
    .drop_duplicates().dropna(subset=["oil_property_param_value"]).oil_property_param_value

In [8]:
labelencoder = LabelEncoder()
df["oil_type"] = labelencoder.fit_transform(df.oil_type.values)

<h2>GAN Generation</h2>

In [9]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [10]:
# Разделить на обучающую и тестовую выборку перед заполнением NaN
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Заполнить NaN медианным значением отдельно для обучающей и тестовой выборки
for column in train_df.columns:
    median_value = train_df[column].median()
    train_df[column].fillna(median_value, inplace=True)
    test_df[column].fillna(median_value, inplace=True)

# Нормализация данных
scaler = MinMaxScaler()
train_df_scaled = pd.DataFrame(scaler.fit_transform(train_df), columns=train_df.columns)
test_df_scaled = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setti

In [11]:
# Создание архитектуры генератора и дискриминатора
def build_generator(latent_dim, data_shape):
    model = Sequential()
    model.add(Dense(128, input_dim=latent_dim))
    model.add(LeakyReLU(alpha=0.02))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(256))
    model.add(LeakyReLU(alpha=0.02))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(data_shape[1], activation='tanh'))
    return model

def build_discriminator(data_shape):
    model = Sequential()
    model.add(Dense(256, input_dim=data_shape[1]))
    model.add(LeakyReLU(alpha=0.02))
    model.add(Dense(128))
    model.add(LeakyReLU(alpha=0.02))
    model.add(Dense(1, activation='sigmoid'))
    return model

In [12]:
def build_gan(generator, discriminator):
    model = Sequential()
    # Сначала дискриминатор необучаем
    discriminator.trainable = False
    model.add(generator)
    model.add(discriminator)
    return model

# Задаем размерность шума
latent_dim = 100

# Собираем и компилируем дискриминатор
discriminator = build_discriminator(data_shape=df.shape)
discriminator.compile(loss='binary_crossentropy', optimizer=Adam(0.0002, 0.5), metrics=['accuracy'])

# Собираем генератор
generator = build_generator(latent_dim, data_shape=df.shape)

# Собираем GAN, комбинируя генератор и дискриминатор
gan = build_gan(generator, discriminator)
gan.compile(loss='binary_crossentropy', optimizer=Adam(0.0002, 0.5))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [54]:
def train(gan, generator, discriminator, df, latent_dim, epochs=200, batch_size=32):
    half_batch = int(batch_size / 2)
    for epoch in range(epochs):
        # ---------------------
        #  Обучение дискриминатора
        # ---------------------
        idx = np.random.randint(0, df.shape[0], half_batch)
        real_data = df.iloc[idx].values
        noise = np.random.normal(0, 1, (half_batch, latent_dim))
        generated_data = generator.predict(noise)
        real_y = np.ones((half_batch, 1)).astype(np.float32)
        fake_y = np.zeros((half_batch, 1)).astype(np.float32)
        d_loss_real = discriminator.train_on_batch(real_data, real_y)
        d_loss_fake = discriminator.train_on_batch(generated_data, fake_y)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
        
        # ---------------------
        #  Обучение генератора
        # ---------------------
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        valid_y = np.ones((batch_size, 1))
        g_loss = gan.train_on_batch(noise, valid_y)
        
        # Отображаем прогресс
        if epoch % 10 == 0:
            print(f"{epoch} d_loss: {d_loss[0]}, d_acc: {100*d_loss[1]}, g_loss: {g_loss}")

In [55]:
train(gan, generator, discriminator, df, latent_dim)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
0 d_loss: nan, d_acc: 17.256277799606323, g_loss: 0.6140080094337463
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
10 d_loss: nan, d_acc: 16.07465147972107, g_loss: 0.5961006283760071
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/ste

In [58]:
def generate_data(generator, latent_dim, num_samples):
    noise = np.random.normal(0, 1, (num_samples, latent_dim))
    generated_data = generator.predict(noise)
    return generated_data

new_data = generate_data(generator, latent_dim, num_samples=500)
new_df = pd.DataFrame(new_data, columns=df.columns)

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


<h3>Normalization of quantized values</h3>

In [60]:
new_df["oil_property_param_value"] = (new_df.oil_property_param_value+1)*426500

<h3>Save</h3>

In [45]:
generator.save("gan.h5")

