In [None]:
# Environment setup and library imports
# Mount Google Drive at /content/drive (requires the google.colab package, i.e. a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

import os
import re
from time import sleep
import pandas as pd
import requests
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from packaging import version
from IPython import display
import math
from sklearn.preprocessing import StandardScaler
# Bind both `tf` and `keras` unconditionally so that code referring to either
# name works on a fresh kernel. The previous try/except bound only one of the
# two names, depending on whether standalone Keras happened to be installed.
import tensorflow as tf
try:
    import keras
except ImportError:
    # Standalone Keras is unavailable: fall back to the copy bundled with TensorFlow.
    keras = tf.keras

from tensorflow.keras import layers, losses, Model, Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, GRU
from tensorflow.keras.losses import MeanSquaredLogarithmicError
from sklearn.model_selection import train_test_split
from sklearn import metrics
import warnings
import optuna

# Data locations
# NOTE(review): these are relative paths, so they resolve against the kernel's
# current working directory — confirm they point at the intended data folder.
SPECTRA_PATH = '../datasets/spectras/'
DATASET_PATH = '../datasets'

Mounted at /content/drive


In [None]:
# Functional groups mapped to the SMARTS pattern used to detect each one.
# Insertion order matters: it defines the order of the target columns below.
func_grp_smarts = {
    'alkane': '[CX4;H0,H1,H2,H4]',
    'methyl': '[CH3]',
    'alkene': '[CX3]=[CX3]',
    'alkyne': '[CX2]#C',
    'alcohols': '[#6][OX2H]',
    'amines': '[NX3;H2,H1;!$(NC=O)]',
    'nitriles': '[NX1]#[CX2]',
    'aromatics': '[$([cX3](:*):*),$([cX2+](:*):*)]',
    'alkyl halides': '[#6][F,Cl,Br,I]',
    'esters': '[#6][CX3](=O)[OX2H0][#6]',
    'ketones': '[#6][CX3](=O)[#6]',
    'aldehydes': '[CX3H1](=O)[#6]',
    'carboxylic acids': '[CX3](=O)[OX2H1]',
    'ether': '[OD2]([#6])[#6]',
    'acyl halides': '[CX3](=[OX1])[F,Cl,Br,I]',
    'amides': '[NX3][CX3](=[OX1])[#6]',
    'nitro': '[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]',
}

# Names of the functional groups, in dictionary order.
column_names = [*func_grp_smarts]

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading sqlalchemy-2.0.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting greenlet>=1 (from sqlalchemy>=1.4.2->optuna)
  Downloading greenlet-3.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sqlalchemy-2.0.40-cp311-cp311-manyli

In [None]:
# Load the enrichment dataset and index it by its 'CAS' column.
enrich_csv = os.path.join(DATASET_PATH, 'df_enrich.csv')
df_enrich = pd.read_csv(enrich_csv)

# drop=False keeps 'CAS' available as a regular column as well as the row index.
dataset_y = df_enrich.set_index('CAS', drop=False)

unique_cas = dataset_y['CAS'].unique()
print(f'Número de CAS únicos: {len(unique_cas)}')

8241

In [None]:
# Load the processed spectra table.
spectra_csv = os.path.join(DATASET_PATH, 'df_spectra_all_mixture_interpolate.csv')
df_spectra_all = pd.read_csv(spectra_csv)

# Group the column names by the statistic tag they contain (plain substring match).
mean_cols, min_cols, max_cols = (
    [col for col in df_spectra_all.columns if tag in col]
    for tag in ('mean', 'min', 'max')
)
all_cols = mean_cols + min_cols + max_cols

print(f'Shape do DataFrame de espectros: {df_spectra_all.shape}')

(1030, 24636)

In [None]:
# Build the X table from the 'mean' spectra columns.
# After transposing, every '*mean*' column becomes a row and every original row a 'bin_<n>' column.
dataset_x = df_spectra_all[mean_cols].T
dataset_x.columns = [f'bin_{position}' for position in dataset_x.columns]

# Expose the former column labels as an 'index' column, then index the rows by
# the text that precedes the first underscore of each label.
dataset_x = dataset_x.reset_index()
dataset_x.index = dataset_x['index'].map(lambda label: label.partition('_')[0])

print(f'Shape do dataset_x: {dataset_x.shape}')
dataset_x.head()

(8211, 1031)


Unnamed: 0_level_0,index,bin_0,bin_1,bin_2,bin_3,bin_4,bin_5,bin_6,bin_7,bin_8,...,bin_1020,bin_1021,bin_1022,bin_1023,bin_1024,bin_1025,bin_1026,bin_1027,bin_1028,bin_1029
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bins,bins_mean,398.9999,402.4999,405.9999,409.4999,412.9999,416.4999,419.9999,423.4999,426.9999,...,3968.9999,3972.4999,3975.9999,3979.4999,3982.9999,3986.4999,3989.9999,3993.4999,3996.9999,4000.4999
71-55-6,71-55-6_x_mean,0.0076,0.0076,0.0076,0.0076,0.0076,0.0076,0.0076,0.0076,0.0076,...,0.0081,0.0081,0.0081,0.0081,0.0081,0.0081,0.0081,0.0081,0.0081,0.0081
84-66-2,84-66-2_mean,0.0054,0.0054,0.0054,0.0054,0.0054,0.0054,0.0054,0.0054,0.0054,...,0.0016,0.0016,0.0016,0.0016,0.0016,0.0016,0.0016,0.0016,0.0016,0.0016
99-94-5,99-94-5_mean,0.0803,0.0803,0.0803,0.0803,0.0803,0.0803,0.0803,0.0803,0.0803,...,0.0048,0.0048,0.0048,0.0048,0.0048,0.0048,0.0048,0.0048,0.0048,0.0048
99-04-7,99-04-7_mean,0.0043,0.0043,0.0043,0.0043,0.0043,0.0043,0.0043,0.0043,0.0043,...,0.0045,0.0045,0.0045,0.0045,0.0045,0.0045,0.0045,0.0045,0.0045,0.0045


In [7]:
# Keep only the rows whose 'yunits' value is exactly 'ABSORBANCE'.
is_absorbance = dataset_y['yunits'] == 'ABSORBANCE'
dataset_y = dataset_y.loc[is_absorbance]
dataset_y.shape

(8210, 37)

In [None]:
# Join the Y and X tables on their row indexes; the inner join keeps only labels present in both.
dataset_final = dataset_y.merge(
    dataset_x,
    how='inner',
    left_index=True,
    right_index=True,
)
print(f'Shape do dataset final: {dataset_final.shape}')

(8210, 1068)

In [None]:
# Separate inputs (X) from targets (Y), then split into train / validation / test sets.

# Inputs: every column whose name contains 'bin'. Cells that cannot be parsed as
# numbers become NaN and are then replaced by 0.
# NOTE(review): this is a substring match — confirm no non-spectral column name contains 'bin'.
bin_cols = [name for name in dataset_final.columns if 'bin' in name]
X = dataset_final[bin_cols].apply(pd.to_numeric, errors='coerce').fillna(0)

# Targets: one integer column per functional-group name.
Y = dataset_final[column_names].astype(int)

# Hold out 25% of the rows, then split that hold-out 55% / 45% into validation / test.
X_train, X_test_temp, Y_train, Y_test_temp = train_test_split(
    X, Y, test_size=0.25, random_state=42
)
X_validation, X_test, Y_validation, Y_test = train_test_split(
    X_test_temp, Y_test_temp, test_size=0.45, random_state=42
)

In [None]:
def model_mlp(num_neurons_1, num_neurons_2, num_neurons_3, num_outputs=None, dropout_rate=0.2):
    """Build and compile a dense network with three hidden layers and sigmoid outputs.

    Args:
        num_neurons_1: Units in the first hidden layer.
        num_neurons_2: Units in the second hidden layer.
        num_neurons_3: Units in the third hidden layer.
        num_outputs: Number of sigmoid output units. Defaults to the number of
            columns of the global ``Y_train``.
        dropout_rate: Dropout rate applied after the first and second hidden layers.

    Returns:
        The compiled Keras model (Adam optimizer, binary cross-entropy loss).
    """
    if num_outputs is None:
        num_outputs = Y_train.shape[1]

    # Use the `Sequential` class imported from tensorflow.keras: it comes from the
    # same package as `layers` and does not require a separate `keras` name to be bound.
    model = Sequential([
        layers.Dense(num_neurons_1, activation='relu'),
        layers.Dropout(dropout_rate),
        layers.Dense(num_neurons_2, activation='relu'),
        layers.Dropout(dropout_rate),
        layers.Dense(num_neurons_3, activation='relu'),
        layers.Dense(num_outputs, activation='sigmoid'),
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy', 'binary_accuracy', 'precision', 'recall', 'binary_crossentropy'],
    )
    return model

In [None]:
def model_conv1d(num_filters_1, num_filters_2, num_filters_3, pool_size, kernel_size):
    """Build and compile a 1-D convolutional network with sigmoid outputs.

    The network is three Conv1D + MaxPooling1D stages (sharing ``kernel_size`` and
    ``pool_size``), followed by Flatten and a sigmoid Dense layer with one unit per
    column of the global ``Y_train``. The input shape is
    ``(X_train.shape[1], 1)``, read from the global ``X_train``.

    Args:
        num_filters_1: Filters in the first convolution.
        num_filters_2: Filters in the second convolution.
        num_filters_3: Filters in the third convolution.
        pool_size: Window size of every max-pooling layer.
        kernel_size: Kernel size of every convolution.

    Returns:
        The compiled Keras model (Adam optimizer, binary cross-entropy loss).
    """
    # Assemble the conv/pool stages in order; each stage differs only in its filter count.
    conv_stages = []
    for n_filters in (num_filters_1, num_filters_2, num_filters_3):
        conv_stages.append(
            layers.Conv1D(filters=n_filters, kernel_size=kernel_size, activation='relu')
        )
        conv_stages.append(layers.MaxPooling1D(pool_size=pool_size))

    n_features = X_train.shape[1]
    n_targets = Y_train.shape[1]

    model = Sequential([
        layers.Input(shape=(n_features, 1)),
        *conv_stages,
        layers.Flatten(),
        layers.Dense(n_targets, activation='sigmoid'),
    ])

    tracked_metrics = ['accuracy', 'binary_accuracy', 'precision', 'recall', 'binary_crossentropy']
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=tracked_metrics)
    return model


In [None]:
def objective(trial, model):
    """Optuna objective: build the requested network, train it and score it on validation data.

    Args:
        trial: Optuna trial used to sample the architecture hyperparameters.
        model: Architecture selector, either 'MLP' or 'CNN'.

    Returns:
        Weighted F1 score of the predictions on ``X_validation`` after
        thresholding the sigmoid outputs at 0.5.

    Raises:
        ValueError: If ``model`` is neither 'MLP' nor 'CNN'.
    """
    # Imported locally so this function does not rely on a module-level `tf`
    # alias being defined.
    from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

    # The built network gets its own name instead of overwriting the `model` selector.
    if model == 'MLP':
        min_neurons, max_neurons = 18, 300
        num_neurons_1 = trial.suggest_int('num_neurons_1', min_neurons, max_neurons)
        num_neurons_2 = trial.suggest_int('num_neurons_2', min_neurons, max_neurons)
        num_neurons_3 = trial.suggest_int('num_neurons_3', min_neurons, max_neurons)

        network = model_mlp(num_neurons_1, num_neurons_2, num_neurons_3)
    elif model == 'CNN':
        min_filters, max_filters = 32, 128
        num_filters_1 = trial.suggest_int('num_filters_1', min_filters, max_filters)
        num_filters_2 = trial.suggest_int('num_filters_2', min_filters, max_filters)
        num_filters_3 = trial.suggest_int('num_filters_3', min_filters, max_filters)
        pool_size = trial.suggest_int('pool_size', 3, 7)
        kernel_size = trial.suggest_int('kernel_size', 2, 7)

        network = model_conv1d(num_filters_1, num_filters_2, num_filters_3, pool_size, kernel_size)
    else:
        # ValueError is a subclass of Exception, so existing `except Exception` handlers still work.
        raise ValueError('Model not supported')

    # Stop when val_loss has not improved for 30 epochs and roll back to the best weights.
    early_stopping = EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)
    # min_lr must be below the starting learning rate (1e-3, Adam's default):
    # the previous min_lr=0.001 equalled it, so the rate could never be reduced.
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, min_lr=1e-6)

    network.fit(
        X_train, Y_train,
        validation_data=(X_validation, Y_validation),
        callbacks=[early_stopping, reduce_lr],
        epochs=100, verbose=1, shuffle=True,
    )

    # Turn the sigmoid outputs into hard 0/1 labels before scoring.
    y_pred = (network.predict(X_validation) > 0.5).astype(int)

    return metrics.f1_score(Y_validation, y_pred, average='weighted')

In [13]:
# Create an in-memory Optuna study that maximizes the value returned by `objective`.
study = optuna.create_study(direction='maximize')

# Search the MLP hyperparameters; increase n_trials for a more thorough search.
study.optimize(
    lambda trial: objective(trial, model='MLP'),
    n_trials=5,
    n_jobs=-1,
    show_progress_bar=True,
)

# Report the best trial: its objective value and the hyperparameters that produced it.
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)
print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")


[I 2025-04-07 22:08:16,157] A new study created in memory with name: no-name-dba649eb-d853-49a5-a7ab-5a530305a169


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/100
Epoch 1/100
Epoch 1/100
Epoch 1/100
Epoch 1/100
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 22ms/step - accuracy: 0.5259 - binary_accuracy: 0.8284 - binary_crossentropy: 0.4225 - loss: 0.4225 - precision: 0.6201 - recall: 0.4797 - val_accuracy: 0.4632 - val_binary_accuracy: 0.9072 - val_binary_crossentropy: 0.2392 - val_loss: 0.2392 - val_precision: 0.8380 - val_recall: 0.6619 - learning_rate: 0.0010
[1m184/193[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 18ms/step - accuracy: 0.3441 - binary_accuracy: 0.7766 - binary_crossentropy: 0.4688 - loss: 0.4688 - precision: 0.4909 - recall: 0.6143Epoch 2/100
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 26ms/step - accuracy: 0.3772 - binary_accuracy: 0.8248 - binary_crossentropy: 0.4082 - loss: 0.4082 - precision: 0.5930 - recall: 0.5324 - val_accuracy: 0.4429 - val_binary_accuracy: 0.9142 - val_binary_crossentropy: 0.2196 - val_loss: 0.2196 - val_precision: 0.8444 - val_recall:

In [None]:
# Create an in-memory Optuna study that maximizes the value returned by `objective`.
study = optuna.create_study(direction='maximize')

# Search the CNN hyperparameters; increase n_trials for a more thorough search.
study.optimize(
    lambda trial: objective(trial, model='CNN'),
    n_trials=5,
    n_jobs=-1,
    show_progress_bar=True,
)

# Report the best trial: its objective value and the hyperparameters that produced it.
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)
print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")
