# Import

In [None]:
# Importação de bibliotecas e configuração de caminhos
import os
import re
from time import sleep
from datetime import datetime
import pandas as pd
import requests
import pickle
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from packaging import version
from IPython import display
import math
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
import tensorflow as tf
from tensorflow.keras import layers, losses, Model, Sequential
from tensorflow.keras.layers import Dense, Dropout

# Data locations, written as paths relative to the working directory.
SPECTRA_PATH = '../datasets/spectras/'
DATASET_PATH = '../datasets'  # joined below with the enrichment CSV and the spectra parquet file names
MODELS_PATH = '../models'  # joined below with the saved autoencoder file name
HISTORY_PATH = '../history'

# Use seaborn's "whitegrid" style for the plots produced later on.
sns.set(style="whitegrid")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Dictionary mapping each functional-group name to its SMARTS pattern.
func_grp_smarts = {
    'alkane':'[CX4;H0,H1,H2,H4]',
    'methyl':'[CH3]',
    'alkene':'[CX3]=[CX3]',
    'alkyne':'[CX2]#C',
    'alcohols':'[#6][OX2H]',
    'amines':'[NX3;H2,H1;!$(NC=O)]',
    'nitriles':'[NX1]#[CX2]',
    'aromatics':'[$([cX3](:*):*),$([cX2+](:*):*)]',
    'alkyl halides':'[#6][F,Cl,Br,I]',
    'esters':'[#6][CX3](=O)[OX2H0][#6]',
    'ketones':'[#6][CX3](=O)[#6]',
    'aldehydes':'[CX3H1](=O)[#6]',
    'carboxylic acids':'[CX3](=O)[OX2H1]',
    'ether': '[OD2]([#6])[#6]',
    'acyl halides':'[CX3](=[OX1])[F,Cl,Br,I]',
    'amides':'[NX3][CX3](=[OX1])[#6]',
    'nitro':'[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]'}

# Functional-group names in dictionary order; later cells use this list to
# select the label columns of the merged dataset.
column_names = list(func_grp_smarts.keys())

In [None]:
# Load the saved autoencoder.
# `keras` has to be imported here: the imports at the top of the notebook bring
# in `tensorflow as tf` and selected `tensorflow.keras` names, but not `keras`
# itself, so this cell would otherwise fail with a NameError on a fresh kernel.
from tensorflow import keras

autoencoder_load = keras.models.load_model(os.path.join(MODELS_PATH, 'autoencoder_model_2025_04_07.keras'))
autoencoder_load.summary()
# Encoder sub-model: autoencoder input -> output of layer index 4 (the encoding layer).
encoder_model_load = keras.Model(inputs=autoencoder_load.input, outputs=autoencoder_load.layers[4].output)

In [None]:
# Load the saved autoencoder
from tensorflow import keras

autoencoder_path = os.path.join(MODELS_PATH, 'autoencoder_model_2025_04_07.keras')
autoencoder_load = keras.models.load_model(autoencoder_path)
autoencoder_load.summary()
# Build the encoder as a sub-model: autoencoder input -> output of layer index 4.
encoder_model_load = keras.Model(inputs=autoencoder_load.input, outputs=autoencoder_load.layers[4].output)  # Encoder up to the encoding layer

8241

In [None]:
# Load the enrichment dataset
df_enrich_path = os.path.join(DATASET_PATH, 'df_enrich.csv')
df_enrich = pd.read_csv(df_enrich_path)
# Work on a copy indexed by the 'CAS' column; get_dataset merges on this index.
dataset_y = df_enrich.copy()
dataset_y.index = dataset_y['CAS']
# Report how many distinct CAS values the dataset contains.
print(f'Número de CAS únicos: {len(dataset_y.CAS.unique())}')

(1030, 24636)

In [None]:
# Load the processed spectra
# df_spectra_all = pd.read_csv(os.path.join(DATASET_PATH, 'df_spectra_all_mixture_interpolate.csv'))
df_spectra_path = os.path.join(DATASET_PATH, 'df_spectra_all_mixture_interpolate.parquet')
df_spectra_all = pd.read_parquet(df_spectra_path)
# Column groups, selected by substring match on the column name.
mean_cols = [x for x in df_spectra_all.columns if 'mean' in x]
min_cols = [x for x in df_spectra_all.columns if 'min' in x]
max_cols = [x for x in df_spectra_all.columns if 'max' in x]
# All three groups concatenated in mean, min, max order.
all_cols = mean_cols + min_cols + max_cols
print(f'Shape do DataFrame de espectros: {df_spectra_all.shape}')

In [41]:
def load_history(file_path):
    """Load and return the pickled object stored at ``file_path``.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The unpickled object (this notebook stores training-history dicts).
    """
    # The context manager closes the file handle even if unpickling fails.
    # NOTE: pickle can execute arbitrary code; only load files you trust.
    with open(file_path, 'rb') as f:
        return pickle.load(f)


def get_dataset(df_spectra_all, dataset_y, agg_func):
    """Join the label table with the spectra aggregated by ``agg_func``.

    Args:
        df_spectra_all: DataFrame whose columns are spectra; a column is kept
            when its name contains ``agg_func``. The part of the column name
            before the first underscore becomes the join key.
        dataset_y: Label DataFrame with a ``'yunits'`` column, indexed by the
            same key.
        agg_func: One of ``'mean'``, ``'min'`` or ``'max'``.

    Returns:
        The inner join (on the index) of the ``'ABSORBANCE'`` rows of
        ``dataset_y`` with one row per selected spectrum, whose values are
        exposed as ``bin_<n>`` columns.

    Raises:
        ValueError: If ``agg_func`` is not one of the supported values.
    """
    # Fail early with a clear message instead of an UnboundLocalError below.
    if agg_func not in ('mean', 'min', 'max'):
        raise ValueError(f"agg_func '{agg_func}' não reconhecido. Use 'mean', 'min' ou 'max'.")

    # Select the columns from the frame that was actually passed in, using the
    # same substring rule that builds the notebook-level mean/min/max lists.
    selected_cols = [col for col in df_spectra_all.columns if agg_func in col]

    # One row per spectrum, one 'bin_<n>' column per original row label.
    dataset_x = df_spectra_all[selected_cols].copy().T
    dataset_x.columns = [f'bin_{x}' for x in dataset_x.columns]
    dataset_x = dataset_x.reset_index()
    # Column names look like '<key>_<suffix>'; keep only the key as the index.
    dataset_x.index = dataset_x['index'].apply(lambda x: x.split('_')[0])

    dataset_y = dataset_y[dataset_y['yunits'] == 'ABSORBANCE']

    return pd.merge(dataset_y, dataset_x, left_index=True, right_index=True, how='inner')

def find_best_epoch(history):
    """
    Finds the epoch with the lowest validation loss.

    Args:
        history: Training history object from Keras model.fit(), or the plain
            dict taken from its ``.history`` attribute (the form this notebook
            pickles to disk).

    Returns:
        A tuple containing the best epoch index (0-based) and its corresponding
        validation loss. Returns None if the history is missing, has no
        'val_loss' entry, or the 'val_loss' list is empty.
    """
    # Accept both the History object and its underlying dict.
    records = getattr(history, 'history', history)
    if not records or 'val_loss' not in records:
        return None

    val_losses = records['val_loss']
    # np.argmin raises on an empty sequence; honour the documented None instead.
    if len(val_losses) == 0:
        return None

    best_epoch = int(np.argmin(val_losses))  # Index of the minimum validation loss
    best_val_loss = val_losses[best_epoch]

    return best_epoch, best_val_loss

In [None]:
def load_history(file_path):
    """Deserialize and return the pickled object stored at ``file_path``."""
    with open(file_path, mode='rb') as handle:
        loaded = pickle.load(handle)
    return loaded

def get_dataset(df_spectra_all, dataset_y, agg_func):
    """Merge the 'ABSORBANCE' label rows with the spectra of one aggregation.

    ``agg_func`` picks which notebook-level column list is used ('mean', 'min'
    or 'max'); any other value raises ValueError. The selected spectra are
    transposed to one row each, their values renamed to ``bin_<n>`` columns,
    and inner-joined with ``dataset_y`` on the index.
    """
    # Guard clause: reject unsupported aggregations up front.
    if agg_func not in ('mean', 'min', 'max'):
        raise ValueError(f"agg_func '{agg_func}' não reconhecido. Use 'mean', 'min' ou 'max'.")

    if agg_func == 'mean':
        selected = mean_cols
    elif agg_func == 'min':
        selected = min_cols
    else:
        selected = max_cols

    spectra = df_spectra_all[selected].copy().T
    spectra.columns = ['bin_' + str(col) for col in spectra.columns]
    spectra = spectra.reset_index()
    # The text before the first underscore of the column name is the join key.
    spectra.index = spectra['index'].apply(lambda name: name.split('_')[0])

    absorbance_rows = dataset_y[dataset_y['yunits'] == 'ABSORBANCE']

    return pd.merge(absorbance_rows, spectra, left_index=True, right_index=True, how='inner')

def find_best_epoch(history):
    """
    Find the epoch with the lowest validation loss.

    Args:
        history: Keras training history object, or the plain dict held in its
            ``.history`` attribute (the form this notebook pickles to disk).
    Returns:
        A tuple (best_epoch, best_val_loss) with a 0-based epoch index, or
        None when the history is missing, lacks 'val_loss', or 'val_loss'
        is empty.
    """
    # Work on the underlying dict whether or not it is wrapped in a History.
    epoch_records = getattr(history, 'history', history)
    if not epoch_records or 'val_loss' not in epoch_records:
        return None
    val_losses = epoch_records['val_loss']
    # An empty list would make np.argmin raise ValueError.
    if len(val_losses) == 0:
        return None
    best_epoch = int(np.argmin(val_losses))
    return best_epoch, val_losses[best_epoch]

In [43]:
# Report the shapes of the train / validation / test feature and label splits.
# NOTE(review): no visible cell assigns X_train, X_validation, X_test, Y_train,
# Y_validation or Y_test at notebook level (compute_model_analysis only creates
# them as locals), so this cell presumably relies on state left over from an
# earlier session -- confirm before re-running on a fresh kernel.
print("Shape of X_train:", X_train.shape)
print("Shape of X_validation:", X_validation.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of Y_train:", Y_train.shape)
print("Shape of Y_validation:", Y_validation.shape)
print("Shape of Y_test:", Y_test.shape)

Shape of X_train: (6157, 1030)
Shape of X_validation: (1129, 1030)
Shape of X_test: (924, 1030)
Shape of Y_train: (6157, 17)
Shape of Y_validation: (1129, 17)
Shape of Y_test: (924, 17)


In [44]:
def _plot_training_history(history, best_epoch, best_val_loss, compose_name):
    """Plot accuracy, loss and binary accuracy per epoch and save the figure as ``{compose_name}.png``."""
    plt.figure(figsize=(16, 10))
    plt.suptitle(f'Métricas Treinamento - Época Escolhida {best_epoch} - {best_val_loss:.3f} Validação Loss Function')

    # (train key, validation key, y-axis label, draw the legend?)
    panels = [
        ('accuracy', 'val_accuracy', 'Acurácia', True),
        ('loss', 'val_loss', 'Loss', False),
        ('binary_accuracy', 'val_binary_accuracy', 'Acurácia Binarizada', False),
    ]
    for position, (train_key, val_key, ylabel, with_legend) in enumerate(panels, start=1):
        plt.subplot(3, 1, position)
        plt.plot(history.history[train_key])
        plt.plot(history.history[val_key])
        # Red dashed marker at the epoch with the lowest validation loss.
        plt.axvline(x=best_epoch, color='red', linestyle='--')
        plt.ylabel(ylabel)
        plt.xlabel('Epoch')
        if with_legend:
            plt.legend(['Train', 'Validation'], loc='upper left')
        plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:.3f}".format(x)))

    plt.savefig(f'{compose_name}.png')
    plt.show()


def compute_model_analysis(agg_func, data_prep, current_date, callbacks=False, save_history=True):
    """Train, save and plot an MLP classifier for one spectra aggregation.

    The dataset comes from ``get_dataset`` using the notebook-level
    ``df_spectra_all`` and ``dataset_y``. Features are the columns whose name
    contains ``'bin'``; labels are the ``column_names`` columns cast to int.

    Args:
        agg_func: Aggregation passed to ``get_dataset`` ('mean', 'min', 'max').
        data_prep: ``'encoder'`` runs the features through
            ``encoder_model_load`` first; any other value uses them unchanged.
        current_date: Text embedded in the names of the saved artefacts.
        callbacks: When true, train with EarlyStopping and ReduceLROnPlateau.
        save_history: When true, pickle ``history.history`` to disk.

    Returns:
        The Keras History object returned by ``model.fit``.

    Side effects:
        Writes ``<name>.keras``, ``<name>.png`` and, optionally,
        ``<name>_history.pkl`` to the current working directory, where
        ``<name>`` is ``mlp_{agg_func}_{data_prep}_{current_date}_call_{callbacks}``.
    """
    dataset_final = get_dataset(df_spectra_all, dataset_y, agg_func)

    X = dataset_final[[col for col in dataset_final.columns if 'bin' in col]]
    # Non-numeric cells become NaN and are then replaced by 0.
    X = X.apply(pd.to_numeric, errors='coerce').fillna(0)

    if data_prep == 'encoder':
        X = encoder_model_load.predict(X)

    Y = dataset_final[column_names].apply(lambda x: x.astype(int))

    # 85% train / 15% held out, stratified on the number of positive labels per row.
    X_train, X_test_temp, Y_train, Y_test_temp = train_test_split(
        X, Y, test_size=0.15, random_state=42,
        stratify=Y[column_names].sum(axis=1))
    # The held-out part is split 70% validation / 30% test; the test part is
    # not used inside this function.
    X_validation, X_test, Y_validation, Y_test = train_test_split(
        X_test_temp, Y_test_temp, test_size=0.30, random_state=42,
        stratify=Y_test_temp[column_names].sum(axis=1))

    # Seed TensorFlow before the model is built so weight initialisation is repeatable.
    tf.random.set_seed(42)

    # tf.keras is used throughout so the function does not depend on a separate
    # `keras` name being imported by another cell.
    model = tf.keras.Sequential([
        layers.Input(shape=(X_train.shape[1],)),
        layers.Dense(255, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(215, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(165, activation='relu'),
        # One sigmoid output per label column.
        layers.Dense(Y_train.shape[1], activation='sigmoid'),
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', 'f1_score', 'binary_accuracy', 'precision', 'recall', 'binary_crossentropy'])

    if callbacks:
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=10, verbose=0, mode='min',
            start_from_epoch=30, restore_best_weights=True)
        reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss', factor=0.1, patience=7, min_lr=1e-4,
            verbose=0, mode='min')
        callbacks_list = [early_stopping, reduce_lr]
    else:
        callbacks_list = []

    history = model.fit(X_train, Y_train,
                        validation_data=(X_validation, Y_validation),
                        callbacks=callbacks_list,
                        epochs=100, verbose=1, shuffle=True)

    # The previous version also ran model.predict on the validation set and
    # read the last val_loss here, but never used either result; both removed.

    compose_name = f'mlp_{agg_func}_{data_prep}_{current_date}_call_{callbacks}'
    model.save(f'{compose_name}.keras')

    if save_history:
        # Only the plain metrics dict is pickled, not the History object.
        with open(f'{compose_name}_history.pkl', 'wb') as f:
            pickle.dump(history.history, f)

    # validation_data is always supplied to fit, so 'val_loss' is present.
    best_epoch, best_val_loss = find_best_epoch(history)

    _plot_training_history(history, best_epoch, best_val_loss, compose_name)

    return history





In [None]:
callbacks = True
# `current_date` is not assigned in any other cell, so define it here to avoid
# a NameError on a fresh kernel. The YYYY_MM_DD layout follows the date stamps
# already used in this notebook's model file names.
# NOTE: compute_all_comparisons loads the saved models by date stamp; keep the
# stamp it uses in sync with the one generated here.
current_date = datetime.now().strftime('%Y_%m_%d')

# Train one model per (aggregation, feature preparation) combination.
mlp_min_normal_history = compute_model_analysis('min', 'normal', current_date, callbacks=callbacks)
mlp_min_encoder_history = compute_model_analysis('min', 'encoder', current_date, callbacks=callbacks)
mlp_max_normal_history = compute_model_analysis('max', 'normal', current_date, callbacks=callbacks)
mlp_max_encoder_history = compute_model_analysis('max', 'encoder', current_date, callbacks=callbacks)
mlp_mean_normal_history = compute_model_analysis('mean', 'normal', current_date, callbacks=callbacks)
mlp_mean_encoder_history = compute_model_analysis('mean', 'encoder', current_date, callbacks=callbacks)


Epoch 1/100


In [None]:
import matplotlib.pyplot as plt
import numpy as np

plt.figure(figsize=(16, 10))

# Min aggregation, autoencoder features
plt.plot(mlp_min_encoder_history.history['loss'], label='Min Auto encoder - Train', color='green', marker='o', linestyle='dashed')
plt.plot(mlp_min_encoder_history.history['val_loss'], label='Min Auto encoder - Validation', color='green', linestyle='solid')

# Min aggregation, raw ("normal") features.
# The train label used to read 'Min Normal encoder', which is wrong for this
# run, and both Min runs were drawn in green with the same line styles, making
# them indistinguishable; this run now has its own colour.
plt.plot(mlp_min_normal_history.history['loss'], label='Min Normal - Train', color='darkorange', marker='', linestyle='dashed')
plt.plot(mlp_min_normal_history.history['val_loss'], label='Min Normal - Validation', color='darkorange', linestyle='solid')


# Max aggregation, autoencoder features
plt.plot(mlp_max_encoder_history.history['loss'], label='Max - Train', color='blue', marker='', linestyle='dashed')
plt.plot(mlp_max_encoder_history.history['val_loss'], label='Max - Validation', color='blue', linestyle='solid')

# Mean aggregation, autoencoder features
plt.plot(mlp_mean_encoder_history.history['loss'], label='Mean - Train', color='red', marker='', linestyle='dashed')
plt.plot(mlp_mean_encoder_history.history['val_loss'], label='Mean - Validation', color='red', linestyle='solid')


plt.ylabel('Função de Custo')
plt.xlabel('Época')
plt.legend()
# Show y-axis ticks with three decimals.
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:.3f}".format(x)))

In [None]:
plt.figure(figsize=(16, 10))

# One (history, train label, validation label, colour) entry per aggregation,
# drawn in min, max, mean order.
curves = (
    (mlp_min_encoder_history, 'Min - Train', 'Min - Validation', 'green'),
    (mlp_max_encoder_history, 'Max - Train', 'Max - Validation', 'blue'),
    (mlp_mean_encoder_history, 'Mean - Train', 'Mean - Validation', 'red'),
)
for run, train_label, val_label, colour in curves:
    # Training loss is dashed; validation loss uses the default line style.
    plt.plot(run.history['loss'], label=train_label, color=colour, marker='', linestyle='dashed')
    plt.plot(run.history['val_loss'], label=val_label, color=colour)

plt.ylabel('Função de Custo')
plt.xlabel('Época')
plt.legend()
# Three-decimal tick labels on the y axis.
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:.3f}".format(x)))

In [None]:
# Bare references to the three encoder-based training histories.
# NOTE(review): with default notebook settings only the last expression of a
# cell is echoed, so only mlp_max_encoder_history would be displayed -- confirm
# whether all three were meant to be shown.
mlp_min_encoder_history
mlp_mean_encoder_history
mlp_max_encoder_history

In [None]:
# Stand-alone copy of the training-metrics figure.
# NOTE(review): `history`, `best_epoch`, `best_val_loss` and `compose_name` are
# not assigned at notebook level in any visible cell; this presumably relies on
# state from an earlier session -- confirm before running.
plt.figure(figsize=(16, 10))
plt.suptitle(f'Métricas Treinamento - Época Escolhida {best_epoch} - {best_val_loss:.3f} Validação Loss Function')

# (train key, validation key, y-axis label, draw the legend?) for each panel.
panels = (
    ('accuracy', 'val_accuracy', 'Acurácia', True),
    ('loss', 'val_loss', 'Loss', False),
    ('binary_accuracy', 'val_binary_accuracy', 'Acurácia Binarizada', False),
)
for position, (train_key, val_key, ylabel, with_legend) in enumerate(panels, start=1):
    plt.subplot(3, 1, position)
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    # Vertical marker at the selected epoch.
    plt.axvline(x=best_epoch, color='red', linestyle='--')
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    if with_legend:
        plt.legend(['Train', 'Validation'], loc='upper left')
    plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:.3f}".format(x)))

plt.savefig(f'{compose_name}.png')
plt.show()







# Análises Finais

In [None]:
def compute_general_metrics(y_real, y_pred, i=None):
    """Compute classification metrics for all labels at once or for one label.

    Args:
        y_real: True labels. Indexed with ``.iloc[:, i]`` when ``i`` is given,
            so it must be a DataFrame in that case.
        y_pred: Predicted labels, indexed with ``[:, i]`` when ``i`` is given.
        i: Position of a single label column. ``None`` (the default) scores
            all labels together using weighted averages.

    Returns:
        ``[name, accuracy, f1, precision, recall, jaccard, hamming, log_loss]``
        where ``name`` is ``'general'`` or ``column_names[i]``.
    """
    # NOTE: the caller in this notebook passes thresholded 0/1 predictions, so
    # the log loss is computed from hard predictions, not from probabilities.
    if i is None:
        names = 'general'
        accur = metrics.accuracy_score(y_real, y_pred)
        # zero_division=0 matches the per-label branch and avoids warnings for
        # labels that have no positive samples or predictions.
        f1 = metrics.f1_score(y_real, y_pred, average='weighted', zero_division=0)
        prec = metrics.precision_score(y_real, y_pred, average='weighted', zero_division=0)
        rec = metrics.recall_score(y_real, y_pred, average='weighted', zero_division=0)
        jacc = float(metrics.jaccard_score(y_real, y_pred, average='weighted'))
        hamm = metrics.hamming_loss(y_real, y_pred)
        logloss = metrics.log_loss(y_real, y_pred)
    else:
        names = column_names[i]
        y_true_i = y_real.iloc[:, i]
        y_pred_i = y_pred[:, i]
        accur = metrics.accuracy_score(y_true_i, y_pred_i)
        f1 = metrics.f1_score(y_true_i, y_pred_i, zero_division=0)
        prec = metrics.precision_score(y_true_i, y_pred_i, zero_division=0)
        rec = metrics.recall_score(y_true_i, y_pred_i, zero_division=0)
        jacc = float(metrics.jaccard_score(y_true_i, y_pred_i))
        hamm = metrics.hamming_loss(y_true_i, y_pred_i)
        # Naming both classes keeps log_loss from raising when this label has
        # only one class present in the evaluated split.
        logloss = metrics.log_loss(y_true_i, y_pred_i, labels=[0, 1])

    return [names, accur, f1, prec, rec, jacc, hamm, logloss]

In [None]:
from sklearn import metrics

def compute_all_comparisons(func_model, data_model, model_date='2025_08_18', models_dir='/content'):
    """Score one saved MLP on its training rows and on its held-out rows.

    Args:
        func_model: Spectra aggregation the model was trained on
            ('min', 'max' or 'mean'); also passed to ``get_dataset``.
        data_model: ``'encoder'`` to encode the features with
            ``encoder_model_load`` before predicting; any other value uses
            the features unchanged.
        model_date: Date stamp embedded in the saved model's file name.
        models_dir: Directory that contains the saved ``.keras`` files.

    Returns:
        DataFrame with one 'general' row plus one row per functional group for
        each of the 'train' and 'test' splits, with the metric columns and the
        'data' and 'model' identifier columns.
    """
    model_prefix = f'mlp_{func_model}_{data_model}'
    model_name = f'{model_prefix}_{model_date}_call_True'
    model = tf.keras.models.load_model(f'{models_dir}/{model_name}.keras')

    dataset_final = get_dataset(df_spectra_all, dataset_y, func_model)

    X = dataset_final[[col for col in dataset_final.columns if 'bin' in col]]
    X = X.apply(pd.to_numeric, errors='coerce').fillna(0)

    Y = dataset_final[column_names].apply(lambda x: x.astype(int))

    # Use the same split parameters as compute_model_analysis (15% held out,
    # stratified, random_state=42). The previous 0.25 / unstratified split did
    # not match the training partition, so "test" rows overlapped rows the
    # model had been fitted on.
    X_train, X_test_temp, Y_train, Y_test_temp = train_test_split(
        X, Y, test_size=0.15, random_state=42,
        stratify=Y[column_names].sum(axis=1))
    # NOTE(review): as before, the 'test' metrics cover the whole held-out part
    # (validation + test); the validation rows were used for early stopping.
    # Confirm whether only the final test partition should be reported.

    if data_model == 'encoder':
        X_train = pd.DataFrame(encoder_model_load.predict(X_train))
        X_test_temp = pd.DataFrame(encoder_model_load.predict(X_test_temp))

    # Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels.
    train_predict = (model.predict(X_train) > 0.5).astype(int)
    test_predict = (model.predict(X_test_temp) > 0.5).astype(int)

    metrics_list = ['accuracy', 'f1_score', 'precision', 'recall', 'jaccard', 'hamming', 'log_loss']

    frames = []
    for split_name, y_true, y_hat in (('train', Y_train, train_predict),
                                      ('test', Y_test_temp, test_predict)):
        # First the overall row, then one row per functional group.
        rows = [compute_general_metrics(y_true, y_hat)]
        rows += [compute_general_metrics(y_true, y_hat, idx) for idx in range(len(column_names))]
        frame = pd.DataFrame(rows, columns=['metric'] + metrics_list)
        frame['data'] = split_name
        frames.append(frame)

    merged_df = pd.concat(frames, ignore_index=True)
    merged_df['model'] = model_prefix

    return merged_df

# Evaluate every (aggregation, feature preparation) combination, iterating the
# aggregations in the outer position and the feature preparations in the inner.
results_list = [
    compute_all_comparisons(func_model, data_model)
    for func_model in ['min', 'max', 'mean']
    for data_model in ['encoder', 'normal']
]


In [None]:
# Stack the per-model metric tables into one frame.
all_metrics = pd.concat(results_list)
# all_metrics[(all_metrics['model'] == 'mlp_min_normal') & (all_metrics['data'] == 'test')][['metric','accuracy','f1_score','hamming','data']]
# Show the test-split rows ordered by Hamming loss, then by model name.
all_metrics.loc[all_metrics['data'] == 'test'].sort_values(by=['hamming', 'model'], ascending=True)

In [None]:
# Rebuild the combined metrics table and export it without the row index.
all_metrics = pd.concat(objs=results_list)
# .groupby(['model','data','metric'])
all_metrics.to_csv(path_or_buf='all_metrics_mlp.csv', index=False)

In [None]:
# Aggregate per (model, data, metric) and export.
# as_index=False keeps the grouping keys as ordinary columns; previously they
# sat in the index and were dropped by index=False, leaving rows in the CSV
# that could not be attributed to a model, split or metric.
all_metrics.groupby(['model', 'data', 'metric'], as_index=False).sum().to_csv('all_metrics_mlp_agg.csv', index=False)