# Import

In [None]:
# Mount Google Drive at /content/drive (only works inside a Google Colab runtime).
# NOTE(review): the path constants defined later are built from os.getcwd(), not from this
# mount point — confirm the working directory is the project folder before running.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import re
import pickle
from time import sleep
from datetime import datetime
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, losses, Model, Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, GRU, Conv1D, MaxPooling1D, Flatten
from IPython import display
import math

# Relative path configuration: every folder is resolved from the current working directory
BASE_PATH = os.getcwd()
DATASET_DIR, MODELS_DIR, RESULTS_DIR = (
    os.path.join(BASE_PATH, folder) for folder in ('datasets', 'models', 'results')
)

# History directory (optional): fall back to the results folder when it does not exist
HISTORY_DIR = os.path.join(BASE_PATH, 'history')
if not os.path.exists(HISTORY_DIR):
    HISTORY_DIR = RESULTS_DIR

# Functional groups and the SMARTS pattern associated with each one
func_grp_smarts = dict([
    ('alkane', '[CX4;H0,H1,H2,H4]'),
    ('methyl', '[CH3]'),
    ('alkene', '[CX3]=[CX3]'),
    ('alkyne', '[CX2]#C'),
    ('alcohols', '[#6][OX2H]'),
    ('amines', '[NX3;H2,H1;!$(NC=O)]'),
    ('nitriles', '[NX1]#[CX2]'),
    ('aromatics', '[$([cX3](:*):*),$([cX2+](:*):*)]'),
    ('alkyl halides', '[#6][F,Cl,Br,I]'),
    ('esters', '[#6][CX3](=O)[OX2H0][#6]'),
    ('ketones', '[#6][CX3](=O)[#6]'),
    ('aldehydes', '[CX3H1](=O)[#6]'),
    ('carboxylic acids', '[CX3](=O)[OX2H1]'),
    ('ether', '[OD2]([#6])[#6]'),
    ('acyl halides', '[CX3](=[OX1])[F,Cl,Br,I]'),
    ('amides', '[NX3][CX3](=[OX1])[#6]'),
    ('nitro', '[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]'),
])

# Label column names, in the same order as the dictionary above
column_names = list(func_grp_smarts)

# Locations of the data files and of the saved autoencoder
DATASET_PATH = os.path.join(DATASET_DIR, 'spectras')
ENRICH_CSV = os.path.join(DATASET_DIR, 'df_enrich.csv')
SPECTRA_CSV = os.path.join(DATASET_DIR, 'df_spectra_all_mixture_interpolate.csv')
AUTOENCODER_MODEL_PATH = os.path.join(MODELS_DIR, 'autoencoder_model_2025_04_07.keras')

In [None]:
# Load the saved autoencoder and print its layer summary.
autoencoder_load = keras.models.load_model(AUTOENCODER_MODEL_PATH)
autoencoder_load.summary()
# Encoder = sub-model from the autoencoder input up to the output of layers[4].
# NOTE(review): index 4 is the fifth entry of `layers`; whether that matches "the first 4 layers"
# depends on whether the list includes the input layer — confirm against the summary printed above.
encoder_model_load = keras.Model(inputs=autoencoder_load.input, outputs=autoencoder_load.layers[4].output)  # Assuming the encoder is the first 4 layers

In [None]:
# Read the label table and build a working copy indexed by its 'CAS' column
# (the column itself is kept alongside the index).
df_enrich = pd.read_csv(ENRICH_CSV)
dataset_y = df_enrich.set_index('CAS', drop=False)

# Number of distinct values in the 'CAS' column
len(dataset_y['CAS'].unique())

8241

In [None]:
# Read the spectra table; columns are grouped below by the aggregation keyword in their name
df_spectra_all = pd.read_csv(SPECTRA_CSV)

mean_cols = [name for name in df_spectra_all.columns if 'mean' in name]
min_cols = [name for name in df_spectra_all.columns if 'min' in name]
max_cols = [name for name in df_spectra_all.columns if 'max' in name]
all_cols = [*mean_cols, *min_cols, *max_cols]

df_spectra_all.shape

(1030, 24636)

# Functions

In [None]:
def get_dataset(df_spectra_all, dataset_y, agg_func):
    """Join the label table with one aggregation of the spectra table.

    The spectra columns whose name contains ``agg_func`` are selected from
    ``df_spectra_all`` itself (not from module-level column lists), transposed
    so that each selected column becomes a row, and the former row labels
    become columns named ``bin_<label>``. Each row is keyed by the text before
    the first underscore of its original column name; that original name is
    kept in a column called ``index``.

    Args:
        df_spectra_all: DataFrame with one column per spectrum/aggregation.
        dataset_y: Label DataFrame with a ``yunits`` column; its index is
            matched against the keys described above.
        agg_func: One of ``'mean'``, ``'min'`` or ``'max'``.

    Returns:
        DataFrame produced by an inner join (on index) of the rows of
        ``dataset_y`` whose ``yunits`` equals ``'ABSORBANCE'`` with the
        transposed spectra.

    Raises:
        ValueError: If ``agg_func`` is not one of the supported values.
    """
    supported = ('mean', 'min', 'max')
    if agg_func not in supported:
        # Previously an unknown value fell through and failed later with UnboundLocalError.
        raise ValueError(f"agg_func must be one of {supported}, got {agg_func!r}")

    # Same substring rule the notebook uses to build mean_cols / min_cols / max_cols.
    agg_cols = [col for col in df_spectra_all.columns if agg_func in col]

    # Selecting with a list returns a new frame, so the caller's data is never modified.
    dataset_x = df_spectra_all[agg_cols].T
    dataset_x.columns = ['bin_' + str(x) for x in dataset_x.columns]
    dataset_x = dataset_x.reset_index()
    dataset_x.index = dataset_x['index'].apply(lambda x: x.split('_')[0])

    absorbance_y = dataset_y[dataset_y['yunits'] == 'ABSORBANCE']

    return pd.merge(absorbance_y, dataset_x, left_index=True, right_index=True, how='inner')

def find_best_epoch(history):
    """
    Finds the epoch with the lowest validation loss.

    Args:
        history: Training history object from Keras model.fit() (anything
            exposing a ``history`` dict with a ``'val_loss'`` list).

    Returns:
        A tuple ``(best_epoch, best_val_loss)`` where ``best_epoch`` is the
        zero-based index (a plain ``int``) into the ``'val_loss'`` list.
        Returns None if the history object is missing, has no ``'val_loss'``
        entry, or the list is empty or contains only NaN values.
    """
    history_dict = getattr(history, 'history', None)
    if not history_dict or 'val_loss' not in history_dict:
        return None

    val_losses = list(history_dict['val_loss'])
    losses = np.asarray(val_losses, dtype=float)

    # np.argmin raises on an empty sequence and treats NaN as the minimum,
    # so both cases are handled explicitly.
    if losses.size == 0 or np.all(np.isnan(losses)):
        return None

    best_epoch = int(np.nanargmin(losses))  # Index of the minimum finite validation loss
    return best_epoch, val_losses[best_epoch]

In [7]:
agg_func = 'max'
current_date = datetime.now().strftime('%Y_%m_%d')

dataset_final = get_dataset(df_spectra_all, dataset_y, agg_func)

# Features: every column whose name contains 'bin', coerced to numeric with gaps filled by 0
X = dataset_final[[col for col in dataset_final.columns if 'bin' in col]]
X = X.apply(pd.to_numeric, errors='coerce').fillna(0)

# Targets: one integer column per functional group
Y = dataset_final[column_names].astype(int)

# First split: 75% train / 25% hold-out; second split divides the hold-out into validation and test
X_train, X_test_temp, Y_train, Y_test_temp = train_test_split(
    X, Y, test_size=0.25, random_state=42
)
X_validation, X_test, Y_validation, Y_test = train_test_split(
    X_test_temp, Y_test_temp, test_size=0.45, random_state=42
)

# Append a trailing axis of length 1 so every array is (samples, bins, 1)
X_train = np.array(X_train)[..., np.newaxis]
X_validation = np.array(X_validation)[..., np.newaxis]
X_test = np.array(X_test)[..., np.newaxis]
X_test_temp = np.array(X_test_temp)[..., np.newaxis]



In [None]:
def compute_model_analysis(agg_func, data_prep, current_date, batch_size=380, epochs=100, callbacks=False, save_history=True, save_figure=False):
    """Train a Conv1D multi-label classifier and persist the model, history and figure.

    The dataset is built with ``get_dataset`` for the given aggregation, split
    80/20 into train and hold-out (stratified on the number of positive labels
    per row), and 70% of the hold-out is used as the validation set.

    Args:
        agg_func: Aggregation passed to ``get_dataset`` ('mean', 'min' or 'max').
        data_prep: When equal to ``'encoder'`` the features are first passed
            through ``encoder_model_load``; any other value uses them as-is.
        current_date: String embedded in the names of the saved files.
        batch_size: Batch size passed to ``model.fit``.
        epochs: Maximum number of epochs passed to ``model.fit``.
        callbacks: When truthy, train with EarlyStopping and ReduceLROnPlateau.
        save_history: When truthy, pickle ``history.history`` next to the results.
        save_figure: When truthy, save and show the training-curve figure.

    Returns:
        The Keras ``History`` object returned by ``model.fit``.
    """
    dataset_final = get_dataset(df_spectra_all, dataset_y, agg_func)

    X = dataset_final[[col for col in dataset_final.columns if 'bin' in col]]
    X = X.apply(pd.to_numeric, errors='coerce').fillna(0)

    if data_prep == 'encoder':
        X = encoder_model_load.predict(X)

    Y = dataset_final[column_names].apply(lambda x: x.astype(int))
    X_train, X_test_temp, Y_train, Y_test_temp = train_test_split(X, Y, test_size=0.20, random_state=42, stratify=Y[column_names].sum(axis=1))

    # Only the validation part of the hold-out is needed here; the test part is not used for training.
    X_validation, _, Y_validation, _ = train_test_split(X_test_temp, Y_test_temp, test_size=0.30, random_state=42, stratify=Y_test_temp[column_names].sum(axis=1))
    data_type = np.float32

    # Reshape input data for Conv1D: add a trailing axis of length 1
    X_train_reshaped = np.expand_dims(np.array(X_train, dtype=data_type), axis=2)
    X_validation_reshaped = np.expand_dims(np.array(X_validation, dtype=data_type), axis=2)

    model = Sequential([
        layers.Input(shape=(X_train_reshaped.shape[1], 1)),
        layers.Conv1D(filters=36, kernel_size=6, activation='relu', padding='same'),
        layers.MaxPooling1D(pool_size=3),
        layers.Conv1D(filters=36, kernel_size=6, activation='relu', padding='same'),
        layers.MaxPooling1D(pool_size=3),
        layers.Conv1D(filters=128, kernel_size=6, activation='relu', padding='same'),
        layers.MaxPooling1D(pool_size=2),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(Y_train.shape[1], activation='sigmoid')
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', 'binary_accuracy', 'precision', 'recall', 'binary_crossentropy'])

    # Callbacks (only attached when requested)
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='min', start_from_epoch=50, restore_best_weights=True)
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, min_lr=1e-8, verbose=0, mode='min')
    callbacks_list = [early_stopping, reduce_lr] if callbacks else []

    history = model.fit(X_train_reshaped, Y_train,
                        validation_data=(X_validation_reshaped, Y_validation),
                        callbacks=callbacks_list,
                        epochs=epochs, verbose=1, shuffle=True, batch_size=batch_size)

    # Persist the trained model; create the folder first so a long run is not lost at save time.
    compose_name = f'conv1d_{agg_func}_{data_prep}_{current_date}_call_{callbacks}'
    os.makedirs(MODELS_DIR, exist_ok=True)
    model_save_path = os.path.join(MODELS_DIR, f'{compose_name}.keras')
    model.save(model_save_path)

    if save_history:
        # Use HISTORY_DIR when the notebook defines it, otherwise the results folder.
        history_dir = globals().get('HISTORY_DIR', RESULTS_DIR)
        os.makedirs(history_dir, exist_ok=True)
        history_save_path = os.path.join(history_dir, f'{compose_name}_history.pkl')
        with open(history_save_path, 'wb') as f:
            pickle.dump(history.history, f)

    if save_figure:
        best = find_best_epoch(history)
        if best is not None:
            best_epoch, best_val_loss = best

            fig, axes = plt.subplots(3, 1, figsize=(16, 10))
            fig.suptitle(f'Métricas Treinamento - Época Escolhida {best_epoch} - {best_val_loss:.3f} Validação Loss Function')

            # One panel per tracked quantity: (key in history.history, y-axis label)
            panels = [('accuracy', 'Acurácia'), ('loss', 'Loss'), ('binary_accuracy', 'Acurácia Binarizada')]
            for ax, (metric_key, y_label) in zip(axes, panels):
                ax.plot(history.history[metric_key])
                ax.plot(history.history[f'val_{metric_key}'])
                ax.axvline(x=best_epoch, color='red', linestyle='--')  # marks the selected epoch
                ax.set_ylabel(y_label)
                ax.set_xlabel('Epoch')
                ax.legend(['Train', 'Validation'], loc='upper left')
                ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:.3f}".format(x)))

            os.makedirs(RESULTS_DIR, exist_ok=True)
            fig_save_path = os.path.join(RESULTS_DIR, f'{compose_name}.png')
            fig.savefig(fig_save_path)
            plt.show()
            # Release the figure; this function is called once per model configuration.
            plt.close(fig)

    return history

In [9]:
callbacks = False
epochs = 100
batch_size = 600

# Train one model per (aggregation, input representation) pair, in the same order as before,
# and keep every returned history instead of discarding it.
# NOTE(review): compute_all_comparisons loads files whose name ends in `call_True` by default;
# with callbacks = False the files saved here end in `call_False` — confirm which is intended.
training_histories = {}
for agg_name in ('min', 'max', 'mean'):
    for prep_name in ('normal', 'encoder'):
        training_histories[(agg_name, prep_name)] = compute_model_analysis(
            agg_name, prep_name, current_date,
            batch_size=batch_size, epochs=epochs, callbacks=callbacks,
        )


Epoch 1/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 1s/step - accuracy: 0.5265 - binary_accuracy: 0.7339 - binary_crossentropy: 0.6245 - f1_score: 0.0568 - loss: 0.6245 - precision: 0.3922 - recall: 0.4688 - val_accuracy: 0.7119 - val_binary_accuracy: 0.8575 - val_binary_crossentropy: 0.3873 - val_f1_score: 0.0544 - val_loss: 0.3873 - val_precision: 0.6664 - val_recall: 0.5670
Epoch 2/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1s/step - accuracy: 0.5836 - binary_accuracy: 0.8632 - binary_crossentropy: 0.3707 - f1_score: 0.0856 - loss: 0.3707 - precision: 0.6961 - recall: 0.5468 - val_accuracy: 0.5883 - val_binary_accuracy: 0.8727 - val_binary_crossentropy: 0.3262 - val_f1_score: 0.0999 - val_loss: 0.3262 - val_precision: 0.7551 - val_recall: 0.5322
Epoch 3/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1s/step - accuracy: 0.5736 - binary_accuracy: 0.8724 - binary_crossentropy: 0.3197 - f1_score: 0.0972 - loss: 0.3

<keras.src.callbacks.history.History at 0x7905f038adb0>

# Comparisons

In [10]:
def compute_general_metrics(y_real,y_pred,i=None):
    """Compute classification metrics for all labels together or for one label.

    Args:
        y_real: True binary labels, one column per entry of ``column_names``
            (DataFrame or 2-D array).
        y_pred: Predicted binary labels with the same layout as ``y_real``.
        i: ``None`` to score all labels at once (weighted averages), or the
            integer position of a single label column to score.

    Returns:
        ``[name, accuracy, f1, precision, recall, jaccard, hamming, log_loss]``
        where ``name`` is ``'general'`` or ``column_names[i]``.
    """
    if i is None:
        names = 'general'
        accur = metrics.accuracy_score(y_real, y_pred)
        f1 = metrics.f1_score(y_real, y_pred, average='weighted', zero_division=0)
        prec = metrics.precision_score(y_real, y_pred, average='weighted', zero_division=0)
        rec = metrics.recall_score(y_real, y_pred, average='weighted', zero_division=0)
        jacc = float(metrics.jaccard_score(y_real, y_pred, average='weighted', zero_division=0))
        hamm = metrics.hamming_loss(y_real, y_pred)
        # NOTE(review): y_pred holds thresholded 0/1 values, not probabilities, so this
        # log-loss is of limited meaning — consider passing the raw sigmoid outputs instead.
        log_loss = metrics.log_loss(y_real, y_pred)

    else:
        names = column_names[i]
        # np.asarray lets the function accept both DataFrames and plain arrays.
        true_col = np.asarray(y_real)[:, i]
        pred_col = np.asarray(y_pred)[:, i]
        accur = metrics.accuracy_score(true_col, pred_col)
        f1 = metrics.f1_score(true_col, pred_col, zero_division=0)
        prec = metrics.precision_score(true_col, pred_col, zero_division=0)
        rec = metrics.recall_score(true_col, pred_col, zero_division=0)
        jacc = float(metrics.jaccard_score(true_col, pred_col, zero_division=0))
        hamm = metrics.hamming_loss(true_col, pred_col)
        # Explicit labels: a label with no positives (or no negatives) in this split
        # would otherwise make log_loss raise because only one class is present.
        log_loss = metrics.log_loss(true_col, pred_col, labels=[0, 1])

    return [names, accur, f1, prec, rec, jacc, hamm, log_loss]

In [None]:
def compute_all_comparisons(func_model, data_model, model_date='2025_07_22', callbacks=True):
    """Load a saved Conv1D model and score it on the train and hold-out partitions.

    The file loaded is ``conv1d_<func_model>_<data_model>_<model_date>_call_<callbacks>.keras``
    inside ``MODELS_DIR``, i.e. the naming scheme used by ``compute_model_analysis``.

    Args:
        func_model: Aggregation used to build the dataset ('mean', 'min' or 'max').
        data_model: ``'encoder'`` to pass the features through ``encoder_model_load``
            before predicting; any other value uses them as-is.
        model_date: Date string embedded in the saved model's file name.
        callbacks: Value of the ``callbacks`` flag embedded in the file name.

    Returns:
        DataFrame with one row per (partition, label) plus a ``'general'`` row per
        partition; columns are ``metric``, the metric values, ``data``
        ('train' or 'test') and ``model``.
    """
    model_prefix = f'conv1d_{func_model}_{data_model}'
    model_name = f'{model_prefix}_{model_date}_call_{callbacks}'
    model_path = os.path.join(MODELS_DIR, f'{model_name}.keras')
    model = keras.models.load_model(model_path)

    dataset_final = get_dataset(df_spectra_all, dataset_y, func_model)

    X = dataset_final[[col for col in dataset_final.columns if 'bin' in col]]
    X = X.apply(pd.to_numeric, errors='coerce').fillna(0)

    Y = dataset_final[column_names].apply(lambda x: x.astype(int))

    # Same partition as compute_model_analysis (20% hold-out, stratified, seed 42); a different
    # split here would put rows the model was trained on into the 'test' metrics.
    X_train, X_test_temp, Y_train, Y_test_temp = train_test_split(
        X, Y, test_size=0.20, random_state=42, stratify=Y[column_names].sum(axis=1)
    )

    if data_model == 'encoder':
        X_train = encoder_model_load.predict(X_train)
        X_test_temp = encoder_model_load.predict(X_test_temp)

    # Feed the model the same layout used for training: float32 with a trailing axis of length 1.
    train_inputs = np.expand_dims(np.array(X_train, dtype=np.float32), axis=2)
    holdout_inputs = np.expand_dims(np.array(X_test_temp, dtype=np.float32), axis=2)

    train_predict = (model.predict(train_inputs) > 0.5).astype(int)
    # NOTE(review): 'test' is the whole hold-out, which includes the rows used as
    # validation data during training — confirm this is the intended evaluation set.
    test_predict = (model.predict(holdout_inputs) > 0.5).astype(int)

    metrics_list = ['accuracy','f1_score','precision', 'recall', 'jaccard','hamming', 'log_loss']

    def _metrics_frame(y_true, y_hat, partition):
        # One 'general' row followed by one row per label, tagged with the partition name.
        rows = [compute_general_metrics(y_true, y_hat)]
        rows += [compute_general_metrics(y_true, y_hat, idx) for idx in range(len(column_names))]
        frame = pd.DataFrame(rows)
        frame.columns = ['metric'] + metrics_list
        frame['data'] = partition
        return frame

    merged_df = pd.concat(
        [_metrics_frame(Y_train, train_predict, 'train'),
         _metrics_frame(Y_test_temp, test_predict, 'test')],
        ignore_index=True,
    )
    merged_df['model'] = model_prefix

    return merged_df

# Score every saved model: aggregations in the outer order, input representations in the inner one
results_list = []
model_grid = [(agg, prep) for agg in ['min','max','mean'] for prep in ['encoder','normal']]
for func_model, data_model in model_grid:
    results_list.append(compute_all_comparisons(func_model, data_model))

ValueError: File not found: filepath=/content/conv1d_min_encoder_2025_07_22_call_True.keras. Please ensure the file is an accessible `.keras` zip file.

In [None]:
# Stack the per-model tables, then average the metric columns per (model, data, metric) group
combined_metrics = pd.concat(results_list)
all_metrics = combined_metrics.groupby(['model','data','metric']).mean()
all_metrics