In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
from scipy.signal import savgol_filter
import scipy.sparse as sp
import scipy.sparse.linalg as spla
import matplotlib.pyplot as plt

# =========================================================
# 0. Funções de pré-processamento padrão
# =========================================================

def cut_region(wavenumbers, spectra, wn_min=400, wn_max=1800):
    """Keep only the spectral points with wn_min <= wavenumber <= wn_max.

    Parameters
    ----------
    wavenumbers : array-like, shape (n_points,)
        Spectral axis; converted to float.
    spectra : array-like, shape (n_points,) or (..., n_points)
        One spectrum or a stack of spectra whose LAST axis follows
        ``wavenumbers``; converted to float.
    wn_min, wn_max : float
        Inclusive bounds of the window to keep.

    Returns
    -------
    tuple of ndarray
        ``(wavenumbers_in_window, spectra_in_window)``; the spectra keep all
        leading axes and only the last axis is reduced.
    """
    wavenumbers = np.asarray(wavenumbers, dtype=float)
    spectra = np.asarray(spectra, dtype=float)
    mask = (wavenumbers >= wn_min) & (wavenumbers <= wn_max)
    # Ellipsis indexing selects along the last axis, so a single 1-D spectrum
    # works as well as the usual (n_spectra, n_points) matrix.
    return wavenumbers[mask], spectra[..., mask]

def snv(X):
    """Standard normal variate: centre and scale each spectrum individually.

    Parameters
    ----------
    X : array-like, shape (n_points,) or (..., n_points)
        Spectra laid out along the last axis; converted to float.

    Returns
    -------
    ndarray
        Same shape as ``X``. Each spectrum has its own mean subtracted and is
        divided by its own (population) standard deviation. A spectrum with
        zero standard deviation (constant signal) is returned as all zeros
        instead of NaN/inf.
    """
    X = np.asarray(X, dtype=float)
    mean = X.mean(axis=-1, keepdims=True)
    std = X.std(axis=-1, keepdims=True)
    # A constant spectrum has std == 0; dividing by 1 instead leaves the
    # already-centred values (all zeros) untouched and avoids 0/0.
    safe_std = np.where(std == 0, 1.0, std)
    return (X - mean) / safe_std

def baseline_asls(y, lam=1e5, p=0.001, niter=10):
    """Estimate a smooth baseline by iteratively reweighted penalized least squares.

    Each iteration solves ``(W + lam * D.T @ D) z = W y`` where ``D`` is the
    second-difference operator and ``W`` is a diagonal weight matrix, then
    updates the weights asymmetrically from the sign of ``y - z``.

    Parameters
    ----------
    y : array-like
        Signal; flattened to 1-D and converted to float.
    lam : float
        Multiplier of the second-difference penalty (larger -> stiffer baseline).
    p : float
        Weight given to points lying above the current baseline; points at or
        below it get ``1 - p``.
    niter : int
        Number of reweighting iterations; must be at least 1.

    Returns
    -------
    ndarray, shape (y.size,)
        The baseline estimate from the last iteration.

    Raises
    ------
    ValueError
        If ``niter`` is smaller than 1 (no baseline would be computed).
    """
    if niter < 1:
        raise ValueError("niter must be >= 1")

    y = np.asarray(y, dtype=float).ravel()
    L = y.size
    D = sp.diags([1, -2, 1], [0, 1, 2], shape=(L - 2, L))
    # The smoothness penalty does not depend on the weights: build it once.
    penalty = lam * (D.T @ D)
    w = np.ones(L)
    for _ in range(niter):
        W = sp.diags(w, 0, shape=(L, L))
        z = spla.spsolve(W + penalty, w * y)
        # Points exactly ON the baseline must keep the "below" weight (1 - p).
        # Giving them weight 0 makes W vanish for flat/linear signals, which
        # leaves a singular system and a NaN baseline on the next iteration.
        w = np.where(y > z, p, 1.0 - p)
    return z

def apply_pipeline_standard(wavenumbers, X_raw,
                            wn_min=400, wn_max=1800,
                            sg_window=11, sg_poly=2,
                            asls_lam=1e5, asls_p=0.001, asls_niter=10):
    """Run the reference preprocessing chain on a matrix of spectra.

    Stages, in order: crop to [wn_min, wn_max], Savitzky-Golay smoothing
    along axis 1, per-spectrum baseline subtraction with ``baseline_asls``,
    and finally ``snv``.

    Parameters
    ----------
    wavenumbers : array-like
        Spectral axis handed to ``cut_region``.
    X_raw : array-like
        Raw spectra, one per row.
    wn_min, wn_max : float
        Bounds of the retained window.
    sg_window, sg_poly : int
        ``window_length`` and ``polyorder`` passed to ``savgol_filter``.
    asls_lam, asls_p, asls_niter
        Forwarded to ``baseline_asls`` as ``lam``, ``p`` and ``niter``.

    Returns
    -------
    tuple
        ``(wn_cut, X_snv, X_cut)``: the cropped axis, the fully processed
        spectra, and the cropped-but-otherwise-raw spectra.
    """
    # Stage 1: crop to the requested spectral window.
    wn_cut, X_cut = cut_region(wavenumbers, X_raw, wn_min, wn_max)

    # Stage 2: Savitzky-Golay smoothing of every row.
    smoothed = savgol_filter(
        X_cut, window_length=sg_window, polyorder=sg_poly, axis=1
    )

    # Stage 3: estimate and subtract a baseline, one spectrum at a time.
    baseline_free = np.zeros_like(smoothed)
    for row, spectrum in enumerate(smoothed):
        baseline = baseline_asls(
            spectrum, lam=asls_lam, p=asls_p, niter=asls_niter
        )
        baseline_free[row, :] = spectrum - baseline

    # Stage 4: per-spectrum standard normal variate.
    return wn_cut, snv(baseline_free), X_cut

# =========================================================
# 1. Leitura do CSV completo
# =========================================================

# The file is read without a header row: row 0 is used as the spectral axis
# (from column 1 onwards) and every later row is one sample, with column 0
# taken as its label and the remaining columns as intensities.
csv_path = "NAME.csv"
df = pd.read_csv(csv_path, header=None)

# Spectral axis: first row, skipping the label column.
wavenumbers_full = df.iloc[0, 1:].astype(float).to_numpy()

# Sample rows, re-indexed from 0.
df_data = df.iloc[1:].reset_index(drop=True)
labels_all = df_data.iloc[:, 0].to_numpy()
X_raw_full = df_data.iloc[:, 1:].astype(float).to_numpy()

print("Total de espectros:", X_raw_full.shape[0])

# =========================================================
# 2. Dividir em dois subconjuntos: A (treino) e B (aplicação)
#    Exemplo: 70% para A, 30% para B
# =========================================================

# Split row indices (not the data itself) so that idx_A / idx_B can later be
# used to map subset rows back to rows of the full matrix.
indices = np.arange(len(X_raw_full))
idx_A, idx_B = train_test_split(indices, test_size=0.3, random_state=42)

# Subset A: used to train the autoencoder.
X_raw_A, labels_A = X_raw_full[idx_A], labels_all[idx_A]

# Subset B: only ever passed through the trained autoencoder.
X_raw_B, labels_B = X_raw_full[idx_B], labels_all[idx_B]

print("Conjunto A (treino AE):", len(X_raw_A), "espectros")
print("Conjunto B (aplicação AE):", len(X_raw_B), "espectros")

# =========================================================
# 3. Gerar padrão ouro apenas no conjunto A
# =========================================================

# Settings of the classical pipeline used to build the training targets.
# Note that sg_window=7 is passed here, not the function default of 11.
pipeline_settings = {
    "wn_min": 400,
    "wn_max": 1800,
    "sg_window": 7,
    "sg_poly": 2,
    "asls_lam": 1e5,
    "asls_p": 0.001,
    "asls_niter": 10,
}

# X_cut_A (cropped raw spectra) is the network input;
# X_target_A (fully processed spectra) is the regression target.
wn_cut, X_target_A, X_cut_A = apply_pipeline_standard(
    wavenumbers_full, X_raw_A, **pipeline_settings
)

print("Região cortada:", wn_cut[0], "a", wn_cut[-1], "cm-1")
print("X_cut_A (entrada AE):", X_cut_A.shape)
print("X_target_A (padrão ouro):", X_target_A.shape)

# =========================================================
# 4. Preparar dados de treino/val/teste dentro do conjunto A
# =========================================================

# z-score por variável de entrada e alvo
# Per-variable (per-wavenumber) z-score of both the input and the target.
# NOTE(review): the statistics are computed on all of A before the
# train/val/test split below, so validation and test rows contribute to the
# scaling — confirm this is acceptable for the intended evaluation.
mean_in = X_cut_A.mean(axis=0, keepdims=True)
std_in = X_cut_A.std(axis=0, keepdims=True)
# A wavenumber that is constant across A has std == 0; dividing by it would
# put NaN/inf into the network input. Using 1 instead maps that column to 0,
# and the inverse transform (x * std + mean) still returns the column mean.
std_in = np.where(std_in == 0, 1.0, std_in)
X_in_A_scaled = (X_cut_A - mean_in) / std_in

mean_tgt = X_target_A.mean(axis=0, keepdims=True)
std_tgt = X_target_A.std(axis=0, keepdims=True)
std_tgt = np.where(std_tgt == 0, 1.0, std_tgt)  # same zero-variance guard
X_target_A_scaled = (X_target_A - mean_tgt) / std_tgt

# Add the trailing channel axis expected by the 1-D convolution layers.
X_in_A_scaled = X_in_A_scaled[..., np.newaxis]
X_target_A_scaled = X_target_A_scaled[..., np.newaxis]

# Split A into train (70%) and a held-out remainder (30%) ...
X_train_in, X_temp_in, X_train_tgt, X_temp_tgt = train_test_split(
    X_in_A_scaled, X_target_A_scaled, test_size=0.3, random_state=42
)
# ... then halve the remainder into validation and test.
X_val_in, X_test_in, X_val_tgt, X_test_tgt = train_test_split(
    X_temp_in, X_temp_tgt, test_size=0.5, random_state=42
)

print("Treino A:", X_train_in.shape,
      "Val A:", X_val_in.shape,
      "Teste A:", X_test_in.shape)

# =========================================================
# 5. Definir e treinar o AE (apenas usando conjunto A)
# =========================================================

input_shape = X_train_in.shape[1:]  # per-sample shape without the batch axis: (n_wn_cut, 1)

def build_conv_autoencoder(input_shape):
    """Build the 1-D convolutional encoder/decoder model named "conv_AE_FTIR".

    Encoder: three ``Conv1D`` layers (16, 32, 64 filters; strides 2, 3, 3).
    Decoder: three ``Conv1DTranspose`` layers mirroring the encoder
    (64, 32, 16 filters; strides 3, 3, 2) followed by a linear
    single-filter ``Conv1DTranspose``. All layers use kernel size 10 and
    "same" padding. The output is cropped (and, if ever shorter, zero-padded)
    at its end so that its length equals ``input_shape[0]``.

    Parameters
    ----------
    input_shape : tuple
        Per-sample shape ``(n_points, n_channels)``.

    Returns
    -------
    Uncompiled Keras ``Model`` mapping the input tensor to the decoded output.
    """
    target_len = input_shape[0]
    kernel = 10

    inp = layers.Input(shape=input_shape)

    # Encoder: strided convolutions progressively shorten the sequence.
    x = inp
    for n_filters, stride in ((16, 2), (32, 3), (64, 3)):
        x = layers.Conv1D(n_filters, kernel_size=kernel, strides=stride,
                          padding="same", activation="relu")(x)
    encoded = x

    # Decoder: transposed convolutions with the encoder strides in reverse.
    x = encoded
    for n_filters, stride in ((64, 3), (32, 3), (16, 2)):
        x = layers.Conv1DTranspose(n_filters, kernel_size=kernel,
                                   strides=stride, padding="same",
                                   activation="relu")(x)

    # Linear projection back to a single channel.
    x = layers.Conv1DTranspose(1, kernel_size=kernel, strides=1,
                               padding="same", activation=None)(x)

    # Trim any surplus samples from the end of the sequence.
    surplus = max(0, x.shape[1] - target_len)
    x = layers.Cropping1D(cropping=(0, surplus))(x)

    # Pad at the end if the decoded sequence is shorter than the input.
    shortfall = target_len - x.shape[1]
    if shortfall > 0:
        x = layers.ZeroPadding1D(padding=(0, shortfall))(x)

    return models.Model(inp, x, name="conv_AE_FTIR")

# Instantiate the network and print its layer table.
autoencoder = build_conv_autoencoder(input_shape)
autoencoder.summary()

# Mean-squared error between the network output and the scaled targets.
adam = tf.keras.optimizers.Adam(learning_rate=1e-3)
autoencoder.compile(optimizer=adam, loss="mse")

# Stop once val_loss has not improved for 20 epochs and roll back to the
# weights of the best epoch.
early_stop = callbacks.EarlyStopping(
    monitor="val_loss", patience=20, restore_best_weights=True
)

# Inputs are the scaled cropped spectra; targets are the scaled outputs of
# the classical pipeline. 500 is only the upper bound on epochs.
history = autoencoder.fit(
    X_train_in,
    X_train_tgt,
    validation_data=(X_val_in, X_val_tgt),
    epochs=500,
    batch_size=30,
    callbacks=[early_stop],
    verbose=1,
)

# =========================================================
# 6. Aplicar AE ao conjunto A (para comparar com padrão)
# =========================================================

# Scale every spectrum of A with the input statistics and add the channel axis.
X_in_A_scaled_all = np.expand_dims((X_cut_A - mean_in) / std_in, axis=-1)

# Predict, then drop the channel axis again.
X_AE_A_scaled = autoencoder.predict(X_in_A_scaled_all)[..., 0]

# Undo the target z-score so the AE output is on the same scale as X_target_A.
X_AE_A = mean_tgt + std_tgt * X_AE_A_scaled

# =========================================================
# 7. Aplicar AE ao conjunto B (apenas recorte + AE, sem padrão clássico)
# =========================================================

# Recorta B para 1800–400 cm-1
# Crop subset B to the same 400-1800 cm-1 window used for A.
_, X_cut_B = cut_region(wavenumbers_full, X_raw_B, wn_min=400, wn_max=1800)

# Reuse the input statistics computed on A, then add the channel axis.
X_in_B_scaled = np.expand_dims((X_cut_B - mean_in) / std_in, axis=-1)

# Predict, drop the channel axis, and map back with A's target statistics so
# the outputs for B are on the same scale as those for A.
X_AE_B_scaled = autoencoder.predict(X_in_B_scaled)[..., 0]
X_AE_B = mean_tgt + std_tgt * X_AE_B_scaled

# =========================================================
# 8. Visualização em alguns exemplos de A
# =========================================================

def plot_before_after_A(idx_local, title_prefix="Exemplo A"):
    """Overlay the three versions of one spectrum from subset A.

    Plots, against ``wn_cut``, row ``idx_local`` of the module-level arrays
    ``X_cut_A`` (cropped raw), ``X_target_A`` (classical pipeline) and
    ``X_AE_A`` (autoencoder output), and shows the figure.

    Parameters
    ----------
    idx_local : int
        Row index within subset A (not within the full data set).
    title_prefix : str
        Text placed at the start of the figure title.
    """
    # Gather everything first so an invalid index fails before a figure opens.
    curves = (
        (X_cut_A[idx_local, :], "Bruto (cortado)", 0.6),
        (X_target_A[idx_local, :], "Padrão (SG+ASL+SNV)", 0.8),
        (X_AE_A[idx_local, :], "AE (saída)", 0.8),
    )
    lab = labels_A[idx_local]

    plt.figure(figsize=(8, 4))
    for values, legend_text, opacity in curves:
        plt.plot(wn_cut, values, label=legend_text, alpha=opacity)
    plt.xlabel("Número de onda (cm$^{-1}$)")
    plt.ylabel("Intensidade")
    plt.title(f"{title_prefix} idx_local={idx_local}, classe={lab}")
    plt.legend()
    plt.tight_layout()
    plt.show()

# Fixed seed so the same example rows are drawn on every run.
np.random.seed(42)
n_rows_A = X_cut_A.shape[0]
# At most five distinct rows (fewer if A itself is smaller).
idx_rand_A = np.random.choice(n_rows_A, size=min(5, n_rows_A), replace=False)
for example_row in idx_rand_A:
    plot_before_after_A(example_row)

# =========================================================
# 9. Salvar resultados
# =========================================================

# Crie o cabeçalho para os arquivos de dados
# Column names shared by the three data files: label column + spectral axis.
header_for_data_files = ['Classe', *wn_cut]

# Subset A, classical pipeline output (training targets).
df_A_padr = pd.DataFrame(
    np.column_stack([labels_A, X_target_A]), columns=header_for_data_files
)
df_A_padr.to_csv("A_padroes.csv", index=False, header=True)

# Subset A, autoencoder output.
df_A_AE = pd.DataFrame(
    np.column_stack([labels_A, X_AE_A]), columns=header_for_data_files
)
df_A_AE.to_csv("A_AE.csv", index=False, header=True)

# Subset B, autoencoder output only.
df_B_AE = pd.DataFrame(
    np.column_stack([labels_B, X_AE_B]), columns=header_for_data_files
)
df_B_AE.to_csv("B_AE.csv", index=False, header=True)

# Cropped spectral axis written as a single header-less row.
pd.DataFrame([wn_cut]).to_csv("wavenumbers_cut.csv", index=False, header=False)

for message in (
    "Arquivos salvos:",
    "  A_padroes.csv   -> padrão ouro (pipeline clássico) para conjunto A com números de onda no cabeçalho",
    "  A_AE.csv        -> saída do AE para conjunto A com números de onda no cabeçalho",
    "  B_AE.csv        -> saída do AE para conjunto B (aplicação automática) com números de onda no cabeçalho",
    "  wavenumbers_cut.csv -> eixos espectrais 400–1800 cm-1",
):
    print(message)

In [None]:
# One table sharing the cropped spectral axis, with four columns per plotted
# example: label (repeated on every row), cropped raw spectrum, classical
# pipeline output and autoencoder output.
combined_data = pd.DataFrame({'wavenumber': wn_cut})

for example_no, idx_local in enumerate(idx_rand_A, start=1):
    combined_data[f'label_example_{example_no}'] = labels_A[idx_local]
    combined_data[f'raw_cut_example_{example_no}'] = X_cut_A[idx_local, :]
    combined_data[f'standard_example_{example_no}'] = X_target_A[idx_local, :]
    combined_data[f'ae_output_example_{example_no}'] = X_AE_A[idx_local, :]

# Save to CSV
output_csv_path = "comparison_examples.csv"
combined_data.to_csv(output_csv_path, index=False)

# Report the real number of examples: idx_rand_A holds min(5, n_rows_A)
# entries, so a hard-coded "5" would be wrong for a very small subset A.
print(f"Dados dos {len(idx_rand_A)} exemplos salvos em '{output_csv_path}'")

In [None]:
# idx_rand_A indexes rows of subset A; idx_A maps those back to rows of the
# full (uncropped) data matrix.
original_indices_for_random_A = idx_A[idx_rand_A]

X_raw_selected_full = X_raw_full[original_indices_for_random_A, :]
labels_selected_full = labels_all[original_indices_for_random_A]

# Two columns per example on the full spectral axis: label (repeated on
# every row) and the complete raw spectrum.
combined_raw_full_data = pd.DataFrame({'wavenumber_full': wavenumbers_full})

for example_no, (lab, spectrum) in enumerate(
    zip(labels_selected_full, X_raw_selected_full), start=1
):
    combined_raw_full_data[f'label_example_{example_no}'] = lab
    combined_raw_full_data[f'raw_full_example_{example_no}'] = spectrum

output_csv_path_raw_full = "comparison_raw_full_examples.csv"
combined_raw_full_data.to_csv(output_csv_path_raw_full, index=False)

# Report the real number of examples instead of a hard-coded "5"; fewer are
# selected when subset A has fewer than five spectra.
print(f"Dados dos {len(labels_selected_full)} exemplos brutos completos salvos em '{output_csv_path_raw_full}'")

1. Dividir em dois subconjuntos: A (treino) e B (aplicação)

Os dados brutos são divididos em dois subconjuntos: 'A' e 'B'.

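*   **Conjunto A (70%)**: Será usado para treinar o Autoencoder. Todos os espectros deste conjunto são processados pelo método clássico (corte, Savitzky–Golay, correção de linha de base e SNV) para servir de 'padrão ouro' (alvo) para o AE; em seguida, o conjunto é subdividido em treino, validação e teste.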
*   **Conjunto B (30%)**: Simula dados novos/desconhecidos, onde o Autoencoder treinado será aplicado para realizar o pré-processamento de forma automática e rápida, sem a necessidade de aplicar o pipeline clássico.

2. Salvar resultados

Finalmente, os resultados são salvos em arquivos CSV:

*   `A_padroes.csv`: Contém os rótulos e os espectros do conjunto A processados pelo pipeline clássico (o padrão ouro).
*   `A_AE.csv`: Contém os rótulos e os espectros do conjunto A processados pelo Autoencoder.
*   `B_AE.csv`: Contém os rótulos e os espectros do conjunto B processados pelo Autoencoder.
*   `wavenumbers_cut.csv`: Salva os números de onda correspondentes aos espectros cortados.
*   `comparison_examples.csv`: Salva os dados utilizados nos exemplos (com corte espectral no raw).
*   `comparison_raw_full_examples.csv`: Salva os dados utilizados nos exemplos (sem corte espectral no raw).

#==============================================================================

1. Split into two subsets: A (training) and B (application)

The raw data is divided into two subsets: 'A' and 'B'.

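- **Subset A (70%)**: Will be used to train the Autoencoder. All spectra in this subset are processed using the classical method (cropping, Savitzky–Golay, baseline correction and SNV) to serve as the "gold standard" (target) for the AE; the subset is then further split into training, validation and test parts.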
- **Subset B (30%)**: Simulates new/unseen data, where the trained Autoencoder will be applied to perform preprocessing automatically and quickly, without the need to run the classical pipeline.

2. Save results

Finally, the results are saved as CSV files:

- `A_padroes.csv`: Contains the labels and spectra of subset A processed by the classical pipeline (the gold standard).
- `A_AE.csv`: Contains the labels and spectra of subset A processed by the Autoencoder.
- `B_AE.csv`: Contains the labels and spectra of subset B processed by the Autoencoder.
- `wavenumbers_cut.csv`: Saves the wavenumbers corresponding to the cut spectra.
- `comparison_examples.csv`: Saves the data used in the examples (with spectral cut on the raw data).
- `comparison_raw_full_examples.csv`: Saves the data used in the examples (without spectral cut on the raw data).
