<a href="https://colab.research.google.com/github/Raoina/Spectra-2-Image/blob/main/notebooks/processing_dataset/Full_Preprocessing_Pipeline_(Transmission_%E2%86%92_Absorbance_%2B_Savitzky_Golay_%2B_Auto_scaling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ====== 2. Preprocessing (Full Dataset, No Split or Shuffle, Keep Headers) ==================
import numpy as np
import pandas as pd
import json
from scipy.signal import savgol_filter

# ---------- Step 1: Transmission → Absorbance ----------
def transmission_to_absorbance(X):
    """Convert transmission values to absorbance, ``A = log10(1 / T)``.

    Values below ``1e-10`` (including zero and negatives) are raised to
    ``1e-10`` first so the logarithm is always defined; such entries therefore
    map to an absorbance of about 10.

    Args:
        X: Array-like of transmission values.

    Returns:
        Absorbance values with the same shape as ``X``.
    """
    floor = 1e-10
    bounded = np.clip(X, a_min=floor, a_max=None)
    inverse = 1 / bounded
    return np.log10(inverse)

# ---------- Step 2: Savitzky–Golay Derivative ----------
def savgol_derivative(X, win=9, poly=7, deriv=1):
    """Apply a Savitzky–Golay derivative filter along each row of ``X``.

    The window is made odd (an even ``win`` is bumped by one). If it is wider
    than the number of columns, it is shrunk to an odd length that fits, and
    the polynomial order is lowered to stay strictly below the shrunken
    window so the filter remains valid.

    Args:
        X: 2-D array; the filter runs along axis 1 (``X.shape[1]`` columns).
        win: Requested window length.
        poly: Requested polynomial order. Only reduced when the window had to
            be shrunk; an explicitly invalid ``win``/``poly`` pair on wide
            input is still rejected by ``savgol_filter``.
        deriv: Order of the derivative to compute.

    Returns:
        Array with the same shape as ``X`` holding the filtered derivative.

    Raises:
        ValueError: If ``X`` has too few columns for even the minimum
            window of 3, or if ``savgol_filter`` rejects the parameters.
    """
    n_cols = X.shape[1]
    if win % 2 == 0:
        win += 1
    if win > n_cols:
        # Shrink to an odd window below the column count, never under 3.
        win = n_cols - 1 if (n_cols - 1) % 2 == 1 else n_cols - 2
        if win < 3:
            win = 3
        if win > n_cols:
            raise ValueError(
                "Savitzky-Golay filtering needs at least 3 columns, "
                f"got {n_cols}"
            )
        # savgol_filter requires polyorder < window_length; without this the
        # default poly=7 makes every shrunken window (<= 7) fail.
        poly = min(poly, win - 1)
    return savgol_filter(X, window_length=win, polyorder=poly, deriv=deriv, axis=1)

# ---------- Step 3: Auto-scaling ----------
def auto_scale(X, mean=None, std=None):
    """Standardise ``X`` column-wise: ``(X - mean) / std``.

    Args:
        X: Array of values to scale.
        mean: Optional precomputed mean. Used only when ``std`` is also given.
        std: Optional precomputed standard deviation. Used only when ``mean``
            is also given.

    If either ``mean`` or ``std`` is ``None``, both are recomputed from ``X``
    along axis 0. Zero standard deviations are replaced by ``1e-10`` to avoid
    division by zero.

    Returns:
        Tuple ``(scaled, mean, std)`` where ``std`` is the value actually
        used for the division (zeros already replaced).
    """
    if mean is None or std is None:
        mean = np.mean(X, axis=0)
        std = np.std(X, axis=0)
    if np.isscalar(std):
        if std == 0:
            std = 1e-10
    else:
        # Work on a float copy: never mutate the caller's array, and keep
        # 1e-10 from truncating to 0 when an integer array is supplied.
        std_values = np.asarray(std, dtype=float)
        std = np.where(std_values == 0, 1e-10, std_values)
    return (X - mean) / std, mean, std

# ---------- Main Preprocessing Function ----------
def preprocess_full_absorbance_sg_autoscale_no_snv(X, y):
    """Run the full preprocessing chain on spectra ``X`` and targets ``y``.

    Both inputs are converted to float arrays; a 1-D ``y`` becomes a single
    column. ``X`` then goes through transmission → absorbance, the
    Savitzky–Golay derivative (with ``savgol_derivative``'s default
    settings) and auto-scaling. ``y`` is only auto-scaled.

    Args:
        X: Array-like of transmission spectra, one sample per row.
        y: Array-like of target values.

    Returns:
        Tuple ``(X_scaled, y_scaled, mean_x, std_x, mean_y, std_y)``; the
        mean/std values are the statistics used to scale ``X`` and ``y``.
    """
    spectra = np.array(X, dtype=float)
    targets = np.array(y, dtype=float)
    if targets.ndim == 1:
        # Treat a flat target vector as one column.
        targets = targets.reshape(-1, 1)

    # Spectra: absorbance -> derivative -> standardise.
    derivative = savgol_derivative(transmission_to_absorbance(spectra))
    x_result = auto_scale(derivative)

    # Targets: standardise only.
    y_result = auto_scale(targets)

    return x_result[0], y_result[0], *x_result[1:], *y_result[1:]


# ================== Example Usage ==================
# Load the spectra and target tables, preprocess them, and write the results
# back to CSV under the original column headers.
# ---- Replace these paths with your actual data ----
X = pd.read_csv("/content/spectra_train_488.csv")
y = pd.read_csv("/content/target_train_488.csv")

# Keep the original headers; preprocessing returns plain arrays without them.
x_headers = X.columns
y_headers = y.columns

# Run preprocessing. Every column of both tables is converted to float inside
# the function. The returned mean/std values stay in memory only; this cell
# does not write them to disk.
X_proc, y_proc, mean_x, std_x, mean_y, std_y = preprocess_full_absorbance_sg_autoscale_no_snv(X, y)

# ---------- Save preprocessed data with original headers ----------
# Relative paths, so the files land in the current working directory;
# index=False leaves the row index out of the CSV.
pd.DataFrame(X_proc, columns=x_headers).to_csv("X_preprocessed_no_snv.csv", index=False)
pd.DataFrame(y_proc, columns=y_headers).to_csv("y_preprocessed_no_snv.csv", index=False)

# Report the files that were written.
print("\n✅ Full dataset preprocessed and saved successfully (headers kept):")
print("• X_preprocessed_no_snv.csv")
print("• y_preprocessed_no_snv.csv")


✅ Full dataset preprocessed and saved successfully (headers kept):
• X_preprocessed_no_snv.csv
• y_preprocessed_no_snv.csv
