# **Data Loading**

<div style="margin-top: 10px; text-align: justify;">This code loads and organizes raw LIBS spectral data from a batch folder in which each sample has its own subfolder of .txt files. Each file contains the wavelength and intensity values of one measurement.

What the code does:

- Searches each sample folder for .txt files

- Reads and checks that the files are in the correct format

- Sorts the files in natural order (e.g., G7 before G11)

- Combines the data into one large table (DataFrame) with:

  - Wavelengths as rows

  - Sample measurements as columns

- Fills in any missing data using linear interpolation

Returns:

- A list of all sample names

- A clean, ready-to-use DataFrame of intensity values</div>

In [None]:
def natural_key(text):
    """Return a sort key that orders embedded numbers numerically (e.g. 'G7' before 'G11').

    The text is split into alternating text / ASCII-digit chunks. Text chunks
    are lower-cased so the ordering is case-insensitive; digit chunks become
    integers so that 7 sorts before 11.

    Args:
        text: The string to build a key for.

    Returns:
        A list that alternates ``str`` and ``int`` items, always starting with
        a (possibly empty) ``str``.
    """
    # With a capturing group, re.split puts the matched digit runs at the odd
    # positions. Deciding by position rather than str.isdigit() keeps Unicode
    # digits such as '²' (which int() rejects) as plain text, and guarantees
    # every key has the same str/int layout so keys are always comparable.
    chunks = re.split(r'([0-9]+)', text)
    return [int(chunk) if pos % 2 else chunk.lower() for pos, chunk in enumerate(chunks)]

def _read_measurement(file_path):
    """Read one ';'-separated spectrum file into an intensity Series indexed by wavelength.

    Returns:
        A numeric ``pandas.Series`` named 'intensity' whose index is named
        'wavelength', or ``None`` (after printing the reason) when the file
        cannot be read or holds no usable two-column numeric data.
    """
    try:
        # Read without forcing column names so the column count reflects what
        # is actually in the file and the format check below is meaningful.
        df = pd.read_csv(file_path, delimiter=';', header=None)
    except Exception as e:
        # Best-effort loading: report the problem and carry on with other files.
        print(f"Error reading {file_path}: {e}")
        return None

    if df.empty or df.shape[1] != 2:
        print(f"Invalid format in {file_path}. Skipping.")
        return None

    df.columns = ['wavelength', 'intensity']

    # Header lines or stray text become NaN instead of breaking the later
    # sorting and interpolation steps.
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.dropna(subset=['wavelength'])
    if df['intensity'].isna().all():
        print(f"Invalid format in {file_path}. Skipping.")
        return None

    series = df.set_index('wavelength')['intensity']

    # Repeated wavelengths would make the column-wise alignment fail.
    repeated = series.index.duplicated(keep='first')
    if repeated.any():
        print(f"Duplicate wavelengths in {file_path}; keeping the first occurrence.")
        series = series[~repeated]

    return series


def load_data(batch_path):
    """Load every measurement below ``batch_path`` into one wide DataFrame.

    Each sub-directory of ``batch_path`` is treated as a sample and each
    ``.txt`` file inside it as one measurement with ';'-separated
    wavelength/intensity rows. Unreadable or malformed files are reported
    with ``print`` and skipped.

    Args:
        batch_path: Directory that contains one sub-directory per sample.

    Returns:
        A tuple ``(samples, result_df)``:

        * ``samples`` - naturally sorted names of all sample sub-directories,
          including those for which no valid file was found.
        * ``result_df`` - DataFrame with a 'wavelength' column (ascending
          union of all wavelengths seen) followed by one
          '<sample>_<file stem>' column per measurement in natural order.
          Gaps are filled by linear interpolation in both directions.
    """
    samples = sorted(
        (
            entry for entry in os.listdir(batch_path)
            if os.path.isdir(os.path.join(batch_path, entry))
        ),
        key=natural_key,
    )

    # (column name, intensity Series) for every usable measurement.
    measurements = []

    for sample in samples:
        sample_path = os.path.join(batch_path, sample)
        txt_files = sorted(
            (f for f in os.listdir(sample_path) if f.endswith('.txt')),
            key=natural_key,
        )

        loaded_any = False
        for file_name in txt_files:
            series = _read_measurement(os.path.join(sample_path, file_name))
            if series is None:
                continue
            base_name = os.path.splitext(file_name)[0]
            measurements.append((f"{sample}_{base_name}", series))
            loaded_any = True

        if not loaded_any:
            print(f"No valid data found for sample '{sample}'.")

    if not measurements:
        # Nothing usable: same shape as a populated result, just without rows.
        return samples, pd.DataFrame(columns=['wavelength'])

    # One stable sort fixes the final column order for the whole table.
    measurements.sort(key=lambda pair: natural_key(pair[0]))

    # A single concat aligns all measurements on the union of wavelengths and
    # avoids re-copying the growing table once per sample.
    result_df = pd.concat([series for _, series in measurements], axis=1)
    result_df.columns = [name for name, _ in measurements]
    result_df = result_df.sort_index()

    # NOTE(review): method='linear' treats rows as equally spaced and ignores
    # the wavelength values themselves; kept unchanged to preserve existing
    # results. Consider method='index' if the wavelength grids differ.
    result_df = result_df.interpolate(method='linear', limit_direction='both')

    # Name the index explicitly so reset_index yields the 'wavelength' column
    # without depending on how pandas labels an unnamed index.
    result_df.index.name = 'wavelength'
    return samples, result_df.reset_index()

# --- Example run ---
# Root folder of the measurement batch; point this at your own data location.
BASE_PATH = 'C:/Users/KA/OneDrive/Desktop/BAMAN/28.01.2025 BAM-Alu'

# Load every sample sub-folder found under BASE_PATH.
SAMPLES, SAMPLE_Raw_df = load_data(batch_path=BASE_PATH)

print("Loaded Samples:", SAMPLES)
# Optional follow-ups (uncomment to preview or export the table):
# SAMPLE_Raw_df.head()
# SAMPLE_Raw_df.to_csv("LIBS_Raw_df.csv", index=False)