### Imports

In [2]:
import os
import sys
import warnings
from pathlib import Path
import time

# import threading
# import multiprocessing
import concurrent.futures

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dask.dataframe as dd

from scipy.signal import find_peaks

# import requests
# from bs4 import BeautifulSoup

# Directory the notebook/kernel was started from.
CURRENT_DIR = Path(os.getcwd())

# Project root is taken to be the parent of the working directory
# (this only computes a path; the working directory itself is not changed).

ROOT_DIR = CURRENT_DIR.parents[0]  # Adjust the number based on your folder structure

# Input folder holding the raw spectra CSVs and output folder for exports.
# Both are created on import/run if they do not exist yet.
RAW_FILE_PATH = ROOT_DIR / "wavelength_spectra_files"
RAW_FILE_PATH.mkdir(parents=True, exist_ok=True)
EXPORTS_FILE_PATH = ROOT_DIR / "exports"
EXPORTS_FILE_PATH.mkdir(parents=True, exist_ok=True)

# Add the root directory to the system path so modules located there can be imported
# NOTE(review): re-running this cell appends the same entry again.

sys.path.append(str(ROOT_DIR))

# Input Raw File and Decoder File

In [3]:
# Wafer codes to process; raw files are matched by checking that the code
# appears in the file name.
wafer_codes = ["QCHZZ", "QCI1M"]  # List of wafer codes
# Silences every Python warning for the rest of the session.
# NOTE(review): this also hides numpy warnings (e.g. log10 of zero/negative
# intensities) raised during the dB conversion further down.
warnings.filterwarnings("ignore")

# Prefix used for the names of the exported CSV files.
ANALYSIS_RUN_NAME = "SMSR_QC_FIRSTRUN"

# Decoder CSV file names, looked up under ROOT_DIR / "decoders" by the main loop.
# The main loop selects SUBARU_DECODER for product code "QC" and HALO_DECODER otherwise.
SUBARU_DECODER = "QC WAFER_LAYOUT 24Dec.csv"
HALO_DECODER = "HALO_DECODER_NE-rev1_1 logic_coords_annotated.csv"


def find_files_with_wafer_codes(wafer_codes, raw_file_path):
    """
    Return the paths of all files under ``raw_file_path`` whose name contains a wafer code.

    Args:
        wafer_codes: Iterable of wafer code strings to look for.
        raw_file_path: Directory that is searched recursively.

    Returns:
        A list of ``Path`` objects. Results are grouped by wafer code in the order
        the codes were given; within one code, files appear in directory-walk order.
        A file whose name contains several codes is listed once per matching code.
    """
    # Walk the tree a single time and reuse the listing for every wafer code,
    # instead of re-scanning the whole directory once per code.
    candidates = [
        (Path(folder), filename)
        for folder, _subdirs, filenames in os.walk(raw_file_path)
        for filename in filenames
    ]

    return [
        folder / filename
        for code in wafer_codes
        for folder, filename in candidates
        if code in filename
    ]


# Collect the raw spectra files for the configured wafer codes and show them.
# `file_paths` is reused by the processing cells below.

file_paths = find_files_with_wafer_codes(wafer_codes, RAW_FILE_PATH)

print(file_paths)

[WindowsPath('c:/Users/762093/OneDrive - Seagate Technology/Documents/LIV/vadankhan-wavelength-spectra-analyser/wavelength_spectra_files/QCHZZ_spectra.csv'), WindowsPath('c:/Users/762093/OneDrive - Seagate Technology/Documents/LIV/vadankhan-wavelength-spectra-analyser/wavelength_spectra_files/QCI1M_spectra.csv')]


# Transform Data

### Transform

In [4]:
# Decoder loading function
def load_decoder(decoder_file_path):
    """
    Load the decoder CSV as a DataFrame indexed by (Logic_X, Logic_Y).

    Only the Logic_X, Logic_Y, TE_LABEL and TYPE columns are read. The elapsed
    load time is printed. If the file does not exist, a message is printed and
    an empty DataFrame is returned instead.
    """
    print(f"Loading decoder from: {decoder_file_path}")
    started = time.time()

    if decoder_file_path.exists():
        coordinate_keys = ["Logic_X", "Logic_Y"]
        decoder = pd.read_csv(decoder_file_path, usecols=coordinate_keys + ["TE_LABEL", "TYPE"])
        # Index by the logical coordinates so callers can join on them directly.
        decoder = decoder.set_index(coordinate_keys)

        finished = time.time()
        print(f"Loaded in {finished - started:.2f} seconds.\n")
        return decoder

    print(f"Decoder file not found at {decoder_file_path}")
    return pd.DataFrame()


# Function to transform raw file and merge with decoder data
def transform_raw_file(filepath, wafer_id, decoder_df, wavelength_lb=824, wavelength_ub=832, chunksize=1000, max_chunks=400):
    """
    Stream a wide-format raw spectra CSV as long-format chunks joined with decoder data.

    The header is inspected for "Intensity_<wavelength>" columns; only those whose
    wavelength lies within [wavelength_lb, wavelength_ub] are read. At most
    ``max_chunks`` chunks of ``chunksize`` rows are processed.

    Yields:
        (long_df, n_selected) tuples, where long_df has the columns
        TYPE, TE_LABEL, Wavelength, Intensity (one row per coordinate/wavelength
        pair, TYPE/TE_LABEL looked up from ``decoder_df`` via a left join on X, Y)
        and n_selected is the number of selected intensity columns.
    """
    print(f"Starting file transformation for {wafer_id}...")
    started = time.time()

    # Peek at the header only, to learn which intensity columns exist.
    header = pd.read_csv(filepath, nrows=1).columns
    wavelength_by_column = {name: float(name.split("_")[1]) for name in header if name.startswith("Intensity_")}
    kept_columns = [name for name, wl in wavelength_by_column.items() if wavelength_lb <= wl <= wavelength_ub]
    n_selected = len(kept_columns)

    output_columns = ["TYPE", "TE_LABEL", "Wavelength", "Intensity"]

    with pd.read_csv(filepath, chunksize=chunksize, usecols=["X", "Y"] + kept_columns) as reader:
        for chunk_no, wide in enumerate(reader):
            if chunk_no >= max_chunks:
                break

            # Wide -> long: one row per (X, Y, wavelength column).
            tall = wide.melt(id_vars=["X", "Y"], value_vars=kept_columns, var_name="Wavelength", value_name="Intensity")
            # Replace the column label ("Intensity_xxx") by its numeric wavelength.
            tall["Wavelength"] = tall["Wavelength"].map(wavelength_by_column)

            # Left join keeps measurements whose coordinates are absent from the decoder.
            tall = tall.merge(decoder_df, left_on=["X", "Y"], right_index=True, how="left")

            # Selecting the output columns also discards the raw X/Y coordinates.
            yield tall[output_columns], n_selected

    finished = time.time()
    print(f"File transformation for {wafer_id} completed in {finished - started:.2f} seconds.")


def extract_top_two_peaks(df_group):
    """
    Detect the two strongest local maxima of a spectrum and derive the SMSR.

    Args:
        df_group: DataFrame for a single spectrum with the columns
            "Wavelength", "Intensity" (linear) and "dB_Intensity".

    Returns:
        pd.Series with the keys highest_peak_wavelength, highest_peak_intensity_linear,
        second_peak_wavelength, second_peak_intensity_linear, SMSR_dB and SMSR_linear.
        Entries that cannot be determined (no peak, or only one peak) are NaN.

    Notes:
        - SMSR_dB is the dB difference between the strongest and second-strongest
          detected peaks; SMSR_linear is the ratio of their linear intensities.
        - The difference is taken explicitly rather than assuming the strongest peak
          sits at 0 dB: find_peaks never reports the first or last sample, so when the
          spectrum maximum lies on the edge of the wavelength window the strongest
          detected peak is below 0 dB. When it is at 0 dB the result is unchanged.
    """
    df_sorted = df_group.sort_values("Wavelength")
    intensities = df_sorted["Intensity"].values
    dB_intensities = df_sorted["dB_Intensity"].values
    wavelengths = df_sorted["Wavelength"].values

    # Start from an all-NaN result and fill in whatever can be determined.
    result = {
        "highest_peak_wavelength": np.nan,
        "highest_peak_intensity_linear": np.nan,
        "second_peak_wavelength": np.nan,
        "second_peak_intensity_linear": np.nan,
        "SMSR_dB": np.nan,
        "SMSR_linear": np.nan,
    }

    peak_indices, _ = find_peaks(dB_intensities)
    if len(peak_indices) == 0:
        return pd.Series(result)

    # Peak positions ordered by dB intensity, strongest first.
    ranked_peaks = peak_indices[np.argsort(dB_intensities[peak_indices])[::-1]]

    highest_idx = ranked_peaks[0]
    result["highest_peak_wavelength"] = wavelengths[highest_idx]
    result["highest_peak_intensity_linear"] = intensities[highest_idx]

    if len(ranked_peaks) > 1:
        second_idx = ranked_peaks[1]
        result["second_peak_wavelength"] = wavelengths[second_idx]
        result["second_peak_intensity_linear"] = intensities[second_idx]
        result["SMSR_dB"] = dB_intensities[highest_idx] - dB_intensities[second_idx]
        result["SMSR_linear"] = intensities[highest_idx] / intensities[second_idx]

    return pd.Series(result)


def process_export_and_peaks(filepath, wafer_code, decoder_df):
    """
    Stream one raw spectra file, export each TE_LABEL's spectrum with a dB column,
    and append its two-peak summary to a per-wafer CSV.

    Rows yielded by transform_raw_file are buffered per TE_LABEL. As soon as a label
    has collected at least as many rows as there are selected wavelength columns, its
    buffered rows are concatenated, normalised to their own maximum intensity
    (dB_Intensity = 10 * log10(I / max(I))), appended to the spectra export, passed to
    extract_top_two_peaks, and then released from the buffer.

    Both output files are written to EXPORTS_FILE_PATH and prefixed with
    ANALYSIS_RUN_NAME. Each file is opened in write mode for the first write of this
    call and in append mode afterwards. Labels that are still below the threshold when
    the input ends are not written.
    """
    print(f"\n=== Starting processing for {wafer_code} ===")
    run_started = time.time()

    # Destination files for the formatted spectra and the peak summary
    spectra_output_path = EXPORTS_FILE_PATH / f"{ANALYSIS_RUN_NAME}_{wafer_code}_spectra_formatted.csv"
    peak_output_path = EXPORTS_FILE_PATH / f"{ANALYSIS_RUN_NAME}_{wafer_code}_peaks_summary.csv"
    spectra_needs_header = True
    peaks_need_header = True

    # Per-label buffers: list of partial frames and running row count
    buffered_frames = {}
    buffered_rows = {}

    for chunk, rows_required in transform_raw_file(filepath, wafer_code, decoder_df):
        for te_label, rows in chunk.groupby("TE_LABEL"):
            buffered_frames.setdefault(te_label, []).append(rows)
            buffered_rows[te_label] = buffered_rows.get(te_label, 0) + len(rows)

            # Keep buffering until the label has a full spectrum's worth of rows.
            if buffered_rows[te_label] < rows_required:
                continue

            spectrum = pd.concat(buffered_frames[te_label], ignore_index=True)

            # Normalise to the label's own maximum so its strongest sample is 0 dB.
            peak_intensity = spectrum["Intensity"].max()
            spectrum["dB_Intensity"] = 10 * np.log10(spectrum["Intensity"] / peak_intensity)

            # Append this label's spectrum to the export (header only on the first write).
            spectra_mode = "w" if spectra_needs_header else "a"
            spectrum.to_csv(spectra_output_path, mode=spectra_mode, header=spectra_needs_header, index=False)
            spectra_needs_header = False

            # Peak detection, written as a single summary row for this label.
            summary = extract_top_two_peaks(spectrum)
            summary["TE_LABEL"] = te_label
            peaks_mode = "w" if peaks_need_header else "a"
            pd.DataFrame([summary]).to_csv(peak_output_path, mode=peaks_mode, header=peaks_need_header, index=False)
            peaks_need_header = False

            # Release the buffers so memory does not grow with the file size.
            del buffered_frames[te_label]
            del buffered_rows[te_label]

    run_finished = time.time()
    print(f"=== Completed processing {wafer_code} in {run_finished - run_started:.2f} seconds ===")


# Main execution
start_total_time = time.time()
print("\n=== Starting full processing run ===\n")

print(EXPORTS_FILE_PATH)

# Pair every wafer code with the file(s) whose name contains it. Zipping
# file_paths with wafer_codes silently mis-pairs them as soon as one code has
# no file or more than one file; matching by name keeps file and code aligned.
wafer_inputs = [(fp, code) for code in wafer_codes for fp in file_paths if code in Path(fp).name]

for filepath, wafer_code in wafer_inputs:
    # Extract product code from the first two characters of the wafer code
    product_code = wafer_code[:2]

    print(f"\n--- Processing wafer: {wafer_code} (Product: {product_code}) ---")

    # Select decoder file based on product code
    if product_code == "QC":
        decoder_path = ROOT_DIR / "decoders" / SUBARU_DECODER
    elif product_code in ("QD", "NV"):
        # Membership test: `product_code == "QD" or "NV"` is always truthy, which
        # made the unsupported-product branch below unreachable.
        decoder_path = ROOT_DIR / "decoders" / HALO_DECODER
    else:
        print(f"Unsupported product code: {product_code}")
        continue

    # Load decoder and process
    decoder_dict = load_decoder(decoder_path)
    process_export_and_peaks(filepath, wafer_code, decoder_dict)


end_total_time = time.time()
total_time = end_total_time - start_total_time


print(f"\n=== Total processing time: {total_time:.2f} seconds ===\n")


=== Starting full processing run ===

c:\Users\762093\OneDrive - Seagate Technology\Documents\LIV\vadankhan-wavelength-spectra-analyser\exports

--- Processing wafer: QCHZZ (Product: QC) ---
Loading decoder from: c:\Users\762093\OneDrive - Seagate Technology\Documents\LIV\vadankhan-wavelength-spectra-analyser\decoders\QC WAFER_LAYOUT 24Dec.csv
Loaded in 0.88 seconds.


=== Starting processing for QCHZZ ===
Starting file transformation for QCHZZ...
File transformation for QCHZZ completed in 2733.49 seconds.
=== Completed processing QCHZZ in 2733.49 seconds ===

--- Processing wafer: QCI1M (Product: QC) ---
Loading decoder from: c:\Users\762093\OneDrive - Seagate Technology\Documents\LIV\vadankhan-wavelength-spectra-analyser\decoders\QC WAFER_LAYOUT 24Dec.csv
Loaded in 0.60 seconds.


=== Starting processing for QCI1M ===
Starting file transformation for QCI1M...
File transformation for QCI1M completed in 2481.76 seconds.
=== Completed processing QCI1M in 2481.76 seconds ===

=== Total 

### Experimental: Sort in Python (slower than JMP)

In [None]:
# Sort final output file by TE_LABEL
def sort_large_csv_with_dask(input_path, output_path):
    """
    Sort a CSV by TE_LABEL and then Wavelength using Dask and write the result.

    The input is read with dask.dataframe, sorted on ["TE_LABEL", "Wavelength"],
    and written to ``output_path`` with ``single_file=True`` and without the index
    column. Start and elapsed-time messages are printed.
    """
    print(f"Starting Dask sort for: {input_path}")
    t_begin = time.time()

    sort_keys = ["TE_LABEL", "Wavelength"]

    # assume_missing=True is safe for mixed data
    unsorted = dd.read_csv(input_path, assume_missing=True)
    ordered = unsorted.sort_values(by=sort_keys)

    # single_file=True requests one output CSV rather than one file per partition
    ordered.to_csv(output_path, index=False, single_file=True)

    t_end = time.time()
    print(f"Sorting and export completed in {t_end - t_begin:.2f} seconds.")


# Sort the formatted spectra export of every configured wafer.
# Reads  <ANALYSIS_RUN_NAME>_<wafer_code>_spectra_formatted.csv and
# writes <ANALYSIS_RUN_NAME>_<wafer_code>_sorted.csv, both in EXPORTS_FILE_PATH.
for wafer_code in wafer_codes:
    input_csv = EXPORTS_FILE_PATH / f"{ANALYSIS_RUN_NAME}_{wafer_code}_spectra_formatted.csv"
    output_csv = EXPORTS_FILE_PATH / f"{ANALYSIS_RUN_NAME}_{wafer_code}_sorted.csv"
    sort_large_csv_with_dask(input_csv, output_csv)

### Experimental: Threading Attempt

In [None]:
# Run name for the threading experiment.
# NOTE: this rebinds the module-level ANALYSIS_RUN_NAME, so any earlier cell
# re-run afterwards will also use "larger" as its export prefix.
ANALYSIS_RUN_NAME = "larger"

# Same locations as in the imports cell, re-assigned here (no mkdir in this cell).
RAW_FILE_PATH = ROOT_DIR / "wavelength_spectra_files"
EXPORTS_FILE_PATH = ROOT_DIR / "exports"


def transform_raw_file(filepath, wafer_id, wavelength_lb=827, wavelength_ub=830, chunksize=1000, max_chunks=1000):
    """
    Stream a wide-format raw spectra CSV as long-format chunks (no decoder join).

    Only "Intensity_<wavelength>" columns whose wavelength lies within
    [wavelength_lb, wavelength_ub] are read, and at most ``max_chunks`` chunks of
    ``chunksize`` rows are processed. ``wafer_id`` is accepted but not used here.

    Yields:
        DataFrames with the columns X, Y, Wavelength (float) and Intensity,
        one row per coordinate/wavelength pair.
    """
    # Only the header row is needed to discover the available columns.
    header = pd.read_csv(filepath, nrows=1).columns

    # Map each intensity column label to the wavelength encoded in its name.
    wavelength_by_column = {name: float(name.split("_")[1]) for name in header if name.startswith("Intensity_")}

    # Keep just the columns inside the requested wavelength window.
    kept_columns = [name for name, wl in wavelength_by_column.items() if wavelength_lb <= wl <= wavelength_ub]

    with pd.read_csv(filepath, chunksize=chunksize, usecols=["X", "Y"] + kept_columns) as reader:
        for chunk_no, wide in enumerate(reader):
            if chunk_no >= max_chunks:
                break  # chunk budget exhausted

            # Wide -> long: one row per (X, Y, wavelength column).
            tall = wide.melt(id_vars=["X", "Y"], value_vars=kept_columns, var_name="Wavelength", value_name="Intensity")

            # Swap the "Intensity_xxx" label for its numeric wavelength.
            tall["Wavelength"] = tall["Wavelength"].map(wavelength_by_column)

            # Hand the chunk to the caller right away rather than collecting chunks.
            yield tall


def process_and_export(filepath, wafer_code, export_path, run_name):
    """
    Transform one raw file chunk by chunk and write the chunks to a single CSV.

    The output file is ``<run_name>_threaded_<wafer_code>_spectra_formatted.csv``
    inside ``export_path``. The first chunk creates the file and writes the header;
    later chunks are appended without a header. Any exception is caught and
    reported with a printed message instead of being propagated.
    """
    try:
        print(f"Processing file: {filepath} for wafer: {wafer_code}")
        destination = export_path / f"{run_name}_threaded_{wafer_code}_spectra_formatted.csv"

        for chunk_no, long_chunk in enumerate(transform_raw_file(filepath, wafer_code)):
            # Only the very first chunk starts a fresh file and carries the header.
            is_first = chunk_no == 0
            long_chunk.to_csv(destination, mode="w" if is_first else "a", header=is_first, index=False)
    except Exception as e:
        print(f"Error processing wafer {wafer_code}: {e}")


def parallel_processing(filepaths, wafer_codes, export_path, run_name):
    """
    Run process_and_export for each (file, wafer code) pair on a thread pool.

    ``filepaths`` and ``wafer_codes`` are paired positionally with zip. Exceptions
    surfacing from a worker are caught and reported with a printed message that
    names the file and wafer code.
    """
    print(f"Starting parallel processing with {len(filepaths)} files.")
    # Thread pool (not a process pool): one task per input file
    with concurrent.futures.ThreadPoolExecutor() as pool:
        job_inputs = {}
        for path, code in zip(filepaths, wafer_codes):
            job = pool.submit(process_and_export, path, code, export_path, run_name)
            job_inputs[job] = (path, code)

        for finished in concurrent.futures.as_completed(job_inputs):
            filepath, wafer_code = job_inputs[finished]
            try:
                finished.result()  # re-raises anything the worker raised
            except Exception as e:
                print(f"Error with file {filepath}, wafer {wafer_code}: {e}")


# CALLING THE CODE
# Runs the threaded export for all discovered files and reports the wall-clock time.
# NOTE(review): the banner below says "multiprocessing", but parallel_processing
# uses a ThreadPoolExecutor (threads, not processes).
if __name__ == "__main__":
    start_total_time = time.time()
    print("\n=== Starting full processing run with multiprocessing ===\n")
    # file_paths and wafer_codes come from the earlier cells and are paired by position.
    parallel_processing(file_paths, wafer_codes, EXPORTS_FILE_PATH, ANALYSIS_RUN_NAME)
    end_total_time = time.time()
    total_time = end_total_time - start_total_time
    print(f"\n=== Total processing time: {total_time:.2f} seconds ===\n")

# Experimental: Spectrum Plotting

In [None]:
def plot_sweep_data(df_raw_sweeps, wafer_code):
    """
    Show four scatter-plot figures of sweep quantities against LDI_mA.

    One figure is drawn for each of the columns PD, DP/DI, Vf and dV/dI (in that
    order). Every figure has three side-by-side panels, one per COD_ROLL_EVAL
    value ("COD", "ROLLOVER", "NO LASER"), each drawn in its own colour.

    Args:
        df_raw_sweeps: DataFrame with the columns COD_ROLL_EVAL, LDI_mA, PD,
            DP/DI, Vf and dV/dI.
        wafer_code: Label shown in every panel title.
    """
    categories = ["COD", "ROLLOVER", "NO LASER"]
    palette = ["red", "blue", "orange"]
    y_columns = ["PD", "DP/DI", "Vf", "dV/dI"]

    for y_column in y_columns:
        # One figure per quantity, one panel per COD_ROLL_EVAL category
        fig, axs = plt.subplots(1, 3, figsize=(18, 4))
        for ax, category, colour in zip(axs, categories, palette):
            subset = df_raw_sweeps[df_raw_sweeps["COD_ROLL_EVAL"] == category]
            ax.scatter(subset["LDI_mA"], subset[y_column], alpha=0.8, s=0.2, color=colour)
            ax.set_title(f"Scatter Plot of {y_column} vs LDI_mA for {category} (Wafer Code: {wafer_code})")
            ax.set_xlabel("LDI_mA")
            ax.set_ylabel(y_column)
            ax.grid(True)
        plt.tight_layout()
        plt.show()


# Calling Code
# Plot every sweep table, labelling it with the WAFER_ID found in its first row.
# NOTE(review): `annotated_sweeps_tables` is not defined in the cells visible here;
# it is presumably an iterable of DataFrames produced elsewhere — confirm before running.
for df_raw_sweeps in annotated_sweeps_tables:
    wafer_code = df_raw_sweeps["WAFER_ID"].iloc[0]
    plot_sweep_data(df_raw_sweeps, wafer_code)