# FTIR Data Analysis Main Workflow
This notebook guides you through the main steps of the FTIR data analysis workflow, including file renaming, DataFrame creation or modification, and baseline correction parameter management.

Note: multiple interactive cells can't be open at the same time, so click 'close' when done with one.

## 1) Setup
### Import Statements
Import necessary libraries and modules for data analysis and visualization.

In [1]:
import pandas as pd
from Fixing_File_Names import batch_rename_files
from File_Info_Gathering import file_info_extractor
from IPython.display import display, HTML
from Fixing_File_Names import batch_rename_files
from Analysis_FTIR import (
    anchor_points_selection,
    normalization_peak_selection,
    plot_grouped_spectra,
    try_baseline,
    bring_in_DataFrame,
    test_baseline_choices,
    spectrum_normalization,
    baseline_correction,
    find_peak_info,
    peak_deconvolution
)
# Colab compatibility: assume a regular Jupyter environment unless the Colab
# custom widget manager can be imported and switched on.
In_Colab = False
try:
    from google.colab import output  # type: ignore

    output.enable_custom_widget_manager()
    In_Colab = True
except Exception:
    # Not running in Colab (or the widget manager could not be enabled);
    # keep the default of False.
    pass

### File Renaming
You can optionally rename files in your dataset.

This script scans a specified root directory and its subdirectories to find and rename files. Folder names will not be changed, except when the optional renaming of dates to ISO format (e.g., 2025-09-18) is enabled. It works by replacing spaces and/or specified words in the filenames (e.g., replacing spaces with underscores). Using this tool is suggested if file names have inconsistent naming conventions that may cause issues in downstream processing.

In [None]:
# --- Rename settings ---------------------------------------------------------
# If any of these settings is None, you will be prompted for it instead (may
# result in multiple prompts and/or minor formatting issues).

# Root directory whose folders and files are renamed
# (e.g., r"C:\Users\user1\folder1").
directory = r"C:\Users\user1\folder1"

# Space replacement: set replace_spaces to True to swap spaces in filenames for
# character_to_use (e.g., "_").
replace_spaces = False
character_to_use = "_"

# Date renaming: set iso_date_rename to True to convert all dates in the
# directory names to ISO format (YYYY-MM-DD).
iso_date_rename = False

# Word replacement: set file_rename to True to replace the words listed in
# pairs_input (e.g., "old1:new1,old2:new2") in filenames.
file_rename = False
pairs_input = "old1:new1,old2:new2"

# --- Run the rename over the specified directory ------------------------------
batch_rename_files(
    directory=directory,
    file_rename=file_rename,
    pairs_input=pairs_input,
    iso_date_rename=iso_date_rename,
    replace_spaces=replace_spaces,
    character_to_use=character_to_use,
)

### Load or create DataFrame
The DataFrame stores all of the relevant information on the spectra in an easily organized format. Processes within this Notebook often refer to the DataFrame within the working memory, and it is saved at the end of the Notebook. If running this for the first time, this cell creates a blank DataFrame that will be filled in by the next cell.

In [2]:
# Path to an existing DataFrame CSV file. Leave as None if the DataFrame is new or is
# stored in the default location.
DataFrame_path = None

# Bring the DataFrame into working memory. The call also returns a path, which replaces
# DataFrame_path and is used again by the save cell at the end of the notebook.
FTIR_DataFrame, DataFrame_path = bring_in_DataFrame(DataFrame_path=DataFrame_path)

### Fill or Append Spectra to DataFrame
Gathers file information and builds the main data structure for analysis. Repeated uses can append new data into the existing DataFrame.

The DataFrame will have a row for each spectrum file, with columns as follows:

File Location, File Name, Date, Conditions, Material, Time, X-Axis, Raw Data, Baseline Function, Baseline Parameters, Baseline, Baseline-Corrected Data, Normalization Peak Wavenumber, Normalized and Corrected Data

This function will append any files that aren't already included.
If FTIR_DataFrame is empty it will create it from scratch.

In [None]:
# Set directory containing files to analyze (e.g., r"C:\Users\user1\folder1").
directory = r"Y:\5200\Packaging Reliability\Durability Tool\Ray Tracing and Activation Spectrum\ATR-FTIR Data"
# Set file types to include (e.g., [".dpt", ".txt", ".csv"]).
# NOTE(review): the examples in these comments show lists, while the values set below
# are plain or comma-separated strings -- confirm which forms file_info_extractor accepts.
file_types = ".dpt"
# Set separators to use when finding terms within filenames (e.g., ["_", " "])
separators = "_"
# Set material terms to search for in filenames (e.g., ["Si", "Perovskite", "Glass"]) 
# (case-insensitive).
material_terms = "CPC, t-PVDF, t-PVF, o-PVF, PPE, J-BOX#1, J-BOX#2, PO, PMMA"
# Set conditions terms to search for in filenames (e.g., ["A3", "A4", "B3", "B4"])
conditions_terms = "A3, A4, A5, 0.5X, 1X, 2.5X, 5X, ARC, OPN, KKCE, unexposed"
# Set append_missing to False to add only files which have all required information, or 
# True to add files even if some information is missing (may lead to issues downstream)
append_missing = False
# Set access_subdirectories to False if you only want to search within folders in the
# specified directory that have dates as their names. This lets you avoid searching
# through unrelated folders that happen to be in the same directory.
access_subdirectories = False
# Set track_replicates to True to print the groups of replicate files
track_replicates = False
# If any of these parameters are set to None, you will be prompted for input (may result
# in multiple prompts and/or minor formatting issues).

# Extract File Information and build or append to the main DataFrame.
FTIR_DataFrame = file_info_extractor(
    FTIR_DataFrame=FTIR_DataFrame,
    directory=directory,
    file_types=file_types,
    separators=separators,
    material_terms=material_terms,
    conditions_terms=conditions_terms,
    append_missing=append_missing,
    access_subdirectories=access_subdirectories,
    track_replicates=track_replicates,
)

### Display DataFrame

In [12]:
# Lift pandas' display limits so no columns or rows are elided.
for option_name in ("display.max_columns", "display.max_rows"):
    pd.set_option(option_name, None)

# Render the full table as HTML and wrap it in a fixed-height, scrollable
# container so the notebook output stays compact.
table_html = FTIR_DataFrame.to_html(max_rows=None, max_cols=None, notebook=True)
scrollable_table = HTML(
    f'<div style="height:500px;overflow:auto;">{table_html}</div>'
)
display(scrollable_table)

Unnamed: 0,File Location,File Name,Date,Conditions,Material,Time,X-Axis,Raw Data,Baseline Function,Baseline Parameters,Baseline,Baseline-Corrected Data,Normalization Peak Wavenumber,Normalized and Corrected Data,Peak Wavenumbers,Peak Absorbances,Deconvolution Results
0,Y:\5200\Packaging Reliability\Durability Tool\...,CPC_A3_500hr.dpt,05-22-2024,A3,CPC,500,"[3998.33984, 3997.30987, 3996.2799, 3995.24993...","[0.02034, 0.02032, 0.02046, 0.02068, 0.02069, ...",,,,,,,,,
1,Y:\5200\Packaging Reliability\Durability Tool\...,O-PVF_A3_500hr.dpt,05-22-2024,A3,o-PVF,500,"[3998.33984, 3997.30987, 3996.2799, 3995.24993...","[0.02148, 0.02145, 0.02154, 0.02161, 0.02163, ...",,,,,,,,,
2,Y:\5200\Packaging Reliability\Durability Tool\...,T-PVDF_A3_500hr.dpt,05-22-2024,A3,t-PVDF,500,"[3998.33984, 3997.30987, 3996.2799, 3995.24993...","[0.002, 0.0021, 0.00239, 0.00261, 0.0026, 0.00...",,,,,,,,,
3,Y:\5200\Packaging Reliability\Durability Tool\...,T-PVF_A3_500hr.dpt,05-22-2024,A3,t-PVF,500,"[3998.33984, 3997.30987, 3996.2799, 3995.24993...","[0.0054, 0.00536, 0.00535, 0.00538, 0.00547, 0...",,,,,,,,,
4,Y:\5200\Packaging Reliability\Durability Tool\...,CPC_0.5X_1000hr.dpt,12-18-2024,0.5X,CPC,1000,"[3998.33984, 3997.30987, 3996.2799, 3995.24993...","[-0.00232, -0.00235, -0.00243, -0.00251, -0.00...",,,,,,,,,
5,Y:\5200\Packaging Reliability\Durability Tool\...,CPC_1X_1000hr.dpt,12-18-2024,1X,CPC,1000,"[3998.33984, 3997.30987, 3996.2799, 3995.24993...","[-0.00123, -0.00118, -0.0012, -0.00128, -0.001...",,,,,,,,,
6,Y:\5200\Packaging Reliability\Durability Tool\...,CPC_2.5X_1000hr.dpt,12-18-2024,2.5X,CPC,1000,"[3998.33984, 3997.30987, 3996.2799, 3995.24993...","[-0.00045, -0.00032, -0.00021, -0.00025, -0.00...",,,,,,,,,
7,Y:\5200\Packaging Reliability\Durability Tool\...,CPC_5X_1000hr.dpt,12-18-2024,5X,CPC,1000,"[3998.33984, 3997.30987, 3996.2799, 3995.24993...","[-3e-05, -0.0001, -3e-05, -2e-05, -0.00011, -0...",,,,,,,,,
8,Y:\5200\Packaging Reliability\Durability Tool\...,J-BOX#1_0.5X_1000hr.dpt,12-18-2024,0.5X,J-BOX#1,1000,"[3998.33984, 3997.30987, 3996.2799, 3995.24993...","[-0.00201, -0.00204, -0.0021, -0.00217, -0.002...",,,,,,,,,
9,Y:\5200\Packaging Reliability\Durability Tool\...,J-BOX#1_1X_1000hr.dpt,12-18-2024,1X,J-BOX#1,1000,"[3998.33984, 3997.30987, 3996.2799, 3995.24993...","[-0.00069, -0.00074, -0.00075, -0.00081, -0.00...",,,,,,,,,


### Plot Spectra
Pick the material(s), condition(s), time(s) and which version of those files' data to plot.

A group plot is always created, but if separate_plots = True, then each spectrum will also be plotted individually.

If include_replicates = False, then only the first file found with those terms will be used.

In [None]:
# Set parameters for filtering and plotting.
# materials / conditions / times select which spectra are plotted.
materials = "PPE"  # Example material
conditions = "A5"  # Example conditions
times = "500"  # Example time
# Choose which version(s) of the selected files' data to plot.
raw_data = True
baseline = False
baseline_corrected = False
normalized = False
# A group plot is always created; separate_plots = True also plots each spectrum
# individually.
separate_plots = True
# include_replicates = False uses only the first file found with the terms above.
include_replicates = True
zoom = (
    None  # Set to "x_minimum-x_maximum" format, e.g., "400-4000", or None for no zoom
)

# Plot the grouped spectra
# The line magic below selects the inline matplotlib backend for this plot.
%matplotlib inline
plot_grouped_spectra(
    FTIR_DataFrame=FTIR_DataFrame,
    materials=materials,
    conditions=conditions,
    times=times,
    raw_data=raw_data,
    baseline=baseline,
    baseline_corrected=baseline_corrected,
    normalized=normalized,
    separate_plots=separate_plots,
    include_replicates=include_replicates,
    zoom=zoom,
)

## 2) Baseline Correction

You can choose a baseline approximation function for each different material that you have data for. It's recommended that you use 'ARPLS' with tweaked parameters or use 'Manual'. However, some datasets work better with different methods, so experiment if necessary.

Baseline Options:

Asymmetric Least Square 

    'ARPLS': asymmetrically reweighted penalized least squares smoothing-- an asymmetric least square method that uses a weighting function to account for noisy data.

Spline

    'IRSQR': iterative reweighted spline quantile regression-- uses penalized splines and iterative reweighted least squares to perform quantile regression.

Classification

    'FABC': fully automatic baseline correction-- uses first derivative approximation of data to identify and then ignore peak regions, then fits to baseline regions using Whittaker smoothing.

Manual

    'Manual': set "anchor points" for each of your materials using the built-in tool. This will create a list of wavenumber values that should always fall in the baseline regions for each spectrum of that material. A cubic spline interpolation will be done between those points' values in each scan.

### Baselines
Try out different baseline types and parameter options. The function will find the first file of your selected material with time == 0 (aka non-degraded) and display what the currently chosen settings will create for a baseline.

#### For non-manual baselines:
For custom parameters, move sliders or type desired value into box and hit 'enter'.

Accepts a filepath as an argument if you want to experiment with a specific file. In that case, "material" argument will be ignored.

Close when complete.

In [None]:
# Material to analyze (e.g., "PPE", with quotes).
material = "PPE"
# Optional path to one specific file. If None, will find first Time-Zero file of
# the specified material.
filepath = None
# Baseline function to try (options: "ARPLS", "IRSQR", "FABC", quotes included).
baseline_function = "ARPLS"
# Custom parameters, structured like so: parameter_string="lam=100, quantile=0.05".
# Default parameters will be used if None.
parameter_string = None

try_baseline(
    FTIR_DataFrame,
    filepath=filepath,
    material=material,
    baseline_function=baseline_function,
    parameter_string=parameter_string,
)

#### For manual baseline:
If an error appears on repeated running of this cell, simply click 'continue' and scroll down-- the error will clear.

In [None]:
# Material whose anchor points are being selected.
material = "CPC"
# Optional path to one specific file. If None, will find first Time-Zero file of
# the specified material.
filepath = None

anchor_points_selection(
    FTIR_DataFrame, material=material, filepath=filepath, try_it_out=True
)

#### ARPLS Parameters
lam (float): Smoothness parameter (higher = smoother baseline).

diff_order (integer): Order of the differential matrix.

max_iter (integer): Max number of fit iterations.

tol (float): Exit criteria (accuracy goal).

#### IRSQR Parameters
lam (float): The smoothing parameter (higher = smoother baseline).

quantile (float): The quantile at which to fit the baseline (0 < quantile < 1).

num_knots (integer): The number of knots for the spline.

spline_degree (integer): The degree of the spline.

diff_order (integer): The order of the differential matrix. Must be greater than 0. Default is 3 (third order differential matrix). Typical values are 3, 2, or 1.

max_iter (integer): The max number of fit iterations.

tol (float): Exit criteria (accuracy goal).

weights (array-like): The weighting array. If None (default), then the initial weights will be an array with size equal to N and all values set to 1.

eps (float): A small value added to the square of the residual to prevent dividing by 0. Default is None, which uses the square of the maximum-absolute-value of the fit each iteration multiplied by 1e-6.

#### FABC Parameters
lam (float): The smoothing parameter (higher = smoother baseline).

scale (integer): The scale at which to calculate the continuous wavelet transform. Should be approximately equal to the index-based full-width-at-half-maximum of the peaks or features in the data. Default is None, which will use half of the value from `optimize_window`, which is not always a good value, but at least scales with the number of data points and gives a starting point for tuning the parameter.

num_std (float): The number of standard deviations to include when thresholding. Higher values
will assign more points as baseline.

diff_order (integer): The order of the differential matrix. Must be greater than 0. Typical values are 2 or 1.

min_length (integer): Any region of consecutive baseline points less than `min_length` is considered to be a false positive and all points in the region are converted to peak points. A higher `min_length` ensures less points are falsely assigned as baseline points. Default is 2, which only removes lone baseline points.

weights (array-like): The weighting array, used to override the function's baseline identification to designate peak points. Only elements with 0 or False values will have an effect; all non-zero values are considered baseline points. If None, then will be an array with size equal to N and all values set to 1.

weights_as_mask (bool): If True, signifies that the input `weights` is the mask to use for fitting, which skips the continuous wavelet calculation and just smooths the input data.

pad_kwargs (dict): A dictionary of keyword arguments to pass to `pad_edges` for padding the edges of the data to prevent edge effects from convolution for the continuous wavelet transform. Default is None.

#### Manual Parameters
anchor_points (float): The manually selected anchor points, from which the baseline is constructed via a cubic spline interpolation between them. The points are selected in one file from the regions that should always remain outside of peaks for that material, under reasonable degradation conditions. The points associated with these wavenumbers will be accessed in each file and a separate interpolation will be done for each one. So while the anchor points are the same in every spectrum, the actual baseline correction will be personalized for each.

### Test Baseline and Parameter Choices
Generates plots with the selected baseline and parameters for three random files of the specified material. Allows for quality check.

In [None]:
# Quality check of the baseline choice for one material.
material = "PPE"  # Specify material to analyze (e.g., "PPE", with quotes).
test_baseline_choices(FTIR_DataFrame, material=material)

### Confirm & Calculate Choices
Calculates the baseline-corrected spectra and updates the DataFrame with the new data.

In [None]:
# Apply the baseline correction; the returned DataFrame replaces FTIR_DataFrame.
materials = "PPE"  # Specify materials to correct (e.g., "PPE, PVF", or "all").
FTIR_DataFrame = baseline_correction(FTIR_DataFrame, materials=materials)

## 3) Normalization

### Normalization Peak Selection
Select a peak for each material that does not change shape with time (aka does not degrade). Each spectrum of that material will be scaled so that the normalization peak is the same amplitude in each, giving a normalized set of spectra that can be more accurately compared to each other.

In [None]:
# Optional full path to one specific file, as a string with r""
# (e.g., r"C:\path\to\file.dpt"). If None, will find first Time-Zero file of the
# specified material.
filepath = None
# Material to analyze (e.g., "PPE", with quotes).
material = "PPE"

FTIR_DataFrame = normalization_peak_selection(
    FTIR_DataFrame,
    filepath=filepath,
    material=material,
)

### Spectra Normalization
Normalize the spectra of the chosen material to allow for further analysis.

In [None]:
# Normalize the spectra of one material; the returned DataFrame replaces FTIR_DataFrame.
material = "PPE"  # Specify material to analyze (e.g., "PPE", with quotes).
FTIR_DataFrame = spectrum_normalization(FTIR_DataFrame, material=material)

## 4) Peak Finding & Deconvolution

### Peak Finding
Utilize range 2 and range 3 within the interactive tool in order to specify multiple peak regions.

Close when complete.

In [None]:
# Open the peak-finding tool; the returned DataFrame replaces FTIR_DataFrame.
materials = "PPE"  # Specify materials to find peaks for (e.g., "PPE, PVF", or "all").
filepath = None # If not None, will override material search and use specified filename
FTIR_DataFrame = find_peak_info(FTIR_DataFrame, materials=materials, filepath=filepath)

VBox(children=(HBox(children=(Dropdown(description='Spectrum', layout=Layout(width='70%'), options=(('PPE | 0.…

FigureWidget({
    'data': [{'mode': 'lines',
              'name': 'Normalized and Corrected',
              'type': 'scatter',
              'uid': 'dc201acf-e67b-4b61-855f-208feab2e612',
              'x': {'bdata': ('vyuC/608r0AG2EennjqvQE2EDU+POK' ... '7jwgEbeUA4oRABhwp5QHEDPj8M+nhA'),
                    'dtype': 'f8'},
              'y': {'bdata': ('nVYQQ06cLD8EaQJ5g9kQPwPTL1JPhy' ... 'Emeg96Tj9xTsBLRFlYv0J+syxAcWG/'),
                    'dtype': 'f8'}},
             {'marker': {'color': 'red', 'size': 9, 'symbol': 'x'},
              'mode': 'markers',
              'name': 'Peaks',
              'type': 'scatter',
              'uid': 'bfb0cd3b-0af9-4c11-ba81-1b7deb111e42',
              'x': [],
              'y': []}],
    'layout': {'template': '...',
               'title': {'text': 'Peak Selection (live)'},
               'xaxis': {'title': {'text': 'Wavenumber (cm⁻¹)'}},
               'yaxis': {'title': {'text': 'Absorbance (AU)'}}}
})

Output()

### Deconvolution
If a peak should be steeper (go towards zero faster as it leaves its center point), then decrease that peak's α value. If a peak should be wider, do the opposite. The "Reduced chi-square" value is an approximation of the error between the model and the data, but visual tests are also useful for that purpose.

In [None]:
# Materials to deconvolve peaks for.
materials = "PPE"
# Optional specific file to use instead of searching by material.
# NOTE(review): presumably behaves like filepath in the peak-finding cell -- confirm.
filepath = None
# NOTE(review): unlike the other analysis steps, the return value is not assigned back
# to FTIR_DataFrame -- confirm that peak_deconvolution updates the DataFrame in place.
peak_deconvolution(FTIR_DataFrame, materials=materials, filepath=filepath)

## 5) Model Fitting

## 6) Results

### Save the DataFrame

In [None]:
# Save the DataFrame to CSV.
# DataFrame_path is the path returned when the DataFrame was loaded; assign a different
# path to it before this point if you want to save somewhere else.
# If it is still None, fall back to the default file name in the active directory:
# pandas' to_csv returns the CSV text instead of writing a file when the path is None,
# which would otherwise report a save that never happened.
if DataFrame_path is None:
    DataFrame_path = "FTIR_DataFrame.csv"
FTIR_DataFrame.to_csv(DataFrame_path, index=False)
print(f"DataFrame saved to {DataFrame_path}")