# **Partial Least Squares Regression Model**
<div style="margin-top:10px; text-align:justify;">
This analysis uses Partial Least Squares Regression (PLSR) to predict certified elemental concentrations in aluminium alloy samples based on spectral peak intensities extracted from SNV/MSC/TSN-normalized emission spectra.<br></br>
🎯 Objective:<br></br>
To develop a calibration model that maps selected spectral emission line intensities to certified reference concentrations (%) of elements such as Cu, Mn, Mg, Si, Zn etc.<br></br>
📈 Method Overview <br></br>
⚛Inputs:

- peak_max_df: A dictionary of DataFrames containing maximum intensities at specific emission lines.

- element_concentrations: Certified concentration values (%) for each element.

- emission_lines: Selected emission wavelengths for the element being modeled.

⚛Workflow:

- Extracts peak intensities from selected lines for the target element.

- Fits a PLS regression model (with optional automatic component selection).

- Predicts concentrations and calculates:

     - Absolute Error

     - Relative Error (%)

     - RMSE, MAE, and R² scores

- Visualizes results using Bokeh:

     - Scatter plot of predicted vs. actual concentrations

     - Error bars (± absolute error)

     - Summary statistics

     - Interactive data table of predictions

</div>

In [1]:
# Template cell: replace every placeholder list below with that element's
# concentration values before running. Each list is later converted to a float
# array and used as the regression target, so it needs one value per row of the
# matching peak-intensity DataFrame, in the same row order.
# NOTE(review): the 👈 marker on the 'Al' line sits before the '#', so it has
# to be removed (or moved inside the comment) for this cell to parse.
element_concentrations = {
    'Al': [..........................], 👈#Input element concentrations here
    'Cu': [..........................],
    'Zn': [..........................],
    'Mn': [..........................],
    'Fe': [..........................],
    'Mg': [..........................],
    'Si': [..........................],
    'Ni': [..........................]
}

In [None]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, Label, DataTable, TableColumn, NumberFormatter, PreText, Legend
from bokeh.layouts import column
from bokeh.transform import factor_mark
from bokeh.io import output_file
import numpy as np
import pandas as pd

# Configure Bokeh to save its output to bokeh_plsr.html when show() is called.
output_file("bokeh_plsr.html")

# Sample color/marker mapping (customize as needed)
def assign_sample_colors(sample_labels):
    """Look up the plotting style for each sample label.

    Returns a list of ``(color, marker)`` tuples, one per entry of
    *sample_labels* and in the same order. Labels that have no dedicated
    style fall back to ``("gray", "x")``.
    """
    fallback_style = ("gray", "x")

    # Every family shares one color; the marker tells its members apart.
    family_markers = ("circle", "square", "triangle", "diamond")
    families = {
        "blue": ("BAM-308", "BAM-311", "BAM-M308a", "BAM-M318"),
        "red": ("ERM-EB313", "ERM-EB314a", "ERM-EB315a", "ERM-EB317"),
    }
    known_styles = {
        label: (color, marker)
        for color, labels in families.items()
        for label, marker in zip(labels, family_markers)
    }

    styles = []
    for label in sample_labels:
        styles.append(known_styles.get(label, fallback_style))
    return styles

# PLSR function (no cross-validation)
def perform_plsr_with_table_no_cv(
    element,
    emission_lines,
    peak_max_df,
    element_concentrations,
    cap_width=0.03,
    n_components="auto",
    return_model=False
):
    """Fit a PLS regression for one element and display the results with Bokeh.

    The model is a ``StandardScaler`` + ``PLSRegression`` pipeline fitted on
    every sample, and predictions are made on those same samples. All
    reported errors and scores are therefore in-sample figures; no
    cross-validation is performed.

    Parameters
    ----------
    element
        Key used to look up both ``peak_max_df`` and
        ``element_concentrations``.
    emission_lines
        Column labels of ``peak_max_df[element]`` to use as predictors.
    peak_max_df
        Mapping from element to a DataFrame whose index holds the sample
        labels and whose columns include ``emission_lines``.
    element_concentrations
        Mapping from element to a sequence of target concentrations, one per
        DataFrame row and in the same order.
    cap_width
        Half-width of the horizontal error-bar caps, in x-axis units.
    n_components
        Number of PLS components, or ``"auto"`` to try every count from 1 to
        ``min(len(emission_lines), n_samples)`` and keep the one with the
        lowest in-sample RMSE (ties keep the smaller count).
    return_model
        When true, also return the fitted pipeline.

    Returns
    -------
    pandas.DataFrame, or (pandas.DataFrame, fitted pipeline) when
    ``return_model`` is true. The DataFrame holds the plotted columns plus
    the observed/predicted values and their absolute and relative deviations.

    Raises
    ------
    ValueError
        If there are no predictors or samples, or if the number of
        concentrations differs from the number of DataFrame rows.
    """
    df = peak_max_df[element]
    X = df[emission_lines].values
    y = np.array(element_concentrations[element], dtype=float)
    sample_labels = df.index.tolist()

    # Fail early with a readable message instead of an error from deep inside
    # the fitting or plotting code.
    if X.size == 0 or y.size == 0:
        raise ValueError(
            f"No data to fit for {element}: need at least one emission line and one sample."
        )
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            f"{element}: {X.shape[0]} samples in peak_max_df but "
            f"{y.shape[0]} concentration values."
        )

    # Automatically select n_components by lowest in-sample RMSE.
    # NOTE(review): without held-out data this criterion tends to favor the
    # larger component counts; consider cross-validation for model selection.
    if n_components == "auto":
        best_rmse = float("inf")
        best_n = 1
        for n in range(1, min(len(emission_lines), len(y)) + 1):
            candidate = make_pipeline(StandardScaler(), PLSRegression(n_components=n))
            candidate.fit(X, y)
            candidate_rmse = np.sqrt(mean_squared_error(y, candidate.predict(X).ravel()))
            if candidate_rmse < best_rmse:
                best_rmse = candidate_rmse
                best_n = n
        # Use the selected count (previously a hard-coded 2 discarded the
        # search result and could exceed the number of available lines).
        n_components = best_n

    # Final model
    model = make_pipeline(StandardScaler(), PLSRegression(n_components=n_components))
    model.fit(X, y)
    y_pred = model.predict(X).ravel()

    abs_error = np.abs(y_pred - y)
    # Relative error is undefined for a zero reference value: leave NaN there
    # and only divide where y is non-zero, so no divide-by-zero warning fires.
    rel_error = np.full_like(y, np.nan)
    np.divide(abs_error, y, out=rel_error, where=y != 0)
    rel_error *= 100
    pred_error_str = [f"{pred:.2f} ± {err:.2f}" for pred, err in zip(y_pred, abs_error)]

    # Compute each summary metric once; they feed both the plot label and the
    # text summary.
    r2 = r2_score(y, y_pred)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    mae = mean_absolute_error(y, y_pred)
    relative_rmse = (rmse / np.mean(y)) * 100

    sample_colors, sample_markers = zip(*assign_sample_colors(sample_labels))
    source_data = {
        "x": y,
        "y": y_pred,
        "sample": sample_labels,
        "color": sample_colors,
        "marker": sample_markers,
        "lower": y_pred - abs_error,
        "upper": y_pred + abs_error,
        "x_left": y - cap_width,
        "x_right": y + cap_width,
        "Observed (%)": y,
        "Predicted (%)": y_pred,
        "Abs Deviation": abs_error,
        "Rel Deviation (%)": rel_error,
        "Predicted ± Error": pred_error_str
    }
    source = ColumnDataSource(data=source_data)

    # Plot
    min_val = float(min(y.min(), y_pred.min()))
    max_val = float(max(y.max(), y_pred.max()))

    p = figure(title=f"PLSR for {element}\n{' + '.join(emission_lines)}",
               x_axis_label="Certified Concentration (%)",
               y_axis_label="Predicted Concentration (%)",
               width=980, height=600)

    # The "marker" column already holds marker names, so the factor -> marker
    # mapping is the identity.
    marker_names = ["circle", "square", "triangle", "diamond", "x"]
    p.scatter('x', 'y', source=source, size=16,
              color='color',
              marker=factor_mark('marker', markers=marker_names, factors=marker_names),
              legend_field='sample')

    # Error bars: a vertical stem of ± absolute error with a cap at each end.
    p.segment(x0='x', y0='lower', x1='x', y1='upper', source=source, line_color="black", line_width=2)
    p.segment(x0='x_left', y0='lower', x1='x_right', y1='lower', source=source, line_color="black", line_width=2)
    p.segment(x0='x_left', y0='upper', x1='x_right', y1='upper', source=source, line_color="black", line_width=2)
    # 1:1 reference line spanning the full data range.
    p.line([min_val, max_val], [min_val, max_val], line_dash='dashed', color='gray', legend_label="Perfect Fit")

    label = Label(
        x=max_val * 0.4,
        y=min_val * 1.05,
        text=f"R²: {r2:.3f}\nRMSE: {rmse:.3f}\nMAE: {mae:.3f}",
        text_font_size='14pt',
        background_fill_color='white',
        background_fill_alpha=0.7
    )
    p.add_layout(label)
    p.legend.visible = False

    columns = [
        TableColumn(field="sample", title="Sample"),
        TableColumn(field="Observed (%)", title="Observed", formatter=NumberFormatter(format="0.000")),
        TableColumn(field="Predicted (%)", title="Predicted", formatter=NumberFormatter(format="0.000")),
        TableColumn(field="Abs Deviation", title="Abs Error", formatter=NumberFormatter(format="0.000")),
        TableColumn(field="Rel Deviation (%)", title="Rel Error (%)", formatter=NumberFormatter(format="0.00")),
        TableColumn(field="Predicted ± Error", title="Predicted ± Error")
    ]
    data_table = DataTable(source=source, columns=columns, width=850, height=300)

    summary = PreText(text=f"PLSR for {element} using {len(emission_lines)} lines\n"
                           f"R²: {r2:.3f}, RMSE: {rmse:.4f}, "
                           f"MAE: {mae:.4f}, Relative RMSE: {relative_rmse:.2f}%",
                      width=850)

    show(column(p, data_table, summary))

    results = pd.DataFrame(source_data)
    if return_model:
        return results, model
    return results

# Example run: fit the Cu calibration on three Cu emission lines. With
# return_model=True the call returns both the results table and the fitted
# pipeline. peak_max_dict is not defined in this cell and must already exist.
results, model = perform_plsr_with_table_no_cv(
    element="Cu",
    emission_lines=["Cu 324.75 nm", "Cu 327.40 nm", "Cu 510.55 nm"],
    peak_max_df=peak_max_dict,  # This is the dictionary returned by extract_peak_intensities()
    element_concentrations=element_concentrations,
    return_model=True
)