# **Leave-One-Out Cross-Validation**
<div style="margin-top:10px; text-align:justify;">

The Partial Least Squares Regression (PLSR) model was first trained on the dataset. Model validation was then performed using Leave-One-Out Cross-Validation (LOOCV). In this approach, one sample is left out in each iteration, the model is trained on the remaining samples, and tested on the left-out sample. This process is repeated for every sample in the dataset.

LOOCV is particularly suitable for small datasets: every model is trained on all but one sample, so it provides a nearly unbiased (although potentially high-variance) estimate of the model’s predictive performance on new data.
</div>

In [2]:
# Reference concentrations for each element.
# Input element concentrations here: replace every `...` placeholder with the
# measured values, one number per sample.
# NOTE(review): the values are presumably expected in the same order as the
# sample rows of the matching peak table -- confirm against the data loader.
# An unfilled `[...]` placeholder is valid Python but cannot be converted to
# floats, so a forgotten entry fails loudly instead of being silently used.
element_concentrations = {
    'Al': [...],
    'Cu': [...],
    'Zn': [...],
    'Mn': [...],
    'Fe': [...],
    'Mg': [...],
    'Si': [...],
    'Ni': [...],
}

In [3]:
def perform_plsr_loocv_full_report(
    element,
    emission_lines,
    peak_max_df,
    element_concentrations=None,
    assign_sample_colors=None,
    cap_width=0.006,
    return_model=False,
    fixed_n_components=None  # Optional manual override
):
    """Run leave-one-out cross-validation of a PLSR model for one element.

    For every candidate number of PLS components, each sample is held out
    once, a ``StandardScaler`` + ``PLSRegression`` pipeline is fitted on the
    remaining samples, and the held-out sample is predicted. RMSECV, R² and
    MAE are computed from the pooled held-out predictions and printed.

    Parameters
    ----------
    element : hashable
        Key used to look up both ``peak_max_df`` and
        ``element_concentrations``.
    emission_lines : list
        Column labels of ``peak_max_df[element]`` used as predictors.
    peak_max_df : mapping or DataFrame
        ``peak_max_df[element]`` must yield a DataFrame whose columns include
        ``emission_lines``; its index supplies the sample labels.
    element_concentrations : mapping, optional
        Maps ``element`` to a 1-D sequence of target values, one per row of
        ``peak_max_df[element]``.
    assign_sample_colors, cap_width
        Currently unused by this function; kept so existing calls that pass
        them continue to work.
    return_model : bool
        When true, a pipeline refitted on all samples with the chosen number
        of components is appended to the returned tuple.
    fixed_n_components : int, optional
        Evaluate only this number of components instead of scanning
        ``1..max``. ``None`` (or any falsy value) selects automatically.

    Returns
    -------
    tuple
        ``(summary_df, best_n, best_details_df)`` and, when ``return_model``
        is true, the fitted pipeline as a fourth item. ``summary_df`` has one
        row per evaluated component count; ``best_details_df`` has one row
        per held-out sample for ``best_n``.

    Raises
    ------
    ValueError
        If the concentrations for ``element`` are missing, their count does
        not match the number of samples, there are too few samples/lines to
        cross-validate, or ``fixed_n_components`` is out of range.
    """
    import numpy as np
    import pandas as pd

    if element_concentrations is None or element not in element_concentrations:
        raise ValueError(f"Missing concentration values for element '{element}'.")

    df = peak_max_df[element]
    X = df[emission_lines].values
    y = np.array(element_concentrations[element], dtype=float)
    sample_labels = df.index.tolist()

    n_samples, n_features = X.shape

    # Positional indexing below would silently truncate a longer target list
    # (misaligning samples) or fail with an opaque IndexError on a shorter one.
    if y.shape != (n_samples,):
        raise ValueError(
            f"Element '{element}' has {n_samples} samples but {y.size} "
            "concentration values were provided."
        )

    # Each fold trains on n_samples - 1 rows, which bounds the usable components.
    max_n = min(n_samples - 1, n_features)
    if max_n < 1:
        raise ValueError(
            "LOOCV needs at least 2 samples and 1 emission line; "
            f"got {n_samples} sample(s) and {n_features} line(s)."
        )

    if fixed_n_components:
        if not 1 <= fixed_n_components <= max_n:
            raise ValueError(
                f"fixed_n_components must be between 1 and {max_n}; "
                f"got {fixed_n_components}."
            )
        n_range = [fixed_n_components]
    else:
        n_range = range(1, max_n + 1)

    # scikit-learn is imported only once the inputs are known to be usable,
    # so argument errors surface immediately.
    from sklearn.cross_decomposition import PLSRegression
    from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
    from sklearn.model_selection import LeaveOneOut
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    metrics_summary = []
    details_by_n = {}

    print(f"LOOCV performance for element '{element}' with {max_n} max components:")

    loo = LeaveOneOut()
    for n in n_range:
        y_true_cv = []
        y_pred_cv = []
        detailed_records = []

        for train_idx, test_idx in loo.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            test_sample = sample_labels[test_idx[0]]
            train_samples = [sample_labels[i] for i in train_idx]

            # The scaler is refitted inside every fold so the held-out sample
            # never influences the standardisation.
            model = make_pipeline(StandardScaler(), PLSRegression(n_components=n))
            model.fit(X_train, y_train)
            # Depending on the scikit-learn release, predict() returns shape
            # (1, 1) or (1,) for a 1-D target; ravel() handles both.
            y_pred = np.ravel(model.predict(X_test))[0]
            y_true = y_test[0]

            y_true_cv.append(y_true)
            y_pred_cv.append(y_pred)

            abs_error = abs(y_true - y_pred)
            # Relative error is undefined for a zero reference value.
            rel_error = abs_error / y_true * 100 if y_true != 0 else np.nan

            detailed_records.append({
                "Left-Out Sample": test_sample,
                "Training Samples": train_samples,
                "True": round(y_true, 4),
                "Predicted": round(y_pred, 4),
                "Abs Error": round(abs_error, 4),
                "Rel Error (%)": round(rel_error, 2)
            })

        r2 = r2_score(y_true_cv, y_pred_cv)
        # sqrt(MSE) instead of the `squared=False` keyword, which recent
        # scikit-learn releases no longer accept.
        rmse = float(np.sqrt(mean_squared_error(y_true_cv, y_pred_cv)))
        mae = mean_absolute_error(y_true_cv, y_pred_cv)

        print(f"  n_components={n} | RMSECV={rmse:.4f}, R²={r2:.4f}, MAE={mae:.4f}")

        metrics_summary.append({
            "n_components": n,
            "RMSECV": rmse,
            "R2": r2,
            "MAE": mae
        })

        details_by_n[n] = pd.DataFrame(detailed_records)

    summary_df = pd.DataFrame(metrics_summary)

    if fixed_n_components:
        best_n = fixed_n_components
    else:
        # Cast to a plain int so callers do not receive a NumPy scalar.
        best_n = int(summary_df.loc[summary_df["RMSECV"].idxmin(), "n_components"])

    best_details_df = details_by_n[best_n]

    print(f"\nBest n_components: {best_n} with RMSECV = {summary_df.loc[summary_df['n_components'] == best_n, 'RMSECV'].values[0]:.4f}")
    print(f"\n✅ The best number of components is: {best_n}")

    if return_model:
        # Final model uses every sample with the selected component count.
        final_model = make_pipeline(StandardScaler(), PLSRegression(n_components=best_n))
        final_model.fit(X, y)
        return summary_df, best_n, best_details_df, final_model

    return summary_df, best_n, best_details_df


# -------------------------------
# AUTO-RUN FIRST
# -------------------------------
# First pass: let the LOOCV routine pick the number of PLS components for Mn.

_mn_report_args = dict(
    element="Mn",
    emission_lines=["Mn 403.08 nm", "Mn 403.31 nm"],
    peak_max_df=peak_max_df,
    element_concentrations=element_concentrations,
)
_detail_columns = [
    "Left-Out Sample",
    "Training Samples",
    "True",
    "Predicted",
    "Abs Error",
    "Rel Error (%)",
]

summary_df, best_n, details_df = perform_plsr_loocv_full_report(**_mn_report_args)

print("\nAuto-selected LOOCV summary:")
print(summary_df)

print("\nDetailed predictions for best n_components:")
import IPython.display as disp
disp.display(details_df.loc[:, _detail_columns])

# -------------------------------
# OPTIONAL: MANUAL OVERRIDE
# -------------------------------
# Second pass: same Mn inputs, but with the number of components pinned to 2.
# This call is live (not commented out) and overwrites summary_df, best_n and
# details_df from the auto-selected run above; comment it out to keep those.

summary_df, best_n, details_df = perform_plsr_loocv_full_report(
    fixed_n_components=2,
    **_mn_report_args,
)

print("\nManual override LOOCV summary:")
print(summary_df)
print("\nDetailed predictions for n_components=2:")
disp.display(details_df.loc[:, _detail_columns])

In [None]:
def perform_plsr_loocv_full_report(
    element,
    emission_lines,
    peak_max_df,
    element_concentrations=None,
    assign_sample_colors=None,
    cap_width=0.006,
    return_model=False,
    fixed_n_components=None
):
    """Run leave-one-out cross-validation of a PLSR model for one element.

    For every candidate number of PLS components, each sample is held out
    once, a ``StandardScaler`` + ``PLSRegression`` pipeline is fitted on the
    remaining samples, and the held-out sample is predicted. RMSECV, R² and
    MAE are computed from the pooled held-out predictions and printed.

    Parameters
    ----------
    element : hashable
        Key used to look up both ``peak_max_df`` and
        ``element_concentrations``.
    emission_lines : list
        Column labels of ``peak_max_df[element]`` used as predictors.
    peak_max_df : mapping or DataFrame
        ``peak_max_df[element]`` must yield a DataFrame whose columns include
        ``emission_lines``; its index supplies the sample labels.
    element_concentrations : mapping, optional
        Maps ``element`` to a 1-D sequence of target values, one per row of
        ``peak_max_df[element]``.
    assign_sample_colors, cap_width
        Currently unused by this function; kept so existing calls that pass
        them continue to work.
    return_model : bool
        When true, a pipeline refitted on all samples with the chosen number
        of components is appended to the returned tuple.
    fixed_n_components : int, optional
        Evaluate only this number of components instead of scanning
        ``1..max``. ``None`` (or any falsy value) selects automatically.
        Accepted here so that redefining the function in this cell does not
        break calls that pass a manual override.

    Returns
    -------
    tuple
        ``(summary_df, best_n, best_details_df)`` and, when ``return_model``
        is true, the fitted pipeline as a fourth item. ``summary_df`` has one
        row per evaluated component count; ``best_details_df`` has one row
        per held-out sample for ``best_n``.

    Raises
    ------
    ValueError
        If the concentrations for ``element`` are missing, their count does
        not match the number of samples, there are too few samples/lines to
        cross-validate, or ``fixed_n_components`` is out of range.
    """
    import numpy as np
    import pandas as pd

    if element_concentrations is None or element not in element_concentrations:
        raise ValueError(f"Missing concentration values for element '{element}'.")

    df = peak_max_df[element]
    X = df[emission_lines].values
    y = np.array(element_concentrations[element], dtype=float)
    sample_labels = df.index.tolist()

    n_samples, n_features = X.shape

    # Positional indexing below would silently truncate a longer target list
    # (misaligning samples) or fail with an opaque IndexError on a shorter one.
    if y.shape != (n_samples,):
        raise ValueError(
            f"Element '{element}' has {n_samples} samples but {y.size} "
            "concentration values were provided."
        )

    # Each fold trains on n_samples - 1 rows, which bounds the usable components.
    max_n = min(n_samples - 1, n_features)
    if max_n < 1:
        raise ValueError(
            "LOOCV needs at least 2 samples and 1 emission line; "
            f"got {n_samples} sample(s) and {n_features} line(s)."
        )

    if fixed_n_components:
        if not 1 <= fixed_n_components <= max_n:
            raise ValueError(
                f"fixed_n_components must be between 1 and {max_n}; "
                f"got {fixed_n_components}."
            )
        n_range = [fixed_n_components]
    else:
        n_range = range(1, max_n + 1)

    # scikit-learn is imported only once the inputs are known to be usable,
    # so argument errors surface immediately.
    from sklearn.cross_decomposition import PLSRegression
    from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
    from sklearn.model_selection import LeaveOneOut
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    metrics_summary = []
    details_by_n = {}

    print(f"LOOCV performance for element '{element}' with {max_n} max components:")

    loo = LeaveOneOut()
    for n in n_range:
        y_true_cv = []
        y_pred_cv = []
        detailed_records = []

        for train_idx, test_idx in loo.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            test_sample = sample_labels[test_idx[0]]
            train_samples = [sample_labels[i] for i in train_idx]

            # The scaler is refitted inside every fold so the held-out sample
            # never influences the standardisation.
            model = make_pipeline(StandardScaler(), PLSRegression(n_components=n))
            model.fit(X_train, y_train)
            # Depending on the scikit-learn release, predict() returns shape
            # (1, 1) or (1,) for a 1-D target; ravel() handles both.
            y_pred = np.ravel(model.predict(X_test))[0]
            y_true = y_test[0]

            y_true_cv.append(y_true)
            y_pred_cv.append(y_pred)

            abs_error = abs(y_true - y_pred)
            # Relative error is undefined for a zero reference value.
            rel_error = abs_error / y_true * 100 if y_true != 0 else np.nan

            detailed_records.append({
                "Left-Out Sample": test_sample,
                "Training Samples": train_samples,
                "True": round(y_true, 4),
                "Predicted": round(y_pred, 4),
                "Abs Error": round(abs_error, 4),
                "Rel Error (%)": round(rel_error, 2)
            })

        r2 = r2_score(y_true_cv, y_pred_cv)
        # sqrt(MSE) instead of the `squared=False` keyword, which recent
        # scikit-learn releases no longer accept.
        rmse = float(np.sqrt(mean_squared_error(y_true_cv, y_pred_cv)))
        mae = mean_absolute_error(y_true_cv, y_pred_cv)

        print(f"  n_components={n} | RMSECV={rmse:.4f}, R²={r2:.4f}, MAE={mae:.4f}")

        metrics_summary.append({
            "n_components": n,
            "RMSECV": rmse,
            "R2": r2,
            "MAE": mae
        })

        details_by_n[n] = pd.DataFrame(detailed_records)

    summary_df = pd.DataFrame(metrics_summary)

    if fixed_n_components:
        best_n = fixed_n_components
    else:
        # Cast to a plain int so callers do not receive a NumPy scalar.
        best_n = int(summary_df.loc[summary_df["RMSECV"].idxmin(), "n_components"])

    best_details_df = details_by_n[best_n]

    print(f"\nBest n_components: {best_n} with RMSECV = {summary_df.loc[summary_df['n_components'] == best_n, 'RMSECV'].values[0]:.4f}")
    print(f"\n✅ The best number of components is: {best_n}")

    if return_model:
        # Final model uses every sample with the selected component count.
        final_model = make_pipeline(StandardScaler(), PLSRegression(n_components=best_n))
        final_model.fit(X, y)
        return summary_df, best_n, best_details_df, final_model

    return summary_df, best_n, best_details_df

from IPython.display import display

# Auto-select the number of PLS components for Zn from its three emission lines.
summary_df, best_n, details_df = perform_plsr_loocv_full_report(
    element="Zn",
    emission_lines=["Zn 468.01 nm", "Zn 472.22 nm", "Zn 481.05 nm"],
    peak_max_df=peak_max_df,
    element_concentrations=element_concentrations
)

# A notebook cell only auto-renders its final expression, so the summary table
# must be displayed explicitly; a bare `summary_df` here would show nothing.
display(summary_df)
details_df[["Left-Out Sample", "Training Samples", "True", "Predicted", "Abs Error", "Rel Error (%)"]]
