# Statistics Tutorial Multiple Files

This notebook demonstrates how to load and preprocess nanoindentation data, locate pop-ins, and extract statistics using the merrypopins library.
It processes multiple data files in a single run.

### Required imports

In [None]:
from merrypopins.load_datasets import load_txt
from merrypopins.preprocess import default_preprocess
from merrypopins.locate import default_locate
from merrypopins.statistics import (
    default_statistics_stress_strain,
)
import pandas as pd
from pathlib import Path
import math
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Folder containing the .txt indentation files
data_dir = Path("datasets/6microntip_slowloading")  # <-- adjust if needed

# glob() does not promise any particular order, so sort the matches: the
# processing order (and therefore the subplot order further down) is then
# the same on every run and every machine.
txt_files = sorted(data_dir.glob("*.txt"))
print(f"Found {len(txt_files)} .txt files.")

### Load, preprocess, locate pop-ins and extract statistics

In [None]:
# One statistics DataFrame per successfully processed file
# (stress–strain statistics, which also include the load–depth stats).
all_stats = []

for file in txt_files:
    print(f"Processing: {file.name}")
    try:
        # Pipeline: load -> preprocess -> locate -> statistics.
        # The use_cnn / use_iforest options of default_locate are switched off here.
        df_raw = load_txt(file)
        df_clean = default_preprocess(df_raw)
        df_located = default_locate(df_clean, use_cnn=False, use_iforest=False)
        df_stats = default_statistics_stress_strain(df_located)
        # Tag every row with its origin so the frames can be told apart after concatenation.
        df_stats["source_file"] = file.name
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining files.
        print(f"Error processing {file.name}: {e}")
    else:
        all_stats.append(df_stats)


### Show preview of all statistics

In [None]:
# Combine all processed stress–strain DataFrames (which include load–depth stats).
# pd.concat() fails with a terse "No objects to concatenate" when given an empty
# list, so raise the same exception type with a message that says what went wrong.
if not all_stats:
    raise ValueError(
        "No statistics to combine: no .txt file was processed successfully. "
        "Check the data folder and the errors printed while processing."
    )
df_all_stats = pd.concat(all_stats, ignore_index=True)

# Option 1: Full dataset (includes all rows, selected and non-selected)
print("Preview of full statistics dataset (stress–strain + load–depth):")
display(df_all_stats.head())

# Option 2: Filtered dataset — only rows where a pop-in was selected.
# "popin_selected" is used directly as a boolean mask, as the plotting cells do.
df_selected = df_all_stats[df_all_stats["popin_selected"]].copy()

print("Preview of selected pop-ins only:")
display(df_selected.head())

### Optional: export to csv

In [None]:
# Switch: set to True to write both CSV files into the current working directory.
export_csv = False

if not export_csv:
    print("Export skipped.")
else:
    # Full dataset (includes both stress–strain and load–depth stats)
    df_all_stats.to_csv("full_popin_statistics.csv", index=False)

    # Filtered dataset: only the rows flagged as a selected pop-in
    df_selected = df_all_stats.loc[df_all_stats["popin_selected"] == True].copy()
    df_selected.to_csv("selected_popin_statistics.csv", index=False)

    print("Exported full and filtered statistics datasets to CSV.")


### Plot load-depth curves with pop-ins (filtered to pop-ins that have a local-max)

In [None]:
# Grid layout: one subplot per processed file, three per row.
n_plots = len(all_stats)
n_cols = 3
n_rows = math.ceil(n_plots / n_cols)

fig, axs = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4.5 * n_rows))
flat_axes = axs.flatten()

for ax, df_stats in zip(flat_axes, all_stats):
    label = df_stats["source_file"].iloc[0]
    # Rows flagged as selected pop-ins, computed once and reused for both coordinates.
    popin_rows = df_stats[df_stats["popin_selected"]]

    ax.plot(df_stats["Depth (nm)"], df_stats["Load (µN)"], label="Curve")
    ax.scatter(
        popin_rows["Depth (nm)"],
        popin_rows["Load (µN)"],
        color="red", label="Pop-in", zorder=10
    )
    ax.set_title(label, fontsize=10)
    ax.set(xlabel="Depth (nm)", ylabel="Load (µN)")
    ax.legend(fontsize=8)
    ax.grid(True)

# Hide the grid cells that have no file to show
for ax in flat_axes[n_plots:]:
    ax.axis('off')

fig.suptitle("Load–Depth Curves with Pop-Ins", fontsize=16)
# Leave the top 3% of the figure free for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()


### Plot stress-strain curves

In [None]:
# Grid layout: one subplot per processed file, three per row.
n_plots = len(all_stats)
n_cols = 3
n_rows = math.ceil(n_plots / n_cols)

fig, axs = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4.5 * n_rows))
flat_axes = axs.flatten()

for ax, df_ss in zip(flat_axes, all_stats):
    label = df_ss["source_file"].iloc[0]
    # Rows flagged as selected pop-ins, computed once and reused for both coordinates.
    popin_rows = df_ss[df_ss["popin_selected"]]

    ax.plot(df_ss["strain"], df_ss["stress"], label="Curve")
    ax.scatter(
        popin_rows["strain"],
        popin_rows["stress"],
        color="red", label="Pop-in", zorder=10
    )
    ax.set_title(label, fontsize=10)
    ax.set(xlabel="Strain", ylabel="Stress (MPa)")
    ax.legend(fontsize=8)
    ax.grid(True)

# Hide the grid cells that have no file to show
for ax in flat_axes[n_plots:]:
    ax.axis('off')

fig.suptitle("Stress–Strain Curves with Pop-Ins", fontsize=16)
# Leave the top 3% of the figure free for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

### Plot examples

Pop-in frequency.

Average number of pop-ins per indent, grouped by grain. This normalised metric highlights grain-level differences in pop-in frequency.

In [None]:
import matplotlib.pyplot as plt

# Extract grain from file name (first "grain<digits>" match).
# Files whose name does not match get a missing grain and drop out of the groupbys below.
df_all_stats["grain"] = df_all_stats["source_file"].str.extract(r"(grain\d+)")

# Total indents per grain -> adjust naming to data
indent_counts = df_all_stats.groupby("grain")["source_file"].nunique()

# Total pop-ins per grain.
# Filtering to selected rows first drops every grain that has no pop-in at all;
# reindexing against indent_counts puts those grains back with a count of 0, so
# they show up as a rate of 0 instead of NaN after the division.
popin_counts = (
    df_all_stats[df_all_stats["popin_selected"]]
    .groupby("grain")["popin_selected"]
    .count()
    .reindex(indent_counts.index, fill_value=0)
)

# Normalised pop-in rate
popins_per_indent = (popin_counts / indent_counts).sort_values()

# Plot
plt.figure(figsize=(8, 5))
popins_per_indent.plot(kind="bar", color="thistle", edgecolor="black")
plt.ylabel("Avg Pop-Ins per Indent")
plt.xlabel("Grain")
plt.title("Average Pop-In Count per Indent by Grain")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()


Timing of first pop-in.

Mean first pop-in time per grain, with standard deviation. 

In [None]:
# Earliest selected pop-in time for each indent (one row per source file)
selected_rows = df_all_stats[df_all_stats["popin_selected"]]
first_popin_per_indent = (
    selected_rows.groupby("source_file")["Time (s)"].min().reset_index()
)

# Derive the grain label from the file name -> adjust naming to data
first_popin_per_indent["grain"] = first_popin_per_indent["source_file"].str.extract(r"(grain\d+)")

# Mean and standard deviation of the first pop-in time per grain, ordered by mean
per_grain_time = first_popin_per_indent.groupby("grain")["Time (s)"]
grain_stats = per_grain_time.agg(["mean", "std"]).sort_values("mean")

# Plot
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(
    grain_stats.index,
    grain_stats["mean"],
    yerr=grain_stats["std"],
    color="lavender",
    edgecolor="black",
    capsize=4,
)
# Reference line: the unweighted mean of the per-grain means
ax.axhline(grain_stats["mean"].mean(), color="gray", linestyle="--", label="Global Mean")
ax.set_ylabel("First Pop-In Time (s)")
ax.set_xlabel("Grain")
ax.set_title("Mean First Pop-In Time per Grain (with sd)")
ax.legend()
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()