## **PORE SIZE DISTRIBUTIONS**

In [45]:
# importing libraries for data analysis

import os
import re  
import unicodedata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter

In [None]:
# setting up folder paths for the analysis
import os
from pathlib import Path

# Paths are composed as pathlib.Path objects (a plain str does not support the
# "/" join operator) and converted to str only once complete, because the rest
# of the notebook treats folders as strings (.strip(), .upper(), "+" in messages).
notebook_dir = Path.cwd()
proj_root = notebook_dir.parent
base_dir_path = proj_root / "CRYO-SEM DATA" / "CRYO-SEM X30000"
BASE_DIR = str(base_dir_path)

# where the data files are stored
pore_size_folder = str(
    base_dir_path / "CRYO-SEM X30000 [1]" / "CRYO-SEM X30000 [1]" / "STATS" / "PORE SIZE RESULTS"
)
bubble_analysis_folder = str(
    base_dir_path / "CRYO-SEM X30000 [1]" / "CRYO-SEM X30000 [1] BA INDIVIDUAL STATS" / "Diameters"
)

# list of all input folders
data_folders = [pore_size_folder, bubble_analysis_folder]
org_dir = str(proj_root)

# where to save the output files
output_pore_folder = str(proj_root / "PORE SIZE DISTRIBUTIONS" / "ANALYSED PRIOR")
output_bubble_folder = str(proj_root / "PORE SIZE DISTRIBUTIONS" / "BUBBLE ANALYSIS")

In [49]:
# colour palettes and the method -> colour-family lookup used by the plots

# pastel palette (returned by get_color_for_method unless the dark one is requested)
light_colors = dict(blue="#8ecae6", purple="#cdb4db", red="#f08080")

# saturated palette
dark_colors = dict(blue="#1f77b4", purple="#6b21a8", red="#b91c1c")

# Each colour family lists the upper-case tokens that are searched for in a
# filename / method name. Families and tokens are scanned in the order written
# here and the first token found wins.
method_groups = {
    "blue": [
        "FREEHAND",
        "OVAL",
    ],
    "purple": [
        "ILASTIK",
        "SEMI",
        "SAMJ",
        "60%",
        "60P",
    ],
    "red": [
        "PLANKSTER",
        "OTSU",
        "UNET",
        "PORED2",
    ],
}

In [51]:
# functions to help organize and identify different data types

def check_if_bubble_analysis(folder_path):
    """Tell whether a folder path points at bubble analysis data.

    The path counts as bubble analysis when it contains the text
    "BA INDIVIDUAL STATS", compared case-insensitively.
    """
    marker = "BA INDIVIDUAL STATS"
    upper_path = folder_path.upper()
    return upper_path.find(marker) != -1

def check_if_pore_size_data(folder_path):
    """Tell whether a folder path points at pore size analysis results.

    The path is upper-cased and searched for a "STATS/MATLAB STATS" segment
    (with either slash direction) or the text "PORE SIZE RESULTS".
    """
    haystack = folder_path.upper()
    markers = (
        "STATS\\MATLAB STATS",
        "STATS/MATLAB STATS",
        "PORE SIZE RESULTS",
    )
    return any(marker in haystack for marker in markers)

def figure_out_analysis_method(file_path):
    """Derive the analysis method label from a file's name.

    The upper-cased filename is searched for every token in ``method_groups``
    (families and tokens in declaration order); the first token found is
    returned, with "60P" reported as "60%". When no token is present the
    first chunk of the extension-less filename, split on underscores, hyphens
    and whitespace, is returned instead.
    """
    base_name = os.path.basename(file_path)
    upper_name = base_name.upper()

    # flatten the groups while keeping their declaration order
    known_tokens = [token for group in method_groups.values() for token in group]
    hit = next((token for token in known_tokens if token in upper_name), None)
    if hit is not None:
        # "60P" is just a filename-safe spelling of "60%"
        return "60%" if hit == "60P" else hit

    # nothing recognised: fall back to the leading part of the filename
    stem, _extension = os.path.splitext(base_name)
    return re.split('[_\\-\\s]+', stem)[0]

def get_color_for_method(method_name, use_dark_color=False):
    """Look up the plot colour for an analysis method.

    The palette is ``dark_colors`` when ``use_dark_color`` is true, otherwise
    ``light_colors``. The upper-cased method name is matched against the
    tokens in ``method_groups``; the first family with a token contained in
    the name decides the colour. Unknown methods get the palette's purple.
    """
    palette = dark_colors if use_dark_color else light_colors
    name_upper = method_name.upper()

    for family, tokens in method_groups.items():
        if any(token in name_upper for token in tokens):
            return palette[family]

    # no family matched
    return palette["purple"]

In [53]:
# functions to read CSV files and handle histogram data

def convert_histogram_to_individual_values(data_frame, total_samples=10000):
    """Expand a two-column histogram table into individual sample points.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        First column holds the diameter of each bin, second column its
        weight. Non-numeric cells are coerced: a missing weight counts as 0.
    total_samples : int, optional
        Scale factor applied to the weights (default 10000). When the weights
        sum to a value strictly between 0.5 and 2.0 they are first normalised
        to sum to 1, so roughly ``total_samples`` points are produced;
        otherwise the weights are scaled as they are.

    Returns
    -------
    numpy.ndarray
        Each valid diameter repeated ``round(weight * total_samples)`` times.
        Rows whose diameter could not be parsed are left out, so the result
        never contains NaN placeholders.
    """
    # get the diameter values from first column
    diameter_values = pd.to_numeric(data_frame.iloc[:, 0], errors="coerce").to_numpy()

    # get the probability or count values from second column
    probability_values = pd.to_numeric(data_frame.iloc[:, 1], errors="coerce").fillna(0).to_numpy()

    # check if these look like probabilities (sum close to 1)
    total_probability = probability_values.sum()
    if 0.5 < total_probability < 2.0:
        # normalize to make sure they sum to 1
        probability_values = probability_values / total_probability

    # convert to per-bin repeat counts
    sample_counts = np.round(probability_values * total_samples).astype(int)

    # Drop rows without a usable diameter *after* scaling, so the counts of
    # the remaining rows are the same as if the bad rows had been kept.
    has_diameter = np.isfinite(diameter_values)

    # create individual samples by repeating each diameter value
    return np.repeat(diameter_values[has_diameter], sample_counts[has_diameter])

def read_csv_file_safely(file_path):
    """Read a CSV file, retrying with progressively more lenient options.

    The attempts are, in order: pandas defaults, automatic separator
    detection, latin-1 encoding, and separator detection combined with
    latin-1. The first attempt that succeeds is returned.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    RuntimeError
        When every attempt fails. The error of the last attempt is attached
        as ``__cause__`` so the real reason (missing file, empty file, ...)
        is not lost.
    """
    read_attempts = (
        {},                                                         # the normal way
        {"sep": None, "engine": "python"},                          # sniff the separator
        {"encoding": "latin-1"},                                    # different encoding
        {"sep": None, "engine": "python", "encoding": "latin-1"},   # both
    )

    last_error = None
    for read_options in read_attempts:
        try:
            return pd.read_csv(file_path, **read_options)
        except Exception as error:
            # remember why this attempt failed and move on to the next one
            last_error = error

    # if nothing worked, give up (str() keeps the message working for Path input)
    raise RuntimeError("Could not read CSV file: " + str(file_path)) from last_error

In [55]:
def load_pore_sizes_from_file(file_path):
    """Read a CSV file and extract pore size values plus a guess of their units.

    Extraction strategy, first hit wins:

    1. A two-column file whose second column sums to between 0.5 and 1.5 is
       treated as a histogram and expanded into individual samples.
    2. Otherwise the first column whose (lower-cased) name contains a diameter
       keyword and holds at least one numeric value is used.
    3. Otherwise the numeric values of the first column are used.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    (numpy.ndarray, str)
        The values and either "um" or "nm". The unit is a heuristic: a 95th
        percentile below 10 is taken to mean micrometers, anything else
        nanometers. An empty result is reported as "um".
    """
    # try to read the CSV file
    data_frame = read_csv_file_safely(file_path)
    pore_values = None

    # first check if this looks like histogram data (2 columns with probabilities)
    if data_frame.shape[1] == 2:
        try:
            second_column_sum = pd.to_numeric(data_frame.iloc[:, 1], errors="coerce").fillna(0).sum()
            if 0.5 <= second_column_sum <= 1.5:
                # treat as histogram and expand to individual values
                pore_values = convert_histogram_to_individual_values(data_frame)
        except Exception:
            # deliberate best effort: a failed expansion just means "not a
            # histogram", so fall through to the column-based strategies
            pore_values = None

    # if histogram approach didn't work, look for diameter columns
    if pore_values is None:
        # Keywords are compared against the LOWER-CASED column name, so they
        # must be lower-case themselves. The micro sign is spelled with escapes
        # to survive re-encoding of this file: U+00B5 (micro sign), U+03BC
        # (Greek mu) and the lower-cased latin-1 mojibake of UTF-8 "µ".
        diameter_keywords = (
            "diam",
            "d_um",
            "d (um)",
            "d (\u00b5m)",
            "d (\u03bcm)",
            "d (\u00e2\u00b5m)",
            "d_nm",
            "d (nm)",
            "aecd",
        )
        for column_name in data_frame.columns:
            column_lower = str(column_name).lower()
            if not any(keyword in column_lower for keyword in diameter_keywords):
                continue

            # try to extract numeric values from this column
            numeric_values = pd.to_numeric(data_frame[column_name], errors="coerce").dropna().to_numpy()
            if len(numeric_values) > 0:
                pore_values = numeric_values
                break

    # if still no values found, just use the first column
    if pore_values is None:
        pore_values = pd.to_numeric(data_frame.iloc[:, 0], errors="coerce").dropna().to_numpy()

    # if no valid data found, return empty array
    if len(pore_values) == 0:
        return pore_values, "um"

    # figure out units by looking at the 95th percentile value
    percentile_95 = np.nanpercentile(pore_values, 95)
    units = "um" if percentile_95 < 10 else "nm"

    return pore_values, units

def convert_to_nanometers(values, current_units):
    """Return the pore sizes expressed in nanometers.

    Values tagged "um" (any letter case) are scaled by 1000; anything else is
    assumed to be nanometers already and is only cast to float.
    """
    given_in_micrometers = current_units.lower() == "um"
    if not given_in_micrometers:
        # nothing to scale, just guarantee a float array
        return values.astype(float, copy=False)

    # 1 um = 1000 nm
    return values * 1000.0

def make_axis_numbers_plain(plot_axis):
    """Show x-axis tick labels as ordinary numbers.

    Switches the axis to the 'plain' tick label style without an offset, then
    installs a ScalarFormatter that has scientific notation, offsets and math
    text all turned off.
    """
    plot_axis.ticklabel_format(axis='x', style='plain', useOffset=False)

    plain_formatter = ScalarFormatter(useMathText=False, useOffset=False)
    plain_formatter.set_scientific(False)
    plot_axis.xaxis.set_major_formatter(plain_formatter)

In [57]:
# functions to create histogram bins for different analysis types

def create_bins_for_pore_analysis(data_values, padding_fraction=0.20,
                                 target_min_bins=20, target_max_bins=45,
                                 minimum_bin_width=3.0, maximum_total_bins=60):
    """Create bins for pore size analysis - not too many bins, around 20-45 total.

    Parameters
    ----------
    data_values : array-like
        Measurements; non-finite entries are ignored.
    padding_fraction : float
        Fraction of the data range added on each side of the data.
    target_min_bins, target_max_bins : int
        Clamp for the data-driven bin target (3 * sqrt(n)).
    minimum_bin_width : float
        Lower limit for the bin width used to derive the bin count.
    maximum_total_bins : int
        Hard upper limit for the final number of bins.

    Returns
    -------
    (numpy.ndarray, (float, float))
        Evenly spaced bin edges and the (start, end) of the binned span.
        The start is never below zero.

    Raises
    ------
    ValueError
        When ``data_values`` holds no finite value.
    """
    # clean up the data and remove any invalid values
    clean_data = np.asarray(data_values)
    clean_data = clean_data[np.isfinite(clean_data)]
    if clean_data.size == 0:
        raise ValueError("create_bins_for_pore_analysis needs at least one finite value")

    # find the range of the data
    data_min = float(clean_data.min())
    data_max = float(clean_data.max())
    raw_range = data_max - data_min

    # add some padding around the data range
    if raw_range > 0:
        padding_amount = max(1e-9, raw_range) * padding_fraction
    else:
        # All values are identical: a range-based padding would give a span
        # of ~1e-10 and an unreadable axis. Pad so the span holds
        # target_min_bins bins of the minimum width instead.
        padding_amount = max(0.5 * minimum_bin_width * target_min_bins, 1e-9)
    bin_start = max(0.0, data_min - padding_amount)  # don't go below zero
    bin_end = data_max + padding_amount
    total_span = bin_end - bin_start

    # calculate how many bins we want based on data size, within the target range
    target_bins = int(round(np.sqrt(len(clean_data)) * 3.0))
    target_bins = min(max(target_bins, target_min_bins), target_max_bins)

    # figure out bin width: never narrower than the minimum width, nor than
    # the width that would exceed maximum_total_bins
    suggested_width = total_span / target_bins
    max_width_allowed = total_span / maximum_total_bins
    actual_bin_width = max(minimum_bin_width, max_width_allowed, suggested_width)

    # calculate final number of bins (lower limit first, then the hard cap,
    # so the cap wins if the two ever conflict)
    final_bin_count = int(np.ceil(total_span / actual_bin_width))
    if final_bin_count < target_min_bins:
        final_bin_count = target_min_bins
    if final_bin_count > maximum_total_bins:
        final_bin_count = maximum_total_bins

    # create the actual bin edges
    bin_edges = np.linspace(bin_start, bin_end, final_bin_count + 1)

    return bin_edges, (bin_start, bin_end)

def create_bins_for_bubble_analysis(data_values, padding_fraction=0.20,
                                   target_bins=100, min_allowed_bins=70,
                                   max_allowed_bins=120, minimum_bin_width=1.5):
    """Create bins for bubble analysis - more detailed, around 70-120 bins.

    Parameters
    ----------
    data_values : array-like
        Measurements; non-finite entries are ignored.
    padding_fraction : float
        Fraction of the data range added on each side of the data.
    target_bins : int
        Desired number of bins before the width/limit rules are applied.
    min_allowed_bins, max_allowed_bins : int
        Clamp for the final number of bins.
    minimum_bin_width : float
        Lower limit for the bin width used to derive the bin count.

    Returns
    -------
    (numpy.ndarray, (float, float))
        Evenly spaced bin edges and the (start, end) of the binned span.
        The start is never below zero.

    Raises
    ------
    ValueError
        When ``data_values`` holds no finite value.
    """
    # clean up the data and remove any invalid values
    clean_data = np.asarray(data_values)
    clean_data = clean_data[np.isfinite(clean_data)]
    if clean_data.size == 0:
        raise ValueError("create_bins_for_bubble_analysis needs at least one finite value")

    # find the range of the data
    data_min = float(clean_data.min())
    data_max = float(clean_data.max())
    raw_range = data_max - data_min

    # add some padding around the data range
    if raw_range > 0:
        padding_amount = max(1e-9, raw_range) * padding_fraction
    else:
        # All values are identical: a range-based padding would give a span
        # of ~1e-10 and an unreadable axis. Pad so the span holds
        # min_allowed_bins bins of the minimum width instead.
        padding_amount = max(0.5 * minimum_bin_width * min_allowed_bins, 1e-9)
    bin_start = max(0.0, data_min - padding_amount)  # don't go below zero
    bin_end = data_max + padding_amount
    total_span = bin_end - bin_start

    # calculate bin width based on target number of bins
    suggested_bin_width = total_span / target_bins
    actual_bin_width = max(minimum_bin_width, suggested_bin_width)

    # calculate final number of bins (lower limit first, then the upper limit)
    final_bin_count = int(np.ceil(total_span / actual_bin_width))
    if final_bin_count < min_allowed_bins:
        final_bin_count = min_allowed_bins
    if final_bin_count > max_allowed_bins:
        final_bin_count = max_allowed_bins

    # create the actual bin edges
    bin_edges = np.linspace(bin_start, bin_end, final_bin_count + 1)

    return bin_edges, (bin_start, bin_end)

In [59]:
def create_histogram_plot(diameter_values_nm, plot_title, bar_color, save_path, analysis_type):
    """Create and save a histogram plot of pore size data.

    Parameters
    ----------
    diameter_values_nm : array-like
        Values to plot; they are labelled as nanometers on the x-axis.
        Non-finite entries are ignored.
    plot_title : str
        Title text; " (n = <count>)" is appended automatically.
    bar_color : str
        Fill colour of the bars.
    save_path : str
        Destination of the image file (written at 300 dpi).
    analysis_type : str
        "BA" selects the bubble analysis binning, anything else the pore
        analysis binning.

    Returns
    -------
    None
        Nothing is drawn or written when there is no finite value.
    """
    # clean up the data and remove any invalid values
    clean_values = np.asarray(diameter_values_nm)
    clean_values = clean_values[np.isfinite(clean_values)]

    # if no valid data, don't make a plot
    if len(clean_values) == 0:
        return

    # choose the right binning method based on analysis type
    if analysis_type == "BA":
        bin_edges, (x_min, x_max) = create_bins_for_bubble_analysis(clean_values)
    else:
        # default to pore analysis binning
        bin_edges, (x_min, x_max) = create_bins_for_pore_analysis(clean_values)

    # calculate histogram to find the maximum count for y-axis scaling
    bin_counts, _ = np.histogram(clean_values, bins=bin_edges)
    max_count = max(1, bin_counts.max())

    # add some extra space at the top (30% headroom)
    y_padding = max(1, int(np.ceil(max_count * 0.30)))

    # create the plot
    figure, axis = plt.subplots(figsize=(6.8, 4.8))
    try:
        # make the histogram
        axis.hist(clean_values, bins=bin_edges, color=bar_color,
                  edgecolor="black", linewidth=0.6, alpha=0.85)

        # add labels and title
        axis.set_xlabel("Pore Size (nm)")
        axis.set_ylabel("Frequency")

        # create title with sample count
        full_title = plot_title + " (n = " + str(len(clean_values)) + ")"
        axis.set_title(full_title)

        # make the numbers on x-axis look normal (not scientific notation)
        make_axis_numbers_plain(axis)

        # set the axis limits
        axis.set_xlim(x_min, x_max)
        axis.set_ylim(0, max_count + y_padding)

        # make it look nice and save
        figure.tight_layout()
        figure.savefig(save_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, also when drawing or saving fails;
        # otherwise every failed file leaves an open figure behind.
        plt.close(figure)

In [61]:
# functions to clean up filenames and find CSV files

def remove_hidden_characters(text_string):
    """Return the text without invisible unicode characters.

    The text is NFKC-normalised, then every character whose unicode category
    starts with 'C' (control, format, ...) or is a line / paragraph separator
    ('Zl', 'Zp') is dropped. Leading and trailing whitespace is stripped from
    the result.
    """
    hidden_categories = ('C', 'Zl', 'Zp')
    visible_characters = [
        character
        for character in unicodedata.normalize('NFKC', text_string)
        if not unicodedata.category(character).startswith(hidden_categories)
    ]
    return ''.join(visible_characters).strip()

def find_csv_files_in_folder(folder_path):
    """Collect the full paths of all CSV-looking entries in a folder.

    An entry qualifies when its name, after hidden characters are removed,
    ends with ".csv", or when the extension of its raw name starts with
    ".csv" (both case-insensitive). "desktop.ini" is always skipped. If the
    folder cannot be listed a warning is printed and an empty list returned.
    The returned paths use the entry names exactly as listed.
    """
    try:
        entries = os.listdir(folder_path)
    except Exception as error:
        print("Warning: Could not list files in " + folder_path + ": " + str(error))
        return []

    matches = []
    for entry in entries:
        # skip system files
        if entry.lower() == "desktop.ini":
            continue

        visible_name = remove_hidden_characters(entry)
        raw_extension = os.path.splitext(entry)[1]

        looks_like_csv = visible_name.lower().endswith(".csv")
        # catches names with trailing junk after the extension
        has_csv_like_extension = raw_extension.lower().startswith(".csv")

        if looks_like_csv or has_csv_like_extension:
            matches.append(os.path.join(folder_path, entry))

    return matches

In [63]:
# process all the data folders and create histogram plots
#
# For every input folder: decide the analysis mode, list its CSV files, and
# write one histogram PNG per file into the matching output folder. A failure
# on one file is reported and does not stop the remaining files.

saved_plot_files = []

# folder names too generic to be a useful plot label; the parent name is used instead
generic_folder_names = ["STATS", "DIAMETERS", "MATLAB STATS", "PORE SIZE RESULTS"]

for data_folder in data_folders:
    # clean up the folder path (str() so pathlib.Path entries work too)
    clean_folder_path = os.path.normpath(str(data_folder).strip().rstrip("."))

    # check if this folder actually exists
    if not os.path.isdir(clean_folder_path):
        print("Skipping - not a valid folder: " + clean_folder_path)
        continue

    # figure out what type of analysis this is; everything that is not
    # bubble analysis is handled as pore analysis
    if check_if_bubble_analysis(clean_folder_path):
        analysis_mode = "BA"
    else:
        analysis_mode = "AP"

    print("\n[" + analysis_mode + "] Looking for files in: " + clean_folder_path)

    # show what files are in this folder
    try:
        folder_contents = os.listdir(clean_folder_path)
        for file_name in folder_contents:
            print("   - " + repr(file_name))
    except Exception as error:
        print("   Warning: Could not list folder contents: " + str(error))

    # find all the CSV files in this folder
    csv_file_list = find_csv_files_in_folder(clean_folder_path)
    if len(csv_file_list) == 0:
        print("Info: No CSV files found in this folder")
        continue

    # figure out where to save the output plots
    if analysis_mode == "BA":
        output_folder = str(output_bubble_folder)
    else:
        output_folder = str(output_pore_folder)

    # make sure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # create a label for the plots based on the folder name
    folder_basename = os.path.basename(clean_folder_path)
    parent_folder_name = os.path.basename(os.path.dirname(clean_folder_path))

    # use parent folder name if current folder is a generic stats folder
    if folder_basename.upper() in generic_folder_names:
        base_label = parent_folder_name
    else:
        base_label = folder_basename

    plot_label_prefix = base_label + " [" + analysis_mode + "]"

    print("\nProcessing " + str(len(csv_file_list)) + " file(s) -> Output folder: " + output_folder)

    # process each CSV file
    for csv_file_path in csv_file_list:
        try:
            # load the pore size data from this file
            pore_values, original_units = load_pore_sizes_from_file(csv_file_path)

            # convert everything to nanometers
            diameter_nm = convert_to_nanometers(pore_values, original_units)

            # create_histogram_plot silently writes nothing when there is no
            # finite value, so such a file must not be reported as saved
            finite_count = int(np.isfinite(np.asarray(diameter_nm, dtype=float)).sum())
            if finite_count == 0:
                print("  Skipped: " + os.path.basename(csv_file_path) + ": no valid pore size values found")
                continue

            # figure out what analysis method was used
            analysis_method = figure_out_analysis_method(csv_file_path)

            # pick the right color for this method (dark palette for bubble analysis)
            use_dark_colors = (analysis_mode == "BA")
            plot_color = get_color_for_method(analysis_method, use_dark_colors)

            # create the plot title
            full_plot_title = plot_label_prefix + " " + analysis_method

            # create the output filename
            output_filename = analysis_mode + "_" + analysis_method + "_hist_nm.png"
            output_file_path = os.path.join(output_folder, output_filename)

            # create and save the histogram plot
            create_histogram_plot(diameter_nm, full_plot_title, plot_color, output_file_path, analysis_mode)

            # keep track of what we saved
            saved_plot_files.append(output_file_path)

            print("  Success: " + os.path.basename(csv_file_path) + " -> " + os.path.basename(output_file_path) + " (n=" + str(len(diameter_nm)) + ")")

        except Exception as error:
            # best effort: report the problem and carry on with the next file
            print("  Failed: " + os.path.basename(csv_file_path) + ": " + str(error))

print("\nFinished processing. Saved plot files:")
for saved_file in saved_plot_files:
    print(" - " + saved_file)


[AP] Looking for files in: C:\Users\walsh\Downloads\CRYO-SEM Accuracy INTERNAL\CRYO-SEM X30000\CRYO-SEM X30000 [1]\CRYO-SEM X30000 [1] STATS\PORE SIZE RESULTS
   - '60% AECD Results.csv'
   - 'FREEHAND AECD Results.csv'
   - 'GOLD STANDARD [X30000] Results AECD Results.csv'
   - 'ILASTIK [X30000] Results AECD Results.csv'
   - 'OTSU AECD Results.csv'
   - 'OVAL [X30000] Results AECD Results.csv'
   - 'PLANKSTER [X30000] Results AECD Results.csv'
   - 'PORE SIZE DISTRIBUTION'
   - 'PORED2 [X30000] Results AECD Results.csv'
   - 'SAMJ [X30000] Results AECD Results.csv'
   - 'SEMI [X30000] Results AECD Results.csv'
   - 'UNET [X30000] Results AECD Results.csv'

Processing 11 file(s) -> Output folder: C:/Users/walsh/Documents/GitHub/AGAROSE-HYDROGEL-TRENDS-USING-AI-ML/PORE SIZE DISTRIBUTIONS/ANALYSED PRIOR
  Success: 60% AECD Results.csv -> AP_60%_hist_nm.png (n=55)
  Success: FREEHAND AECD Results.csv -> AP_FREEHAND_hist_nm.png (n=59)
  Success: GOLD STANDARD [X30000] Results AECD Result