# Python - Simple

### CSV Results Summarizer

#### This notebook cell finds all CSV files matching the glob pattern `results_*_[0-9]*.csv` in the specified directory,
#### extracts the matrix size from the filename, calculates the mean and sample standard deviation of the
#### 'Time Taken (s)' column for each file, and prints a LaTeX-ready summary line in the format
#### `$mean1 \pm std1$ & $mean2 \pm std2$ & ...`, sorted by matrix size.

In [2]:
import pandas as pd
import numpy as np
import os
import glob
import re
import warnings
import math
from decimal import Decimal, ROUND_HALF_UP, getcontext
from typing import List, Tuple, Dict, Optional, Set

# --- Helper Function for Formatting ---
# (Keep this function as is, it's complex but self-contained)
def format_mean_std(mean: float, std: float, default_precision: int = 6) -> Tuple[str, str]:
    """
    Format a mean and its standard deviation with matching decimal places.

    The standard deviation is rounded half-up to one significant figure, or to
    two when its leading digit is 1. The mean is then rounded half-up to the
    same decimal position, so both strings end at the same digit.

    Args:
        mean: The mean value.
        std: The standard deviation.
        default_precision: Precision used when ``std`` is zero, and for the
            ``g``-format fallback when dynamic rounding fails (for example
            when ``std`` is negative).

    Returns:
        A ``(formatted_mean, formatted_std)`` tuple of strings. Both are
        ``"NaN"`` when either input is not finite.

    Warns:
        UserWarning: For non-finite inputs and whenever the fallback
            formatting is used.
    """
    if not np.isfinite(mean) or not np.isfinite(std):
        warnings.warn(f"Non-finite input detected: mean={mean}, std={std}. Returning 'NaN'.")
        return "NaN", "NaN"

    # std == 0 carries no precision information, so the mean is formatted on
    # its own and the zero is padded to the same number of decimals.
    if std == 0:
        mean_dec = Decimal(str(mean))
        if mean_dec == mean_dec.to_integral_value():
            return f"{int(mean)}", "0"

        if abs(mean) < 1e-4 and mean != 0:
            # 'g' lets very small means switch to scientific notation.
            mean_fmt = f"{mean:.{default_precision}g}"
        else:
            mean_fmt = f"{mean:.{default_precision}f}"

        if 'e' in mean_fmt or 'E' in mean_fmt or '.' not in mean_fmt:
            # No fixed-point decimals to mirror (scientific or integer-like text).
            return mean_fmt, "0"
        num_decimals = len(mean_fmt.split('.')[-1])
        return mean_fmt, f"{0.0:.{num_decimals}f}"

    try:
        # Going through str() keeps the shortest decimal representation of the
        # float instead of its full binary expansion.
        std_dec = Decimal(str(std))
        if std_dec <= 0:
            raise ValueError("Standard deviation must be positive.")

        # The first coefficient digit of a non-zero Decimal is its leading
        # significant digit.
        leading_digit = std_dec.as_tuple().digits[0]
        sig_figs = 2 if leading_digit == 1 else 1

        # adjusted() is the exact base-10 exponent of the leading digit. Using
        # it instead of floor(log10(std)) avoids float rounding just below a
        # power of ten (e.g. 999.9999999999999 being treated as 10**3).
        rounding_exponent = std_dec.adjusted() - (sig_figs - 1)
        quantizer = Decimal(f"1e{rounding_exponent}")

        rounded_std = std_dec.quantize(quantizer, rounding=ROUND_HALF_UP)
        rounded_mean = Decimal(str(mean)).quantize(quantizer, rounding=ROUND_HALF_UP)

        # A non-negative rounding exponent (tens, hundreds, ...) prints with
        # zero decimals.
        decimal_places = max(0, -rounding_exponent)
        return f"{rounded_mean:.{decimal_places}f}", f"{rounded_std:.{decimal_places}f}"

    except ValueError as ve:
        warnings.warn(f"Value error during dynamic rounding for mean={mean}, std={std}: {ve}. Falling back to default precision.")
        return f"{mean:.{default_precision}g}", f"{std:.{default_precision}g}"
    except Exception as e:
        # Decimal can raise InvalidOperation when quantize() exceeds the
        # context precision; degrade to plain formatting rather than failing.
        warnings.warn(f"Unexpected error during dynamic rounding for mean={mean}, std={std}: {type(e).__name__} - {e}. Falling back to default precision.")
        return f"{mean:.{default_precision}g}", f"{std:.{default_precision}g}"


# --- Main Processing Function ---
def process_and_format_results(
    results_folder: str,
    file_pattern: str = 'results_*_[0-9]*.csv',
    time_column_name: str = 'Time Taken (s)',
    include_sizes: Optional[Set[int]] = None,
    min_size_to_include: Optional[int] = 256, # Based on your original filtering logic
    default_output_precision: int = 6,
    output_separator: str = " & ",
    verbose: bool = True,
    print_summary: bool = True,
    print_output: bool = True
) -> Tuple[str, Dict[int, Tuple[float, float]], List[str], List[str]]:
    r"""
    Processes CSV result files, calculates mean/std, formats them, and returns results.

    Args:
        results_folder (str): Path to the directory containing the CSV files.
        file_pattern (str): Glob pattern (relative to results_folder) used to find
                            result files. The matrix size is not taken from this
                            pattern: it is always read from the digits that directly
                            precede '.csv' in the file name (regex '_(\d+)\.csv$').
        time_column_name (str): Name of the column containing time data in the CSVs.
        include_sizes (Optional[Set[int]]): A set of specific matrix sizes to include.
                                             If None, size filtering is based only on
                                             min_size_to_include.
        min_size_to_include (Optional[int]): Minimum matrix size to include in the
                                             final output. If None, no minimum size filter.
        default_output_precision (int): Fallback precision passed to format_mean_std.
        output_separator (str): Separator used to join the formatted results for the final string.
        verbose (bool): If True, print detailed processing steps and warnings.
        print_summary (bool): If True, print the processing summary (counts of files).
        print_output (bool): If True, print the final formatted output string.

    Returns:
        Tuple containing:
        - str: The final formatted output string ('$mean \pm std$' entries joined by
          output_separator, ordered by matrix size). Empty when nothing was processed.
        - Dict[int, Tuple[float, float]]: Dictionary mapping matrix size to (raw mean, raw std).
        - List[str]: List of base filenames successfully processed.
        - List[str]: List of base filenames skipped (with reason).
        When no file matches, or the file search fails, ("", {}, [], []) is returned.
    """
    results: Dict[int, Tuple[float, float]] = {}
    processed_files: List[str] = []
    skipped_files: List[str] = []

    search_path = os.path.join(results_folder, file_pattern)

    try:
        csv_files = glob.glob(search_path)
    except Exception as e:
        print(f"Error during file search with pattern '{search_path}': {e}")
        return "", {}, [], []

    if not csv_files:
        print(f"Warning: No files found matching the pattern '{search_path}' in directory '{os.path.abspath(results_folder)}'")
        return "", {}, [], []

    if verbose:
        print(f"Found {len(csv_files)} potential result files matching '{file_pattern}' in '{results_folder}'...")

    # The size is the run of digits between the last '_' and the '.csv' suffix.
    size_regex = re.compile(r'_(\d+)\.csv$')

    for filepath in csv_files:
        base_filename = os.path.basename(filepath)
        if verbose:
            print(f"Processing '{base_filename}'...")

        match = size_regex.search(base_filename)
        if not match:
            if verbose:
                print(f"  Warning: Could not extract matrix size number from filename '{base_filename}'. Skipping.")
            skipped_files.append(f"{base_filename} (no size match)")
            continue

        try:
            matrix_size = int(match.group(1))
        except ValueError:
            if verbose:
                print(f"  Warning: Extracted size '{match.group(1)}' is not a valid integer in filename '{base_filename}'. Skipping.")
            skipped_files.append(f"{base_filename} (invalid size '{match.group(1)}')")
            continue

        # --- Size filtering (done before the file is read) ---
        if include_sizes is not None and matrix_size not in include_sizes:
            if verbose:
                print(f"  Info: Skipping size {matrix_size} as it's not in the include_sizes set.")
            skipped_files.append(f"{base_filename} (size {matrix_size} not in include_sizes)")
            continue
        if min_size_to_include is not None and matrix_size < min_size_to_include:
            if verbose:
                print(f"  Info: Skipping size {matrix_size} as it's below min_size_to_include ({min_size_to_include}).")
            skipped_files.append(f"{base_filename} (size {matrix_size} < {min_size_to_include})")
            continue

        try:
            df = pd.read_csv(filepath)

            # --- Data validation ---
            if df.empty:
                if verbose:
                    print(f"  Warning: File '{base_filename}' is empty. Skipping.")
                skipped_files.append(f"{base_filename} (empty file)")
                continue
            if time_column_name not in df.columns:
                if verbose:
                    print(f"  Error: Column '{time_column_name}' not found in '{base_filename}'. Skipping.")
                skipped_files.append(f"{base_filename} (missing column '{time_column_name}')")
                continue

            # --- Calculation ---
            # Non-numeric cells become NaN so they can be counted and dropped.
            times = pd.to_numeric(df[time_column_name], errors='coerce')
            num_original = len(times)
            num_null_before = times.isnull().sum()

            if num_null_before == num_original:
                if verbose:
                    print(f"  Error: Column '{time_column_name}' in '{base_filename}' contains no valid numeric data. Skipping.")
                skipped_files.append(f"{base_filename} (no numeric data in column)")
                continue
            elif num_null_before > 0:
                if verbose:
                    print(f"  Warning: Found and ignored {num_null_before} non-numeric value(s) in '{time_column_name}' column of '{base_filename}'.")

            times = times.dropna()
            if times.empty:
                if verbose:
                    print(f"  Warning: After handling non-numeric values, no valid time data remains in '{base_filename}'. Skipping.")
                skipped_files.append(f"{base_filename} (no valid data after dropna)")
                continue

            mean_time = times.mean()
            # Sample standard deviation (ddof=1); it is undefined for a single
            # point, so that case is reported as 0.0.
            if len(times) > 1:
                std_dev_time = times.std(ddof=1)
                if pd.isna(std_dev_time):
                    std_dev_time = 0.0
                    if verbose:
                        print(f"  Info: Std deviation calculated as NaN for size {matrix_size}. Setting to 0.0.")
            else:
                std_dev_time = 0.0
                if verbose:
                    print(f"  Info: Only one data point found for size {matrix_size}. Standard deviation set to 0.0.")

            # --- Store raw results (a later file with the same size wins) ---
            if matrix_size in results:
                if verbose:
                    print(f"  Warning: Duplicate matrix size {matrix_size} found (from '{base_filename}'). Overwriting previous result.")

            results[matrix_size] = (float(mean_time), float(std_dev_time))
            processed_files.append(base_filename)

        except pd.errors.EmptyDataError:
            if verbose:
                print(f"  Warning: File '{base_filename}' is empty or contains only headers (pandas EmptyDataError). Skipping.")
            skipped_files.append(f"{base_filename} (pandas EmptyDataError)")
        except FileNotFoundError:
            # glob just listed the file, so this means it vanished in between.
            print(f"  Error: File '{filepath}' vanished before processing? Skipping.")
            skipped_files.append(f"{base_filename} (FileNotFoundError during processing)")
        except Exception as e:
            # Best-effort batch run: one unreadable file must not abort the rest.
            print(f"  Error processing file '{base_filename}': {type(e).__name__} - {e}. Skipping.")
            skipped_files.append(f"{base_filename} (processing error: {type(e).__name__})")

    # --- Final output generation ---
    if print_summary:
        print("\n--- Processing Summary ---")
        print(f"Searched in: {os.path.abspath(results_folder)}")
        print(f"Using pattern: {file_pattern}")
        print(f"Attempted to process {len(csv_files)} files.")
        print(f"Successfully processed data for {len(results)} unique matrix sizes from {len(processed_files)} files.")
        if skipped_files:
            print(f"Skipped {len(skipped_files)} files/entries.")
            if verbose or len(skipped_files) < 10:
                print("  Skipped items list:", skipped_files)
            else:
                print(f"  Example skipped items: {skipped_files[:5]}... (list truncated)")
        print("--- End Summary ---")

    final_output = ""
    if not results:
        print("\nNo valid data was processed. Cannot generate output line.")
    else:
        sorted_sizes = sorted(results.keys())

        output_parts = []
        if verbose:
            print("\n--- Applying Scientific Dynamic Rounding (std determines mean precision) ---")

        for size in sorted_sizes:
            mean, std = results[size]
            try:
                formatted_mean, formatted_std = format_mean_std(mean, std, default_output_precision)
                # LaTeX math-mode entry: $mean \pm std$
                formatted_result = f"${formatted_mean} \\pm {formatted_std}$"
                output_parts.append(formatted_result)
                if verbose:
                    print(f"  Size {size}: Raw=({mean:.6g} ± {std:.6g}) -> Formatted={formatted_result}")
            except Exception as e:
                print(f"  Error formatting result for size {size} (Mean={mean}, Std={std}): {type(e).__name__} - {e}. Adding placeholder.")
                # Keep the column count stable by emitting a visible placeholder.
                output_parts.append("$ERR \\pm ERR$")

        final_output = output_separator.join(output_parts)

        if print_output:
            print("\n--- Results ---")
            # Size filtering already happened while reading files, so every
            # stored size appears in the output line.
            included_sizes_in_output = sorted_sizes
            print("Matrix Sizes Included in Output:", included_sizes_in_output)
            print(f"\nFinal Output Line (using separator '{output_separator}'):")
            print(final_output)

    return final_output, results, processed_files, skipped_files

  group to capture the matrix size number (e.g., '_(\d+)\.csv$').


In [3]:
# Benchmark result directories, one per implementation / optimisation level.
results_dir1 = '../python/basic'
results_dir2 = '../python/numpy'
results_dir3 = '../C/DGEMM_basic_O0'
results_dir4 = '../C/DGEMM_basic_O1'
results_dir5 = '../C/DGEMM_basic_O2'
results_dir6 = '../C/DGEMM_basic_O3'


# --- Basic Call (using most defaults) ---
# Summarize each directory exactly once, in order; the summary and the result
# line are printed by the call itself. After the loop the four variables hold
# the values for the last directory.
for benchmark_dir in (
    results_dir1,
    results_dir2,
    results_dir3,
    results_dir4,
    results_dir5,
    results_dir6,
):
    latex_string, raw_data, processed, skipped = process_and_format_results(benchmark_dir, verbose=False)



--- Processing Summary ---
Searched in: c:\Duds\DuduClean\ArquivosVScodePastaManual\UFRJ\periodo3\ArqComp\Pratica\DGEMM\python\basic
Using pattern: results_*_[0-9]*.csv
Attempted to process 13 files.
Successfully processed data for 6 unique matrix sizes from 6 files.
Skipped 7 files/entries.
  Skipped items list: ['results_python_128.csv (size 128 < 256)', 'results_python_16.csv (size 16 < 256)', 'results_python_2.csv (size 2 < 256)', 'results_python_32.csv (size 32 < 256)', 'results_python_4.csv (size 4 < 256)', 'results_python_64.csv (size 64 < 256)', 'results_python_8.csv (size 8 < 256)']
--- End Summary ---

--- Results ---
Matrix Sizes Included in Output: [256, 512, 1024, 2048, 4096, 8192]

Final Output Line (using separator ' & '):
$3.69 \pm 0.05$ & $30.2 \pm 0.4$ & $248 \pm 3$ & $2350 \pm 30$ & $19710 \pm 40$ & $173900 \pm 900$

--- Processing Summary ---
Searched in: c:\Duds\DuduClean\ArquivosVScodePastaManual\UFRJ\periodo3\ArqComp\Pratica\DGEMM\python\numpy
Using pattern: res

In [4]:
# Directory holding the result CSVs for the DGEMM_SIMD variant.
results_dir = '../C/DGEMM_SIMD'


# --- Basic Call (using most defaults) ---
# verbose=False hides the per-file messages; the processing summary and the
# final output line are still printed (print_summary / print_output default to True).
latex_string, raw_data, processed, skipped = process_and_format_results(results_dir, verbose=False)


--- Processing Summary ---
Searched in: c:\Duds\DuduClean\ArquivosVScodePastaManual\UFRJ\periodo3\ArqComp\Pratica\DGEMM\C\DGEMM_SIMD
Using pattern: results_*_[0-9]*.csv
Attempted to process 12 files.
Successfully processed data for 6 unique matrix sizes from 6 files.
Skipped 6 files/entries.
  Skipped items list: ['results_c_128.csv (size 128 < 256)', 'results_c_16.csv (size 16 < 256)', 'results_c_32.csv (size 32 < 256)', 'results_c_4.csv (size 4 < 256)', 'results_c_64.csv (size 64 < 256)', 'results_c_8.csv (size 8 < 256)']
--- End Summary ---

--- Results ---
Matrix Sizes Included in Output: [256, 512, 1024, 2048, 4096, 8192]

Final Output Line (using separator ' & '):
$0.00218 \pm 0.00003$ & $0.034 \pm 0.002$ & $0.55 \pm 0.03$ & $6.5 \pm 1.4$ & $175.5 \pm 0.3$ & $1757 \pm 2$


In [5]:
# Directory holding the result CSVs for the DGEMM_unrolling variant.
results_dir = '../C/DGEMM_unrolling'
# --- Basic Call (using most defaults) ---
# verbose=False hides the per-file messages; the processing summary and the
# final output line are still printed (print_summary / print_output default to True).
latex_string, raw_data, processed, skipped = process_and_format_results(results_dir, verbose=False)


--- Processing Summary ---
Searched in: c:\Duds\DuduClean\ArquivosVScodePastaManual\UFRJ\periodo3\ArqComp\Pratica\DGEMM\C\DGEMM_unrolling
Using pattern: results_*_[0-9]*.csv
Attempted to process 10 files.
Successfully processed data for 6 unique matrix sizes from 6 files.
Skipped 4 files/entries.
  Skipped items list: ['results_c_128.csv (size 128 < 256)', 'results_c_16.csv (size 16 < 256)', 'results_c_32.csv (size 32 < 256)', 'results_c_64.csv (size 64 < 256)']
--- End Summary ---

--- Results ---
Matrix Sizes Included in Output: [256, 512, 1024, 2048, 4096, 8192]

Final Output Line (using separator ' & '):
$0.00129 \pm 0.00004$ & $0.0127 \pm 0.0009$ & $0.134 \pm 0.013$ & $2.41 \pm 0.17$ & $49.21 \pm 0.09$ & $431.6 \pm 0.2$


In [6]:
# Directory holding the result CSVs for the DGEMM_blocking variant.
results_dir = '../C/DGEMM_blocking'
# --- Basic Call (using most defaults) ---
# verbose=False hides the per-file messages; the processing summary and the
# final output line are still printed (print_summary / print_output default to True).
latex_string, raw_data, processed, skipped = process_and_format_results(results_dir, verbose=False)


--- Processing Summary ---
Searched in: c:\Duds\DuduClean\ArquivosVScodePastaManual\UFRJ\periodo3\ArqComp\Pratica\DGEMM\C\DGEMM_blocking
Using pattern: results_*_[0-9]*.csv
Attempted to process 9 files.
Successfully processed data for 6 unique matrix sizes from 6 files.
Skipped 3 files/entries.
  Skipped items list: ['results_c_128.csv (size 128 < 256)', 'results_c_32.csv (size 32 < 256)', 'results_c_64.csv (size 64 < 256)']
--- End Summary ---

--- Results ---
Matrix Sizes Included in Output: [256, 512, 1024, 2048, 4096, 8192]

Final Output Line (using separator ' & '):
$0.00108 \pm 0.00006$ & $0.0093 \pm 0.0002$ & $0.076 \pm 0.003$ & $0.646 \pm 0.006$ & $6.725 \pm 0.017$ & $56.63 \pm 0.06$
