### Test Summary

This notebook demonstrates comprehensive testing of the CSV File Inspector script with various options:

1. Basic validation (non-existent file test)
2. Default information output
3. Column uniqueness report
4. Field listing
5. Sample rows display
6. Statistical analysis of fields
7. Unique values for specific columns
8. Output format tests (txt, markdown, LaTeX)
9. Column type separation
10. Grouped summary tables with different aggregation methods
11. Combined options for comprehensive analysis

The tests verify that all major functionalities of the script work correctly and produce the expected output formats.

### Dependencies

In [1]:
import subprocess
import os

from library import ROOT, is_valid_file, is_valid_directory

### Set constants

In [2]:
SCRIPT_PATH = os.path.join(ROOT, "core/src/utils/inspect_csv_file.py")
if not is_valid_file(SCRIPT_PATH):
    raise ValueError("Invalid Python script path.")

MOCK_CSV_PATH = os.path.join(
    ROOT,
    "core/tests/unit_tests/mock_data/valid_csv_files",
    "KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv",
)
if not is_valid_file(MOCK_CSV_PATH):
    raise ValueError("Invalid mock .csv script path.")

# Create output directory if it doesn't exist
OUTPUT_DIRECTORY = os.path.join(
    ROOT,
    "core/tests/integration_tests/test_inspect_csv_file_output",
)
if not is_valid_directory(OUTPUT_DIRECTORY):
    raise ValueError("Invalid mock .csv script path.")

### Helper function to run commands

In [3]:
def run_command(cmd, description):
    """Helper function to run a command and display output with a description"""
    print(f"\n{'='*80}")
    print(f"TESTING: {description}")
    print(f"COMMAND: {' '.join(cmd)}")
    print(f"{'='*80}\n")

    # Run the command and capture real-time output
    process = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True
    )

    # Print output in real-time
    for line in process.stdout:
        print(line, end="")

    # Print any errors
    stdout, stderr = process.communicate()
    if stderr:
        print("\nERRORS:")
        print(stderr)

    # Check exit code
    print()
    print("-" * 50)
    if process.returncode==0:
        print(f"✅ Test '{description}' passed!")
    else:
        print(f"❌ Test '{description}' failed!")
    print("-" * 50)

### Test non-existent .csv file

In [4]:
# Define paths
nonexistent_csv_path = "/path/to/nonexistent/file.csv"  # This path doesn't exist

# Run the script with the non-existent file
cmd = ["python", SCRIPT_PATH, "-csv", nonexistent_csv_path]

# Execute the command
result = subprocess.run(cmd, capture_output=True, text=True)

# Check if the error was properly returned
print(f"Return code: {result.returncode}")
print("\nSTDERR:")
print(result.stderr)
print("\nSTDOUT:")
print(result.stdout)

# Verify the script exited with a non-zero status for the error case
assert (
    result.returncode != 0
), "Script should fail with non-zero exit code for non-existent file"
print("✅ Test passed: Script correctly detected non-existent file")

Return code: 2

STDERR:
Usage: inspect_csv_file.py [OPTIONS]
Try 'inspect_csv_file.py --help' for help.

Error: Invalid value for '-csv' / '--csv_file_path': File '/path/to/nonexistent/file.csv' does not exist.


STDOUT:

✅ Test passed: Script correctly detected non-existent file


### Test default information output:

In [5]:
cmd = [
    "python",
    SCRIPT_PATH,
    "-csv",
    MOCK_CSV_PATH,
    "-out",
    OUTPUT_DIRECTORY,
    "-v",
    "--output_filename",
    "test_default_information_output",
]

run_command(cmd, "Display default information output")


TESTING: Display default information output
COMMAND: python /nvme/h/cy22sg1/qpb_data_analysis/core/src/utils/inspect_csv_file.py -csv /nvme/h/cy22sg1/qpb_data_analysis/core/tests/unit_tests/mock_data/valid_csv_files/KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv -out /nvme/h/cy22sg1/qpb_data_analysis/core/tests/integration_tests/test_inspect_csv_file_output -v --output_filename test_default_information_output


File Details
Successfully loaded CSV file: 'KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv' 
from directory: '/nvme/h/cy22sg1/qpb_data_analysis/core/tests/unit_tests/mock_data/valid_csv_files'.

Basic Information
DataFrame Shape: 48 rows × 29 columns
Columns with empty values:
  Adjusted_average_core_hours_per_spinor: 39 rows with empty values
Checking for unusual data types:
  No unusual data types detected in the CSV file.
   -- Summary of the 'KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv' CSV file generat

### Test uniqueness report

In [6]:
cmd = [
    "python",
    SCRIPT_PATH,
    "-csv",
    MOCK_CSV_PATH,
    "-out",
    OUTPUT_DIRECTORY,
    "--uniqueness_report",
    "--output_filename",
    "test_uniqueness_report",
]

run_command(cmd, "Show uniqueness report")


TESTING: Show uniqueness report
COMMAND: python /nvme/h/cy22sg1/qpb_data_analysis/core/src/utils/inspect_csv_file.py -csv /nvme/h/cy22sg1/qpb_data_analysis/core/tests/unit_tests/mock_data/valid_csv_files/KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv -out /nvme/h/cy22sg1/qpb_data_analysis/core/tests/integration_tests/test_inspect_csv_file_output --uniqueness_report --output_filename test_uniqueness_report

   -- Summary of the 'KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv' CSV file generated.

--------------------------------------------------
✅ Test 'Show uniqueness report' passed!
--------------------------------------------------


### Test list fields

In [7]:
cmd = [
    "python",
    SCRIPT_PATH,
    "-csv",
    MOCK_CSV_PATH,
    "-out",
    OUTPUT_DIRECTORY,
    "--list_fields",
    "--output_filename",
    "test_list_fields",
]

run_command(cmd, "Show list of fields")


TESTING: Show list of fields
COMMAND: python /nvme/h/cy22sg1/qpb_data_analysis/core/src/utils/inspect_csv_file.py -csv /nvme/h/cy22sg1/qpb_data_analysis/core/tests/unit_tests/mock_data/valid_csv_files/KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv -out /nvme/h/cy22sg1/qpb_data_analysis/core/tests/integration_tests/test_inspect_csv_file_output --list_fields --output_filename test_list_fields

   -- Summary of the 'KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv' CSV file generated.

--------------------------------------------------
✅ Test 'Show list of fields' passed!
--------------------------------------------------


### Test sample rows

In [8]:
cmd = [
    "python",
    SCRIPT_PATH,
    "-csv",
    MOCK_CSV_PATH,
    "-out",
    OUTPUT_DIRECTORY,
    "--sample_rows",
    "5",  # Show 5 sample rows
    "--output_filename",
    "test_sample_rows",
]

run_command(cmd, "Sample rows display")


TESTING: Sample rows display
COMMAND: python /nvme/h/cy22sg1/qpb_data_analysis/core/src/utils/inspect_csv_file.py -csv /nvme/h/cy22sg1/qpb_data_analysis/core/tests/unit_tests/mock_data/valid_csv_files/KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv -out /nvme/h/cy22sg1/qpb_data_analysis/core/tests/integration_tests/test_inspect_csv_file_output --sample_rows 5 --output_filename test_sample_rows

   -- Summary of the 'KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv' CSV file generated.

--------------------------------------------------
✅ Test 'Sample rows display' passed!
--------------------------------------------------


### Test field statistics

In [9]:
cmd = [
    "python",
    SCRIPT_PATH,
    "-csv",
    MOCK_CSV_PATH,
    "-out",
    OUTPUT_DIRECTORY,
    "--field_statistics",
    "--output_filename",
    "test_field_statistics",
]

run_command(cmd, "Field statistics")


TESTING: Field statistics
COMMAND: python /nvme/h/cy22sg1/qpb_data_analysis/core/src/utils/inspect_csv_file.py -csv /nvme/h/cy22sg1/qpb_data_analysis/core/tests/unit_tests/mock_data/valid_csv_files/KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv -out /nvme/h/cy22sg1/qpb_data_analysis/core/tests/integration_tests/test_inspect_csv_file_output --field_statistics --output_filename test_field_statistics

   -- Summary of the 'KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv' CSV file generated.

--------------------------------------------------
✅ Test 'Field statistics' passed!
--------------------------------------------------


### Test show unique values for a specific column

In [10]:
# First, let's list the fields to choose one for unique values test
fields_cmd = [
    "python",
    SCRIPT_PATH,
    "-csv",
    MOCK_CSV_PATH,
    "--list_fields",
]

print("Fetching column names to use for unique values test:\n")
fields_process = subprocess.run(fields_cmd, capture_output=True, text=True)
print(fields_process.stdout)

# Let's assume we've identified a column name from the output above
# Replace 'column_name' with an actual column name from your CSV
column_name = "m"  # Update this with an actual column from your data

cmd = [
    "python",
    SCRIPT_PATH,
    "-csv",
    MOCK_CSV_PATH,
    "-out",
    OUTPUT_DIRECTORY,
    "--show_unique_values",
    column_name,
    "--output_filename",
    "test_unique_values",
]

run_command(cmd, f"Show unique values for column '{column_name}'")

Fetching column names to use for unique values test:

Successfully loaded CSV file: 'KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv' 
from directory: '/nvme/h/cy22sg1/qpb_data_analysis/core/tests/unit_tests/mock_data/valid_csv_files'.
DataFrame Shape: 48 rows × 29 columns
Columns with empty values:
  Adjusted_average_core_hours_per_spinor: 39 rows with empty values
  No unusual data types detected in the CSV file.

Columns
Tunable Parameters:
  APE_alpha
  APE_iterations
  Bare_mass
  CG_epsilon
  Clover_coefficient
  Configuration_label
  KL_diagonal_order
  KL_scaling_factor
  Kernel_operator_type
  MPI_geometry
  MSCG_epsilon
  Main_program_type
  Number_of_spinors
  Number_of_vectors
  Overlap_operator_method
  QCD_beta_value
  Rho_value
  Threads_per_process
Output Quantities:
  Adjusted_average_core_hours_per_spinor
  Average_core_hours_per_spinor
  Average_number_of_MSCG_iterations_per_spinor
  Average_number_of_MV_multiplications_per_spinor
  Average_wall

### Test different output formats (Markdown)

In [11]:
cmd = [
    "python",
    SCRIPT_PATH,
    "-csv",
    MOCK_CSV_PATH,
    "-out",
    OUTPUT_DIRECTORY,
    "--output_format",
    "md",  # Use markdown format
    "--sample_rows",
    "3",
    "--field_statistics",
    "--output_filename",
    "test_markdown_output",
]

run_command(cmd, "Output in Markdown format")

# Display the content of the generated markdown file
md_file = os.path.join(OUTPUT_DIRECTORY, "test_markdown_output_summary.md")
if os.path.exists(md_file):
    print("\nGenerated Markdown file contents:")
    print("-" * 50)
    with open(md_file, "r") as f:
        print(f.read())
else:
    print(f"\nWarning: Could not find expected markdown file at {md_file}")


TESTING: Output in Markdown format
COMMAND: python /nvme/h/cy22sg1/qpb_data_analysis/core/src/utils/inspect_csv_file.py -csv /nvme/h/cy22sg1/qpb_data_analysis/core/tests/unit_tests/mock_data/valid_csv_files/KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv -out /nvme/h/cy22sg1/qpb_data_analysis/core/tests/integration_tests/test_inspect_csv_file_output --output_format md --sample_rows 3 --field_statistics --output_filename test_markdown_output

   -- Summary of the 'KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv' CSV file generated.

--------------------------------------------------
✅ Test 'Output in Markdown format' passed!
--------------------------------------------------

Generated Markdown file contents:
--------------------------------------------------
## File Details

Successfully loaded CSV file: 'KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv' 
from directory: '/nvme/h/cy22sg1/qpb_data_analysis/core/unit_tests

### Test different output formats (LaTeX)

In [12]:
cmd = [
    "python",
    SCRIPT_PATH,
    "-csv",
    MOCK_CSV_PATH,
    "-out",
    OUTPUT_DIRECTORY,
    "--output_format",
    "tex",  # Use LaTeX format
    "--sample_rows",
    "3",
    "--field_statistics",
    "--output_filename",
    "test_latex_output",
]

run_command(cmd, "Output in LaTeX format")

# Display the content of the generated LaTeX file
tex_file = os.path.join(OUTPUT_DIRECTORY, "test_latex_output_summary.tex")
if os.path.exists(tex_file):
    print("\nGenerated LaTeX file contents (first 20 lines):")
    print("-" * 50)
    with open(tex_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            if i < 20:  # Show just the first 20 lines to avoid overwhelming output
                print(line, end="")
            else:
                print("... (more content omitted)")
                break
else:
    print(f"\nWarning: Could not find expected LaTeX file at {tex_file}")


TESTING: Output in LaTeX format
COMMAND: python /nvme/h/cy22sg1/qpb_data_analysis/core/src/utils/inspect_csv_file.py -csv /nvme/h/cy22sg1/qpb_data_analysis/core/tests/unit_tests/mock_data/valid_csv_files/KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv -out /nvme/h/cy22sg1/qpb_data_analysis/core/tests/integration_tests/test_inspect_csv_file_output --output_format tex --sample_rows 3 --field_statistics --output_filename test_latex_output

   -- Summary of the 'KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv' CSV file generated.

--------------------------------------------------
✅ Test 'Output in LaTeX format' passed!
--------------------------------------------------

Generated LaTeX file contents (first 20 lines):
--------------------------------------------------
\section{File Details}

Successfully loaded CSV file: 'KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv' 
from directory: '/nvme/h/cy22sg1/qpb_data_analysis/co

### Test separate by type

In [13]:
cmd = [
    "python",
    SCRIPT_PATH,
    "-csv",
    MOCK_CSV_PATH,
    "-out",
    OUTPUT_DIRECTORY,
    "--uniqueness_report",
    "--separate_by_type",
    "--output_filename",
    "test_separate_by_type",
]

run_command(cmd, "Separate columns by type in uniqueness report")


TESTING: Separate columns by type in uniqueness report
COMMAND: python /nvme/h/cy22sg1/qpb_data_analysis/core/src/utils/inspect_csv_file.py -csv /nvme/h/cy22sg1/qpb_data_analysis/core/tests/unit_tests/mock_data/valid_csv_files/KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv -out /nvme/h/cy22sg1/qpb_data_analysis/core/tests/integration_tests/test_inspect_csv_file_output --uniqueness_report --separate_by_type --output_filename test_separate_by_type

   -- Summary of the 'KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv' CSV file generated.

--------------------------------------------------
✅ Test 'Separate columns by type in uniqueness report' passed!
--------------------------------------------------


### Test grouped summary tables

In [14]:
# First, let's list the fields to choose appropriate columns for the summary
fields_cmd = [
    "python",
    SCRIPT_PATH,
    "-csv",
    MOCK_CSV_PATH,
    "--list_fields",
]

print("Fetching column names for grouped summary test:\n")
fields_process = subprocess.run(fields_cmd, capture_output=True, text=True)
print(fields_process.stdout)

# Replace these with actual column names from your CSV
value_variable = "CG_epsilon"  # A numeric column
row_variable = "m"  # A categorical column for rows
column_variable = "MSCG_epsilon"  # A categorical column for columns

cmd = [
    "python",
    SCRIPT_PATH,
    "-csv",
    MOCK_CSV_PATH,
    "-out",
    OUTPUT_DIRECTORY,
    "--add_grouped_summary",
    "--value_variable",
    value_variable,
    "--row_variable",
    row_variable,
    "--column_variable",
    column_variable,
    "--aggregation",
    "mean",  # Calculate mean values
    "--output_filename",
    "test_grouped_summary",
]

run_command(cmd, "Grouped summary tables")

Fetching column names for grouped summary test:

Successfully loaded CSV file: 'KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv' 
from directory: '/nvme/h/cy22sg1/qpb_data_analysis/core/tests/unit_tests/mock_data/valid_csv_files'.
DataFrame Shape: 48 rows × 29 columns
Columns with empty values:
  Adjusted_average_core_hours_per_spinor: 39 rows with empty values
  No unusual data types detected in the CSV file.

Columns
Tunable Parameters:
  APE_alpha
  APE_iterations
  Bare_mass
  CG_epsilon
  Clover_coefficient
  Configuration_label
  KL_diagonal_order
  KL_scaling_factor
  Kernel_operator_type
  MPI_geometry
  MSCG_epsilon
  Main_program_type
  Number_of_spinors
  Number_of_vectors
  Overlap_operator_method
  QCD_beta_value
  Rho_value
  Threads_per_process
Output Quantities:
  Adjusted_average_core_hours_per_spinor
  Average_core_hours_per_spinor
  Average_number_of_MSCG_iterations_per_spinor
  Average_number_of_MV_multiplications_per_spinor
  Average_wall_cloc

### Test multiple options together

In [15]:
cmd = [
    "python",
    SCRIPT_PATH,
    "-csv",
    MOCK_CSV_PATH,
    "-out",
    OUTPUT_DIRECTORY,
    "--list_fields",
    "--sample_rows",
    "3",
    "--uniqueness_report",
    "--field_statistics",
    "--output_format",
    "md",
    "--output_filename",
    "test_comprehensive_output",
]

run_command(cmd, "Combined analysis with multiple options")


TESTING: Combined analysis with multiple options
COMMAND: python /nvme/h/cy22sg1/qpb_data_analysis/core/src/utils/inspect_csv_file.py -csv /nvme/h/cy22sg1/qpb_data_analysis/core/tests/unit_tests/mock_data/valid_csv_files/KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv -out /nvme/h/cy22sg1/qpb_data_analysis/core/tests/integration_tests/test_inspect_csv_file_output --list_fields --sample_rows 3 --uniqueness_report --field_statistics --output_format md --output_filename test_comprehensive_output

   -- Summary of the 'KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv' CSV file generated.

--------------------------------------------------
✅ Test 'Combined analysis with multiple options' passed!
--------------------------------------------------


### Test different aggregation methods

In [16]:
# Test each available aggregation method
aggregation_methods = ["count", "min", "max", "mean"]

for agg_method in aggregation_methods:
    cmd = [
        "python",
        SCRIPT_PATH,
        "-csv",
        MOCK_CSV_PATH,
        "-out",
        OUTPUT_DIRECTORY,
        "--add_grouped_summary",
        "--value_variable",
        value_variable,
        "--row_variable",
        row_variable,
        "--column_variable",
        column_variable,
        "--aggregation",
        agg_method,
        "--output_filename",
        f"test_aggregation_{agg_method}",
    ]

    run_command(cmd, f"Grouped summary with {agg_method} aggregation")


TESTING: Grouped summary with count aggregation
COMMAND: python /nvme/h/cy22sg1/qpb_data_analysis/core/src/utils/inspect_csv_file.py -csv /nvme/h/cy22sg1/qpb_data_analysis/core/tests/unit_tests/mock_data/valid_csv_files/KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv -out /nvme/h/cy22sg1/qpb_data_analysis/core/tests/integration_tests/test_inspect_csv_file_output --add_grouped_summary --value_variable CG_epsilon --row_variable m --column_variable MSCG_epsilon --aggregation count --output_filename test_aggregation_count

   -- Summary of the 'KL_several_m_varying_EpsCG_and_EpsMSCG_processed_parameter_values.csv' CSV file generated.

--------------------------------------------------
✅ Test 'Grouped summary with count aggregation' passed!
--------------------------------------------------

TESTING: Grouped summary with min aggregation
COMMAND: python /nvme/h/cy22sg1/qpb_data_analysis/core/src/utils/inspect_csv_file.py -csv /nvme/h/cy22sg1/qpb_data_analysis/core/test