This script calculates the number of available requirements, tests, and mappings for each subset in the available datasets. It also formats the data for use in an Overleaf table.


In [39]:
import os
import pandas as pd

In [40]:
def read_requirements(directory: str) -> pd.DataFrame:
    """Load the ``RE.csv`` file found in *directory*.

    Every column is read as a string, and malformed lines are reported
    as warnings by pandas rather than raising.
    """
    csv_options = {"dtype": str, "on_bad_lines": "warn"}
    requirements_csv = os.path.join(directory, "RE.csv")
    frame = pd.read_csv(requirements_csv, **csv_options)
    return frame

def read_tests(directory: str) -> pd.DataFrame:
    """Load the ``ST.csv`` file found in *directory*.

    Every column is read as a string. Missing values in the ``Purpose``
    column are replaced by empty strings; a ``KeyError`` is raised if the
    file has no ``Purpose`` column.
    """
    tests_csv = os.path.join(directory, "ST.csv")
    frame = pd.read_csv(tests_csv, dtype=str, on_bad_lines="warn")

    # Purpose is intentionally blank for some tests, so normalise the
    # resulting missing values to "".
    purpose = frame["Purpose"]
    frame["Purpose"] = purpose.fillna("")
    return frame

def read_mappings(directory: str) -> pd.DataFrame:
    """Load the ``mapping.csv`` file found in *directory*.

    Every column is read as a string, and malformed lines are reported
    as warnings by pandas rather than raising.
    """
    mapping_csv = os.path.join(directory, "mapping.csv")
    frame = pd.read_csv(
        mapping_csv,
        dtype=str,
        on_bad_lines="warn",
    )
    return frame

In [41]:

# Root directory that holds one sub-directory per dataset, each of which
# holds one numerically named sub-directory per subset ("01", "02", ...).
datasets_dir = "."


def _subset_number(name):
    """Return the integer value of a subset directory name, or None if it is not numeric."""
    try:
        return int(name)
    except ValueError:
        return None


# First row is the table header; one row per dataset and per subset follows.
latex_rows = [
    r"\textbf{Subset} & \textbf{RE} & \textbf{ST} & \textbf{Tracelinks} & \textbf{n (Possible Links)} \\",
]

# os.listdir() returns entries in arbitrary order; sort so the generated
# table is reproducible between runs and machines.
for dataset in sorted(os.listdir(datasets_dir)):
    dataset_path = os.path.join(datasets_dir, dataset)
    # Skip plain files and hidden directories (e.g. .git, .ipynb_checkpoints),
    # which are not datasets.
    if dataset.startswith(".") or not os.path.isdir(dataset_path):
        continue

    # Dataset heading row: only the first column is filled.
    latex_rows.append(f"{dataset} & & & & \\\\")
    print(f"Processing dataset: {dataset}")
    print(f"  Path: {dataset_path}")

    # Keep only directories whose name parses as an integer; anything else
    # cannot be ordered numerically and is reported instead of crashing.
    subset_dirs = []
    for entry in os.listdir(dataset_path):
        if not os.path.isdir(os.path.join(dataset_path, entry)):
            continue
        if _subset_number(entry) is None:
            print(f"  Skipping non-numeric subset directory: {entry}")
            continue
        subset_dirs.append(entry)
    subset_dirs.sort(key=int)  # numeric order: "2" before "10"

    for subset in subset_dirs:
        subset_path = os.path.join(dataset_path, subset)

        print(f"Processing dataset: {dataset}, subset: {subset}")
        print(f"  Path: {subset_path}")
        requirements = read_requirements(subset_path)
        tests = read_tests(subset_path)
        mappings = read_mappings(subset_path)

        req_row_count = len(requirements.index)
        test_row_count = len(tests.index)
        mapping_row_count = len(mappings.index)
        # Every requirement could in principle be linked to every test.
        possible_links = req_row_count * test_row_count

        print(f"  Requirements: {req_row_count} rows")
        print(f"  Tests: {test_row_count} rows")
        print(f"  Mappings: {mapping_row_count} rows")

        latex_rows.append(
            f"{subset} & {req_row_count} & {test_row_count} & {mapping_row_count} & {possible_links} \\\\"
        )

# Build the table once, after all rows are collected, so that it is defined
# even when no dataset or subset directory was found.
latex_table = "\n".join(latex_rows)
print("\nGenerated LaTeX Table:\n")
print(latex_table)


Processing dataset: Mozilla
  Path: ./Mozilla
Processing dataset: Mozilla, subset: 01
  Path: ./Mozilla/01
  Requirements: 25 rows
  Tests: 25 rows
  Mappings: 25 rows
Processing dataset: Mozilla, subset: 02
  Path: ./Mozilla/02
  Requirements: 25 rows
  Tests: 21 rows
  Mappings: 25 rows
Processing dataset: Mozilla, subset: 03
  Path: ./Mozilla/03
  Requirements: 25 rows
  Tests: 20 rows
  Mappings: 25 rows
Processing dataset: Mozilla, subset: 04
  Path: ./Mozilla/04
  Requirements: 25 rows
  Tests: 20 rows
  Mappings: 25 rows
Processing dataset: Mozilla, subset: 05
  Path: ./Mozilla/05
  Requirements: 25 rows
  Tests: 23 rows
  Mappings: 25 rows
Processing dataset: Mozilla, subset: 06
  Path: ./Mozilla/06
  Requirements: 25 rows
  Tests: 23 rows
  Mappings: 25 rows
Processing dataset: Mozilla, subset: 07
  Path: ./Mozilla/07
  Requirements: 25 rows
  Tests: 22 rows
  Mappings: 25 rows
Processing dataset: Mozilla, subset: 08
  Path: ./Mozilla/08
  Requirements: 25 rows
  Tests: 21 row