From 29e3e3ac13d52a77e2d059a3e4cc79d063e8e893 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Fran=C3=A7ois?= Date: Fri, 17 Apr 2026 15:39:19 +0200 Subject: [PATCH 1/3] feat: add possibility to generate codemeta using rsmetacheck cli --- src/rsmetacheck/cli.py | 82 ++++++++++++++++++++++++++++-------- src/rsmetacheck/run_somef.py | 61 +++++++++++++++++++++++---- 2 files changed, 117 insertions(+), 26 deletions(-) diff --git a/src/rsmetacheck/cli.py b/src/rsmetacheck/cli.py index 0d338b4..a3850d7 100644 --- a/src/rsmetacheck/cli.py +++ b/src/rsmetacheck/cli.py @@ -1,59 +1,76 @@ import argparse import os from pathlib import Path -from rsmetacheck.run_somef import run_somef_batch, run_somef_single, ensure_somef_configured + from rsmetacheck.run_analyzer import run_analysis +from rsmetacheck.run_somef import ( + ensure_somef_configured, + run_somef_batch, + run_somef_single, +) def cli(): - parser = argparse.ArgumentParser(description="Detect metadata pitfalls in software repositories using SoMEF.") + parser = argparse.ArgumentParser( + description="Detect metadata pitfalls in software repositories using SoMEF." + ) parser.add_argument( "--input", nargs="+", required=True, - help="One or more: GitHub/GitLab URLs, JSON files containing repositories, OR existing SoMEF output files when using --skip-somef." + help="One or more: GitHub/GitLab URLs, JSON files containing repositories, OR existing SoMEF output files when using --skip-somef.", ) parser.add_argument( "--skip-somef", action="store_true", - help="Skip SoMEF execution and analyze existing SoMEF output files directly. --input should point to SoMEF JSON files." + help="Skip SoMEF execution and analyze existing SoMEF output files directly. --input should point to SoMEF JSON files.", ) parser.add_argument( "--pitfalls-output", default=os.path.join(os.getcwd(), "pitfalls_outputs"), - help="Directory to store pitfall JSON-LD files (default: ./pitfalls_outputs)." 
+ help="Directory to store pitfall JSON-LD files (default: ./pitfalls_outputs).", ) parser.add_argument( "--somef-output", default=os.path.join(os.getcwd(), "somef_outputs"), - help="Directory to store SoMEF output files (default: ./somef_outputs)." + help="Directory to store SoMEF output files (default: ./somef_outputs).", ) parser.add_argument( "--analysis-output", default=os.path.join(os.getcwd(), "analysis_results.json"), - help="File path for summary results (default: ./analysis_results.json)." + help="File path for summary results (default: ./analysis_results.json).", ) parser.add_argument( "--threshold", type=float, default=0.8, - help="SoMEF confidence threshold (default: 0.8). Only used when running SoMEF." + help="SoMEF confidence threshold (default: 0.8). Only used when running SoMEF.", + ) + parser.add_argument( + "-b", + "--branch", + help="Branch of the repository to analyze. Overrides the default branch. Only used when running SoMEF.", ) + parser.add_argument( - "-b", "--branch", - help="Branch of the repository to analyze. Overrides the default branch. Only used when running SoMEF." + "-c", + "--generate-codemeta", + action="store_true", + help="Generate codemeta files for each repository. Only used when running SoMEF.", ) parser.add_argument( "--verbose", action="store_true", - help="Include both detected AND undetected pitfalls in the output JSON-LD." + help="Include both detected AND undetected pitfalls in the output JSON-LD.", ) args = parser.parse_args() if args.skip_somef: - print(f"Skipping SoMEF execution. Analyzing {len(args.input)} existing SoMEF output files...") + print( + f"Skipping SoMEF execution. Analyzing {len(args.input)} existing SoMEF output files..." 
+ ) somef_json_paths = [] for json_path in args.input: @@ -67,29 +84,58 @@ def cli(): return print(f"Analyzing {len(somef_json_paths)} SoMEF output files...") - run_analysis(somef_json_paths, args.pitfalls_output, args.analysis_output, verbose=args.verbose) + run_analysis( + somef_json_paths, + args.pitfalls_output, + args.analysis_output, + verbose=args.verbose, + ) else: ensure_somef_configured() threshold = args.threshold somef_output_dir = args.somef_output + generate_codemeta = args.generate_codemeta print(f"Detected {len(args.input)} input(s):") + if generate_codemeta: + print( + "Codemeta generation is ENABLED. Codemeta files will be created for each repository." + ) for input_item in args.input: if input_item.startswith("http://") or input_item.startswith("https://"): print(f"Processing repository URL: {input_item}") - run_somef_single(input_item, somef_output_dir, threshold, branch=args.branch) + run_somef_single( + input_item, + somef_output_dir, + threshold, + branch=args.branch, + generate_codemeta=generate_codemeta, + ) elif os.path.exists(input_item): print(f"Processing repositories from file: {input_item}") - run_somef_batch(input_item, somef_output_dir, threshold, branch=args.branch) + run_somef_batch( + input_item, + somef_output_dir, + threshold, + branch=args.branch, + generate_codemeta=generate_codemeta, + ) else: - print(f"Warning: Skipping invalid input (not a URL or existing file): {input_item}") + print( + f"Warning: Skipping invalid input (not a URL or existing file): {input_item}" + ) print(f"\nRunning analysis on outputs in {somef_output_dir}...") - run_analysis(somef_output_dir, args.pitfalls_output, args.analysis_output, verbose=args.verbose) + run_analysis( + somef_output_dir, + args.pitfalls_output, + args.analysis_output, + verbose=args.verbose, + ) if __name__ == "__main__": - cli() \ No newline at end of file + cli() diff --git a/src/rsmetacheck/run_somef.py b/src/rsmetacheck/run_somef.py index 121899e..a2172a3 100644 --- 
a/src/rsmetacheck/run_somef.py +++ b/src/rsmetacheck/run_somef.py @@ -1,9 +1,11 @@ -import os import json +import os import subprocess - from pathlib import Path +CODEMETA_DEFAULT_NAME = "somef_generated_codemeta" + + def ensure_somef_configured(): """Run 'somef configure -a' only if it hasn't been configured yet.""" config_file = Path.home() / ".somef" / "config.json" @@ -18,11 +20,14 @@ def ensure_somef_configured(): return False return True -def run_somef(repo_url, output_file, threshold, branch=None): + +def run_somef(repo_url, output_file, threshold, branch=None, codemeta_file=None): """Run SoMEF on a given repository and save results.""" cmd = ["somef", "describe", "-r", repo_url, "-o", output_file, "-t", str(threshold)] if branch: cmd.extend(["-b", branch]) + if codemeta_file: + cmd.extend(["-c", codemeta_file]) try: subprocess.run(cmd, check=True) print(f"SoMEF finished for: {repo_url}") @@ -31,16 +36,38 @@ def run_somef(repo_url, output_file, threshold, branch=None): print(f"Error running SoMEF for {repo_url}: {e}") return False -def run_somef_single(repo_url, output_dir="somef_outputs", threshold=0.8, branch=None): + +def run_somef_single( + repo_url, + output_dir="somef_outputs", + threshold=0.8, + branch=None, + generate_codemeta=False, +): """Run SoMEF for a single repository.""" os.makedirs(output_dir, exist_ok=True) output_file = os.path.join(output_dir, "output_1.json") + codemeta_file = os.path.join(output_dir, CODEMETA_DEFAULT_NAME + ".json") print(f"Running SoMEF for {repo_url}...") - success = run_somef(repo_url, output_file, threshold, branch) + + success = run_somef( + repo_url, + output_file, + threshold, + branch, + codemeta_file=codemeta_file if generate_codemeta else None, + ) return output_dir if success else None -def run_somef_batch(json_file, output_dir="somef_outputs", threshold=0.8, branch=None): + +def run_somef_batch( + json_file, + output_dir="somef_outputs", + threshold=0.8, + branch=None, + generate_codemeta=False, +): """Run 
SoMEF for all repositories listed in a JSON file.""" os.makedirs(output_dir, exist_ok=True) @@ -57,8 +84,17 @@ def run_somef_batch(json_file, output_dir="somef_outputs", threshold=0.8, branch for idx, repo_url in enumerate(repos, start=1): output_file = os.path.join(output_dir, f"{base_name}_output_{idx}.json") + codemeta_file = os.path.join( + output_dir, f"{base_name}_{CODEMETA_DEFAULT_NAME}_{idx}.json" + ) print(f"[{idx}/{len(repos)}] {repo_url}") - run_somef(repo_url, output_file, threshold, branch) + run_somef( + repo_url, + output_file, + threshold, + branch, + codemeta_file=codemeta_file if generate_codemeta else None, + ) print(f"Completed SoMEF for {base_name}. Results in {output_dir}") - return True \ No newline at end of file + return True From 086a767a10cf744e0dd3c21389e59275f6ced83c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Fran=C3=A7ois?= Date: Fri, 17 Apr 2026 16:20:28 +0200 Subject: [PATCH 2/3] fix: add a filter in analysis to avoid reading generated codemeta.json as potential somef output files.
--- src/rsmetacheck/detect_pitfalls_main.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/rsmetacheck/detect_pitfalls_main.py b/src/rsmetacheck/detect_pitfalls_main.py index e583837..e32b245 100644 --- a/src/rsmetacheck/detect_pitfalls_main.py +++ b/src/rsmetacheck/detect_pitfalls_main.py @@ -1,6 +1,7 @@ import json from pathlib import Path from typing import Iterable, Union +from rsmetacheck.run_somef import CODEMETA_DEFAULT_NAME from rsmetacheck.utils.pitfall_utils import extract_programming_languages from rsmetacheck.utils.json_ld_utils import create_pitfall_jsonld, save_individual_pitfall_jsonld from rsmetacheck.utils.somef_compat import normalize_somef_data @@ -467,7 +468,11 @@ def main(input_dir=None, somef_json_paths=None, pitfalls_dir=None, analysis_outp if not input_dir.exists(): print(f"Error: Directory not found: {input_dir}") return - json_files = list(input_dir.glob("*.json")) + # Batch runs name codemeta files "<base>_<CODEMETA_DEFAULT_NAME>_<idx>.json", so match the marker anywhere in the stem, not only as a suffix. + json_files = [ + f for f in input_dir.glob("*.json") + if CODEMETA_DEFAULT_NAME not in f.stem + ] print(f"Found {len(json_files)} JSON files in {input_dir}") else: print("Error: No input directory or JSON file list provided.") From 267ecbbd9586612504a29f82ec93e0d6360ff51d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Fran=C3=A7ois?= Date: Fri, 17 Apr 2026 16:21:38 +0200 Subject: [PATCH 3/3] test: add unit test to mock the somef calls when --generate-codemeta flag is set --- tests/test_cli.py | 72 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 tests/test_cli.py diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..762885d --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,72 @@ +"""Unit tests to verify CLI behavior for codemeta generation.""" + +import importlib +from unittest.mock import MagicMock + +cli_module = importlib.import_module("rsmetacheck.cli") + + +REPO_URL = "https://github.com/SoftwareUnderstanding/sw-metadata-bot" + + +def 
test_cli_with_generate_codemeta_adds_codemeta_output(monkeypatch, tmp_path): + """Ensure --generate-codemeta requests codemeta output in SoMEF command.""" + somef_output_dir = tmp_path / "somef_outputs" + expected_codemeta = str(somef_output_dir / "somef_generated_codemeta.json") + + run_analysis_mock = MagicMock() + subprocess_run_mock = MagicMock() + + monkeypatch.setattr( + "sys.argv", + [ + "rsmetacheck", + "--input", + REPO_URL, + "--somef-output", + str(somef_output_dir), + "--generate-codemeta", + ], + ) + monkeypatch.setattr(cli_module, "ensure_somef_configured", lambda: True) + monkeypatch.setattr(cli_module, "run_analysis", run_analysis_mock) + monkeypatch.setattr("rsmetacheck.run_somef.subprocess.run", subprocess_run_mock) + + cli_module.cli() + + command = subprocess_run_mock.call_args.args[0] + assert command[0:2] == ["somef", "describe"] + assert "-c" in command + assert expected_codemeta in command + + run_analysis_mock.assert_called_once() + + +def test_cli_without_generate_codemeta_keeps_default_behavior(monkeypatch, tmp_path): + """Ensure default CLI call does not request codemeta output from SoMEF.""" + somef_output_dir = tmp_path / "somef_outputs" + + run_analysis_mock = MagicMock() + subprocess_run_mock = MagicMock() + + monkeypatch.setattr( + "sys.argv", + [ + "rsmetacheck", + "--input", + REPO_URL, + "--somef-output", + str(somef_output_dir), + ], + ) + monkeypatch.setattr(cli_module, "ensure_somef_configured", lambda: True) + monkeypatch.setattr(cli_module, "run_analysis", run_analysis_mock) + monkeypatch.setattr("rsmetacheck.run_somef.subprocess.run", subprocess_run_mock) + + cli_module.cli() + + command = subprocess_run_mock.call_args.args[0] + assert command[0:2] == ["somef", "describe"] + assert "-c" not in command + + run_analysis_mock.assert_called_once()