From 29e3e3ac13d52a77e2d059a3e4cc79d063e8e893 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=20Fran=C3=A7ois?= <tom.francois@ntymail.com>
Date: Fri, 17 Apr 2026 15:39:19 +0200
Subject: [PATCH 1/3] feat: add option to generate codemeta files via the
 rsmetacheck CLI

---
 src/rsmetacheck/cli.py       | 82 ++++++++++++++++++++++++++++--------
 src/rsmetacheck/run_somef.py | 61 +++++++++++++++++++++++----
 2 files changed, 117 insertions(+), 26 deletions(-)

diff --git a/src/rsmetacheck/cli.py b/src/rsmetacheck/cli.py
index 0d338b4..a3850d7 100644
--- a/src/rsmetacheck/cli.py
+++ b/src/rsmetacheck/cli.py
@@ -1,59 +1,76 @@
 import argparse
 import os
 from pathlib import Path
-from rsmetacheck.run_somef import run_somef_batch, run_somef_single, ensure_somef_configured
+
 from rsmetacheck.run_analyzer import run_analysis
+from rsmetacheck.run_somef import (
+    ensure_somef_configured,
+    run_somef_batch,
+    run_somef_single,
+)
 
 
 def cli():
-    parser = argparse.ArgumentParser(description="Detect metadata pitfalls in software repositories using SoMEF.")
+    parser = argparse.ArgumentParser(
+        description="Detect metadata pitfalls in software repositories using SoMEF."
+    )
     parser.add_argument(
         "--input",
         nargs="+",
         required=True,
-        help="One or more: GitHub/GitLab URLs, JSON files containing repositories, OR existing SoMEF output files when using --skip-somef."
+        help="One or more: GitHub/GitLab URLs, JSON files containing repositories, OR existing SoMEF output files when using --skip-somef.",
     )
     parser.add_argument(
         "--skip-somef",
         action="store_true",
-        help="Skip SoMEF execution and analyze existing SoMEF output files directly. --input should point to SoMEF JSON files."
+        help="Skip SoMEF execution and analyze existing SoMEF output files directly. --input should point to SoMEF JSON files.",
     )
     parser.add_argument(
         "--pitfalls-output",
         default=os.path.join(os.getcwd(), "pitfalls_outputs"),
-        help="Directory to store pitfall JSON-LD files (default: ./pitfalls_outputs)."
+        help="Directory to store pitfall JSON-LD files (default: ./pitfalls_outputs).",
     )
     parser.add_argument(
         "--somef-output",
         default=os.path.join(os.getcwd(), "somef_outputs"),
-        help="Directory to store SoMEF output files (default: ./somef_outputs)."
+        help="Directory to store SoMEF output files (default: ./somef_outputs).",
     )
     parser.add_argument(
         "--analysis-output",
         default=os.path.join(os.getcwd(), "analysis_results.json"),
-        help="File path for summary results (default: ./analysis_results.json)."
+        help="File path for summary results (default: ./analysis_results.json).",
     )
     parser.add_argument(
         "--threshold",
         type=float,
         default=0.8,
-        help="SoMEF confidence threshold (default: 0.8). Only used when running SoMEF."
+        help="SoMEF confidence threshold (default: 0.8). Only used when running SoMEF.",
+    )
+    parser.add_argument(
+        "-b",
+        "--branch",
+        help="Branch of the repository to analyze. Overrides the default branch. Only used when running SoMEF.",
     )
+
     parser.add_argument(
-        "-b", "--branch",
-        help="Branch of the repository to analyze. Overrides the default branch. Only used when running SoMEF."
+        "-c",
+        "--generate-codemeta",
+        action="store_true",
+        help="Generate codemeta files for each repository. Only used when running SoMEF.",
     )
 
     parser.add_argument(
         "--verbose",
         action="store_true",
-        help="Include both detected AND undetected pitfalls in the output JSON-LD."
+        help="Include both detected AND undetected pitfalls in the output JSON-LD.",
     )
 
     args = parser.parse_args()
 
     if args.skip_somef:
-        print(f"Skipping SoMEF execution. Analyzing {len(args.input)} existing SoMEF output files...")
+        print(
+            f"Skipping SoMEF execution. Analyzing {len(args.input)} existing SoMEF output files..."
+        )
 
         somef_json_paths = []
         for json_path in args.input:
@@ -67,29 +84,58 @@ def cli():
             return
 
         print(f"Analyzing {len(somef_json_paths)} SoMEF output files...")
-        run_analysis(somef_json_paths, args.pitfalls_output, args.analysis_output, verbose=args.verbose)
+        run_analysis(
+            somef_json_paths,
+            args.pitfalls_output,
+            args.analysis_output,
+            verbose=args.verbose,
+        )
 
     else:
         ensure_somef_configured()
 
         threshold = args.threshold
         somef_output_dir = args.somef_output
+        generate_codemeta = args.generate_codemeta
 
         print(f"Detected {len(args.input)} input(s):")
+        if generate_codemeta:
+            print(
+                "Codemeta generation is ENABLED. Codemeta files will be created for each repository."
+            )
 
         for input_item in args.input:
             if input_item.startswith("http://") or input_item.startswith("https://"):
                 print(f"Processing repository URL: {input_item}")
-                run_somef_single(input_item, somef_output_dir, threshold, branch=args.branch)
+                run_somef_single(
+                    input_item,
+                    somef_output_dir,
+                    threshold,
+                    branch=args.branch,
+                    generate_codemeta=generate_codemeta,
+                )
             elif os.path.exists(input_item):
                 print(f"Processing repositories from file: {input_item}")
-                run_somef_batch(input_item, somef_output_dir, threshold, branch=args.branch)
+                run_somef_batch(
+                    input_item,
+                    somef_output_dir,
+                    threshold,
+                    branch=args.branch,
+                    generate_codemeta=generate_codemeta,
+                )
             else:
-                print(f"Warning: Skipping invalid input (not a URL or existing file): {input_item}")
+                print(
+                    f"Warning: Skipping invalid input (not a URL or existing file): {input_item}"
+                )
 
         print(f"\nRunning analysis on outputs in {somef_output_dir}...")
-        run_analysis(somef_output_dir, args.pitfalls_output, args.analysis_output, verbose=args.verbose)
+        run_analysis(
+            somef_output_dir,
+            args.pitfalls_output,
+            args.analysis_output,
+            verbose=args.verbose,
+        )
 
 
 if __name__ == "__main__":
-    cli()
\ No newline at end of file
+    cli()
diff --git a/src/rsmetacheck/run_somef.py b/src/rsmetacheck/run_somef.py
index 121899e..a2172a3 100644
--- a/src/rsmetacheck/run_somef.py
+++ b/src/rsmetacheck/run_somef.py
@@ -1,9 +1,11 @@
-import os
 import json
+import os
 import subprocess
-
 from pathlib import Path
 
+CODEMETA_DEFAULT_NAME = "somef_generated_codemeta"
+
+
 def ensure_somef_configured():
     """Run 'somef configure -a' only if it hasn't been configured yet."""
     config_file = Path.home() / ".somef" / "config.json"
@@ -18,11 +20,14 @@ def ensure_somef_configured():
             return False
     return True
 
-def run_somef(repo_url, output_file, threshold, branch=None):
+
+def run_somef(repo_url, output_file, threshold, branch=None, codemeta_file=None):
     """Run SoMEF on a given repository and save results."""
     cmd = ["somef", "describe", "-r", repo_url, "-o", output_file, "-t", str(threshold)]
     if branch:
         cmd.extend(["-b", branch])
+    if codemeta_file:
+        cmd.extend(["-c", codemeta_file])
     try:
         subprocess.run(cmd, check=True)
         print(f"SoMEF finished for: {repo_url}")
@@ -31,16 +36,38 @@ def run_somef(repo_url, output_file, threshold, branch=None):
         print(f"Error running SoMEF for {repo_url}: {e}")
         return False
 
-def run_somef_single(repo_url, output_dir="somef_outputs", threshold=0.8, branch=None):
+
+def run_somef_single(
+    repo_url,
+    output_dir="somef_outputs",
+    threshold=0.8,
+    branch=None,
+    generate_codemeta=False,
+):
     """Run SoMEF for a single repository."""
     os.makedirs(output_dir, exist_ok=True)
     output_file = os.path.join(output_dir, "output_1.json")
+    codemeta_file = os.path.join(output_dir, CODEMETA_DEFAULT_NAME + ".json")
 
     print(f"Running SoMEF for {repo_url}...")
-    success = run_somef(repo_url, output_file, threshold, branch)
+
+    success = run_somef(
+        repo_url,
+        output_file,
+        threshold,
+        branch,
+        codemeta_file=codemeta_file if generate_codemeta else None,
+    )
     return output_dir if success else None
 
-def run_somef_batch(json_file, output_dir="somef_outputs", threshold=0.8, branch=None):
+
+def run_somef_batch(
+    json_file,
+    output_dir="somef_outputs",
+    threshold=0.8,
+    branch=None,
+    generate_codemeta=False,
+):
     """Run SoMEF for all repositories listed in a JSON file."""
     os.makedirs(output_dir, exist_ok=True)
 
@@ -57,8 +84,17 @@ def run_somef_batch(json_file, output_dir="somef_outputs", threshold=0.8, branch
 
     for idx, repo_url in enumerate(repos, start=1):
         output_file = os.path.join(output_dir, f"{base_name}_output_{idx}.json")
+        codemeta_file = os.path.join(
+            output_dir, f"{base_name}_{CODEMETA_DEFAULT_NAME}_{idx}.json"
+        )
         print(f"[{idx}/{len(repos)}] {repo_url}")
-        run_somef(repo_url, output_file, threshold, branch)
+        run_somef(
+            repo_url,
+            output_file,
+            threshold,
+            branch,
+            codemeta_file=codemeta_file if generate_codemeta else None,
+        )
 
     print(f"Completed SoMEF for {base_name}. Results in {output_dir}")
-    return True
\ No newline at end of file
+    return True

From 086a767a10cf744e0dd3c21389e59275f6ced83c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=20Fran=C3=A7ois?= <tom.francois@ntymail.com>
Date: Fri, 17 Apr 2026 16:20:28 +0200
Subject: [PATCH 2/3] fix: add a filter in analysis to avoid reading generated
 codemeta.json as potential somef output files.

---
 src/rsmetacheck/detect_pitfalls_main.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/rsmetacheck/detect_pitfalls_main.py b/src/rsmetacheck/detect_pitfalls_main.py
index e583837..e32b245 100644
--- a/src/rsmetacheck/detect_pitfalls_main.py
+++ b/src/rsmetacheck/detect_pitfalls_main.py
@@ -1,6 +1,7 @@
 import json
 from pathlib import Path
 from typing import Iterable, Union
+from rsmetacheck.run_somef import CODEMETA_DEFAULT_NAME
 from rsmetacheck.utils.pitfall_utils import extract_programming_languages
 from rsmetacheck.utils.json_ld_utils import create_pitfall_jsonld, save_individual_pitfall_jsonld
 from rsmetacheck.utils.somef_compat import normalize_somef_data
@@ -467,7 +468,10 @@ def main(input_dir=None, somef_json_paths=None, pitfalls_dir=None, analysis_outp
         if not input_dir.exists():
             print(f"Error: Directory not found: {input_dir}")
             return
-        json_files = list(input_dir.glob("*.json"))
+        json_files = [
+            f for f in input_dir.glob("*.json")
+            if CODEMETA_DEFAULT_NAME not in f.stem  # batch stems end in "_<idx>"
+        ]
         print(f"Found {len(json_files)} JSON files in {input_dir}")
     else:
         print("Error: No input directory or JSON file list provided.")

From 267ecbbd9586612504a29f82ec93e0d6360ff51d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=20Fran=C3=A7ois?= <tom.francois@ntymail.com>
Date: Fri, 17 Apr 2026 16:21:38 +0200
Subject: [PATCH 3/3] test: add unit test to mock the somef calls when
 --generate-codemeta flag is set

---
 tests/test_cli.py | 72 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 tests/test_cli.py

diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 0000000..762885d
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,72 @@
+"""Unit tests to verify CLI behavior for codemeta generation."""
+
+import importlib
+from unittest.mock import MagicMock
+
+cli_module = importlib.import_module("rsmetacheck.cli")
+
+
+REPO_URL = "https://github.com/SoftwareUnderstanding/sw-metadata-bot"
+
+
+def test_cli_with_generate_codemeta_adds_codemeta_output(monkeypatch, tmp_path):
+    """Ensure --generate-codemeta requests codemeta output in SoMEF command."""
+    somef_output_dir = tmp_path / "somef_outputs"
+    expected_codemeta = str(somef_output_dir / "somef_generated_codemeta.json")
+
+    run_analysis_mock = MagicMock()
+    subprocess_run_mock = MagicMock()
+
+    monkeypatch.setattr(
+        "sys.argv",
+        [
+            "rsmetacheck",
+            "--input",
+            REPO_URL,
+            "--somef-output",
+            str(somef_output_dir),
+            "--generate-codemeta",
+        ],
+    )
+    monkeypatch.setattr(cli_module, "ensure_somef_configured", lambda: True)
+    monkeypatch.setattr(cli_module, "run_analysis", run_analysis_mock)
+    monkeypatch.setattr("rsmetacheck.run_somef.subprocess.run", subprocess_run_mock)
+
+    cli_module.cli()
+
+    command = subprocess_run_mock.call_args.args[0]
+    assert command[0:2] == ["somef", "describe"]
+    assert "-c" in command
+    assert expected_codemeta in command
+
+    run_analysis_mock.assert_called_once()
+
+
+def test_cli_without_generate_codemeta_keeps_default_behavior(monkeypatch, tmp_path):
+    """Ensure default CLI call does not request codemeta output from SoMEF."""
+    somef_output_dir = tmp_path / "somef_outputs"
+
+    run_analysis_mock = MagicMock()
+    subprocess_run_mock = MagicMock()
+
+    monkeypatch.setattr(
+        "sys.argv",
+        [
+            "rsmetacheck",
+            "--input",
+            REPO_URL,
+            "--somef-output",
+            str(somef_output_dir),
+        ],
+    )
+    monkeypatch.setattr(cli_module, "ensure_somef_configured", lambda: True)
+    monkeypatch.setattr(cli_module, "run_analysis", run_analysis_mock)
+    monkeypatch.setattr("rsmetacheck.run_somef.subprocess.run", subprocess_run_mock)
+
+    cli_module.cli()
+
+    command = subprocess_run_mock.call_args.args[0]
+    assert command[0:2] == ["somef", "describe"]
+    assert "-c" not in command
+
+    run_analysis_mock.assert_called_once()