"""De-duplication helpers for Socket package results (``socketdev/core/dedupe.py``).

A result list may contain several records for the same package (for
example one per ``release``).  :class:`Dedupe` folds each such group into a
single record and merges the alerts of its members.
"""
import logging
from collections import defaultdict
from typing import Any, Dict, List, Optional

log = logging.getLogger("socketdev")


class Dedupe:
    """Namespace of static helpers that collapse duplicate package records."""

    @staticmethod
    def normalize_file_path(path: Optional[str]) -> str:
        """Return *path* without its first ``/``-separated component.

        ``None`` and ``""`` become ``""``; a path containing no ``/`` is
        returned unchanged.
        """
        if not path:
            return ""
        if "/" not in path:
            return path
        return path.split("/", 1)[1]

    @staticmethod
    def alert_key(alert: dict) -> tuple:
        """Return the tuple that identifies *alert* for de-duplication.

        Two alerts are treated as the same alert when their type, severity,
        category, normalized file path, start and end all match.

        Raises:
            KeyError: if ``type``, ``severity`` or ``category`` is missing.
        """
        return (
            alert["type"],
            alert["severity"],
            alert["category"],
            Dedupe.normalize_file_path(alert.get("file")),
            alert.get("start"),
            alert.get("end"),
        )

    @staticmethod
    def consolidate_and_merge_alerts(package_group: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Merge a non-empty group of records for one package into one record.

        Package-level fields are copied from the first record of the group.
        Alerts are de-duplicated by :meth:`alert_key`; each merged alert keeps
        the ``key`` of its first occurrence and lists, in first-seen order,
        every release it was reported for.  A record's release is its
        ``release`` value, falling back to ``type`` when that is ``None``.

        Args:
            package_group: Records describing the same package.

        Returns:
            The merged record, including a sorted ``releases`` list and the
            merged ``alerts`` list.

        Raises:
            IndexError: if *package_group* is empty.
            KeyError: if an alert lacks ``key``, ``type``, ``severity`` or
                ``category``.
        """
        alert_map: Dict[tuple, dict] = {}
        releases = set()
        for pkg in package_group:
            release = pkg.get("release")
            if release is None:
                release = pkg.get("type")
            releases.add(release)

            for alert in pkg.get("alerts", []):
                identity = Dedupe.alert_key(alert)
                merged = alert_map.get(identity)
                if merged is None:
                    alert_map[identity] = {
                        "key": alert["key"],  # keep the first key seen
                        "type": alert["type"],
                        "severity": alert["severity"],
                        "category": alert["category"],
                        "file": identity[3],  # already normalized by alert_key
                        "start": alert.get("start"),
                        "end": alert.get("end"),
                        "releases": [release],
                    }
                elif release not in merged["releases"]:
                    merged["releases"].append(release)

        base = package_group[0]
        return {
            "id": base.get("id"),
            "author": base.get("author"),
            "size": base.get("size"),
            "type": base.get("type"),
            "name": base.get("name"),
            "namespace": base.get("namespace"),
            "version": base.get("version"),
            "releases": sorted(releases),
            "alerts": list(alert_map.values()),
            "score": base.get("score", {}),
            "license": base.get("license"),
            "licenseDetails": base.get("licenseDetails", []),
            "batchIndex": base.get("batchIndex"),
            # NOTE(review): unlike the purl built in consolidate_by_order, this
            # one omits the namespace; left as-is because callers may already
            # rely on the current value -- confirm before changing.
            "purl": f"pkg:{base.get('type', 'unknown')}/{base.get('name', 'unknown')}@{base.get('version', '0.0.0')}"
        }

    @staticmethod
    def dedupe(packages: List[Dict[str, Any]], batched: bool = True) -> List[Dict[str, Any]]:
        """Collapse duplicate package records into one merged record each.

        Args:
            packages: Package records to de-duplicate.
            batched: When true, records are grouped by their existing
                ``batchIndex``; otherwise consecutive records describing the
                same package are grouped (see :meth:`consolidate_by_order`).

        Returns:
            One merged record per group, in group order.
        """
        if batched:
            grouped = Dedupe.consolidate_by_batch_index(packages)
        else:
            grouped = Dedupe.consolidate_by_order(packages)
        return [Dedupe.consolidate_and_merge_alerts(group) for group in grouped.values()]

    @staticmethod
    def consolidate_by_batch_index(packages: List[Dict[str, Any]]) -> dict[int, list[dict[str, Any]]]:
        """Group *packages* by their ``batchIndex`` value.

        Raises:
            KeyError: if a record has no ``batchIndex``.
        """
        grouped: Dict[int, List[Dict[str, Any]]] = defaultdict(list)
        for pkg in packages:
            grouped[pkg["batchIndex"]].append(pkg)
        return grouped

    @staticmethod
    def consolidate_by_order(packages: List[Dict[str, Any]]) -> dict[int, list[dict[str, Any]]]:
        """Group consecutive records that describe the same package.

        A purl-like key is built from ``type``, optional ``namespace``,
        ``name`` and ``version``.  Every time that key differs from the key of
        the previous record a new batch is started.  The assigned index is
        written back to each record as ``batchIndex``.

        Records without ``name`` or ``version`` (or that are not mappings) are
        logged and skipped instead of aborting the whole grouping.

        Returns:
            Mapping of batch index to the records in that batch.
        """
        grouped: Dict[int, List[Dict[str, Any]]] = defaultdict(list)
        batch_index = 0
        previous_purl = None
        for pkg in packages:
            try:
                name = pkg["name"]
                version = pkg["version"]
                namespace = pkg.get("namespace")
                ecosystem = pkg.get("type")
            except (KeyError, TypeError, AttributeError) as error:
                log.warning("Skipping malformed package during de-dupe (%r): %r", error, pkg)
                continue

            new_purl = f"pkg:{ecosystem}/"
            if namespace:
                new_purl += f"{namespace}/"
            new_purl += f"{name}@{version}"

            # Compare against the *previous* record, not the first one ever
            # seen; otherwise every record after the first change would open
            # its own batch and duplicates would never be merged.
            if previous_purl is not None and new_purl != previous_purl:
                batch_index += 1
            previous_purl = new_purl

            pkg["batchIndex"] = batch_index
            grouped[batch_index].append(pkg)
        return grouped
b/socketdev/version.py index f7d7e4b..521eb45 100644 --- a/socketdev/version.py +++ b/socketdev/version.py @@ -1 +1 @@ -__version__ = "2.0.16" +__version__ = "2.0.17" From 4872823cab52b212bfd86cb33110c7a5d0ac66e0 Mon Sep 17 00:00:00 2001 From: Douglas Coburn Date: Tue, 1 Apr 2025 12:24:16 -0700 Subject: [PATCH 2/3] Version bump --- socketdev/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/socketdev/version.py b/socketdev/version.py index 521eb45..6f7f903 100644 --- a/socketdev/version.py +++ b/socketdev/version.py @@ -1 +1 @@ -__version__ = "2.0.17" +__version__ = "2.0.20" From 084dcdde0068678b0682803f872c752155b678b1 Mon Sep 17 00:00:00 2001 From: Douglas Coburn Date: Tue, 1 Apr 2025 12:28:25 -0700 Subject: [PATCH 3/3] Fix path for module --- socketdev/fullscans/__init__.py | 2 +- socketdev/purl/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/socketdev/fullscans/__init__.py b/socketdev/fullscans/__init__.py index 1870035..939b762 100644 --- a/socketdev/fullscans/__init__.py +++ b/socketdev/fullscans/__init__.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional, Union from dataclasses import dataclass, asdict, field import urllib.parse -from core.dedupe import Dedupe +from ..core.dedupe import Dedupe from ..utils import IntegrationType, Utils log = logging.getLogger("socketdev") diff --git a/socketdev/purl/__init__.py b/socketdev/purl/__init__.py index 3fb53fd..0d7a570 100644 --- a/socketdev/purl/__init__.py +++ b/socketdev/purl/__init__.py @@ -1,7 +1,7 @@ import json import urllib.parse from socketdev.log import log -from core.dedupe import Dedupe +from ..core.dedupe import Dedupe class Purl: