Keyword extraction test using spaCy; related to #331, fixes #329
sacca97 authored and copernico committed Dec 12, 2022
1 parent d2d2b79 commit 3e8efca
Showing 7 changed files with 225 additions and 179 deletions.
38 changes: 22 additions & 16 deletions prospector/client/cli/main.py
@@ -42,13 +42,19 @@ def main(argv): # noqa: C901
logger.setLevel(config.log_level)
logger.info(f"Global log level set to {get_level(string=True)}")

if config.cve_id is None:
logger.error("No vulnerability id was specified. Cannot proceed.")
console.print(
"No vulnerability id was specified. Cannot proceed.",
status=MessageStatus.ERROR,
)
return
nvd_rest_endpoint = configuration.get("nvd_rest_endpoint", "") # default ???

backend = args.backend or configuration.get("backend", DEFAULT_BACKEND) # ???

use_backend = args.use_backend

if args.ping:
return ping_backend(backend, log.config.level < logging.INFO)

vulnerability_id = args.vulnerability_id
repository_url = args.repository
vuln_descr = args.descr
filter_extensions = args.filter_extensions.split(",")

# if args.get("ping"):
# return ping_backend(backend, get_level() < logging.INFO)
@@ -70,15 +76,15 @@ def main(argv): # noqa: C901
logger.debug(f"time-limit after: {time_limit_after}")

results, advisory_record = prospector(
vulnerability_id=config.cve_id,
repository_url=config.repository,
publication_date=config.pub_date,
vuln_descr=config.description,
tag_interval=config.tag_interval,
filter_extensions=config.filter_extensions.split(","),
version_interval=config.version_interval,
modified_files=set(config.modified_files.split(",")),
advisory_keywords=set(config.keywords.split(",")),
vulnerability_id=vulnerability_id,
repository_url=repository_url,
publication_date=publication_date,
vuln_descr=vuln_descr,
tag_interval=tag_interval,
filter_extensions=filter_extensions,
version_interval=version_interval,
modified_files=set(modified_files),
advisory_keywords=set(advisory_keywords),
time_limit_before=time_limit_before,
time_limit_after=time_limit_after,
use_nvd=config.use_nvd,
45 changes: 32 additions & 13 deletions prospector/client/cli/prospector_client.py
@@ -72,19 +72,38 @@ def prospector( # noqa: C901
filter_extensions,
)

# obtain a repository object
repository = Git(repository_url, git_cache)

# retrieve of commit candidates
candidates = get_candidates(
advisory_record,
repository,
tag_interval,
version_interval,
time_limit_before,
time_limit_after,
limit_candidates,
)
with ConsoleWriter("Obtaining initial set of candidates") as writer:

# obtain a repository object
repository = Git(repository_url, git_cache)

# retrieve of commit candidates
candidates = get_candidates(
advisory_record,
repository,
tag_interval,
version_interval,
time_limit_before,
time_limit_after,
filter_extensions[0],
)
_logger.debug(f"Collected {len(candidates)} candidates")

if len(candidates) > limit_candidates:
_logger.error(
"Number of candidates exceeds %d, aborting." % limit_candidates
)
_logger.error(
"Possible cause: the backend might be unreachable or otherwise unable to provide details about the advisory."
)
writer.print(
f"Found {len(candidates)} candidates, too many to proceed.",
status=MessageStatus.ERROR,
)
writer.print("Please try running the tool again.")
sys.exit(-1)

writer.print(f"Found {len(candidates)} candidates")

with ExecutionTimer(
core_statistics.sub_collection("commit preprocessing")
118 changes: 63 additions & 55 deletions prospector/datamodel/advisory.py
@@ -4,12 +4,19 @@
from urllib.parse import urlparse

import requests
from dateutil.parser import isoparse
from pydantic import BaseModel, Field
import spacy

from log.logger import get_level, logger, pretty_log
from util.http import fetch_url

from .nlp import extract_affected_filenames, extract_products, extract_words_from_text
from .nlp import (
extract_affected_filenames,
extract_nouns_from_text,
extract_products,
extract_special_terms,
extract_versions,
)

ALLOWED_SITES = [
"github.com",
Expand Down Expand Up @@ -38,58 +45,58 @@


LOCAL_NVD_REST_ENDPOINT = "http://localhost:8000/nvd/vulnerabilities/"
NVD_REST_ENDPOINT = "https://services.nvd.nist.gov/rest/json/cves/2.0"
NVD_API_KEY = os.getenv("NVD_API_KEY", "")


class AdvisoryRecord:
"""The advisory record captures all relevant information on the vulnerability advisory"""

def __init__(
self,
cve_id: str,
description: str = "",
published_timestamp: int = 0,
last_modified_timestamp: int = 0,
references: List[str] = None,
references_content: List[str] = None,
affected_products: List[str] = None,
versions: List[Tuple[str, str]] = None,
files: Set[str] = None,
keywords: Set[str] = None,
):
self.cve_id = cve_id
self.description = description
self.published_timestamp = published_timestamp
self.last_modified_timestamp = last_modified_timestamp
self.references = references or list()
self.references_content = references_content or list()
self.affected_products = affected_products or list()
self.versions = versions or list()
self.files = files or set()
self.keywords = keywords or set()
NVD_REST_ENDPOINT = "https://services.nvd.nist.gov/rest/json/cves/2.0?cveId="


# TODO: refactor and clean
class AdvisoryRecord(BaseModel):
"""
The advisory record captures all relevant information on the vulnerability advisory
"""

vulnerability_id: str
repository_url: str = ""
published_timestamp: int = 0
last_modified_timestamp: int = 0
references: List[str] = Field(default_factory=list)
references_content: List[str] = Field(default_factory=list)
affected_products: List[str] = Field(default_factory=list)
description: Optional[str] = ""
preprocessed_vulnerability_description: str = ""
relevant_tags: List[str] = None
versions: List[str] = Field(default_factory=list)
from_nvd: bool = False
nvd_rest_endpoint: str = LOCAL_NVD_REST_ENDPOINT
paths: Set[str] = Field(default_factory=set)
keywords: Set[str] = Field(default_factory=set)

# def __init__(self, vulnerability_id, repository_url, from_nvd, nvd_rest_endpoint):
# self.vulnerability_id = vulnerability_id
# self.repository_url = repository_url
# self.from_nvd = from_nvd
# self.nvd_rest_endpoint = nvd_rest_endpoint

def analyze(
self,
fetch_references: bool = False,
self, use_nvd: bool = False, fetch_references: bool = False, relevant_extensions: List[str] = []
):
self.versions = [
version for version in self.versions if version[0] != version[1]
]
# self.versions.extend(extract_versions(self.description))
# self.versions = list(set(self.versions))

self.affected_products.extend(extract_products(self.description))
self.affected_products = list(set(self.affected_products))

# TODO: this could be done on the words extracted from the description
self.files.update(extract_affected_filenames(self.description))

self.keywords.update(extract_words_from_text(self.description))
self.versions = union_of(self.versions, extract_versions(self.description))
self.affected_products = union_of(
self.affected_products, extract_products(self.description)
)
# TODO: use a set where possible to speed up the rule application time
self.paths.update(
extract_affected_filenames(self.description, relevant_extensions) # TODO: this could be done on the words extracted from the description
)

logger.debug("References: " + str(self.references))
# TODO: misses something because of subdomains not considered e.g. lists.apache.org
self.keywords.update(extract_nouns_from_text(self.description))

_logger.debug("References: " + str(self.references))
self.references = [
r
for r in self.references
@@ -156,15 +163,16 @@ def get_from_local(vuln_id: str, nvd_rest_endpoint: str = LOCAL_NVD_REST_ENDPOIN


def build_advisory_record(
cve_id: str,
description: str = None,
nvd_rest_endpoint: str = None,
fetch_references: bool = False,
use_nvd: bool = True,
publication_date: str = None,
advisory_keywords: Set[str] = None,
modified_files: Set[str] = None,
filter_extensions: List[str] = None,
vulnerability_id: str,
repository_url: str,
vuln_descr: str,
nvd_rest_endpoint: str,
fetch_references: bool,
use_nvd: bool,
publication_date: str,
advisory_keywords: Set[str],
modified_files: Set[str],
filter_extensions: List[str],
) -> AdvisoryRecord:

advisory_record = AdvisoryRecord(
@@ -187,11 +195,11 @@ def build_advisory_record(
isoparse(publication_date).timestamp()
)

if advisory_keywords and len(advisory_keywords) > 0:
if len(advisory_keywords) > 0:
advisory_record.keywords.update(advisory_keywords)

if modified_files and len(modified_files) > 0:
advisory_record.files.update(modified_files)
if len(modified_files) > 0:
advisory_record.paths.update(modified_files)

logger.debug(f"{advisory_record.keywords=}")
logger.debug(f"{advisory_record.files=}")
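A side note on the pydantic conversion above: list- and set-valued fields are declared with Field(default_factory=...) so every AdvisoryRecord instance gets its own fresh container rather than a shared default. A minimal sketch of that pattern, using a hypothetical trimmed-down model (not the full AdvisoryRecord) and illustrative CVE ids:

from typing import List, Set

from pydantic import BaseModel, Field


class Record(BaseModel):
    vulnerability_id: str
    references: List[str] = Field(default_factory=list)  # fresh list per instance
    keywords: Set[str] = Field(default_factory=set)  # fresh set per instance


first = Record(vulnerability_id="CVE-0000-0001")  # illustrative id, not a real advisory
first.keywords.update({"authentication", "bypass"})

second = Record(vulnerability_id="CVE-0000-0002")
print(first.keywords)   # {'authentication', 'bypass'}
print(second.keywords)  # set() -- defaults are not shared between instances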
39 changes: 16 additions & 23 deletions prospector/datamodel/nlp.py
@@ -1,19 +1,17 @@
import os
import re
from typing import Dict, List, Set

import requests

# from util.http import extract_from_webpage, fetch_url, get_from_xml
from spacy import load

from typing import Dict, List, Set, Tuple
from util.http import extract_from_webpage, fetch_url
from spacy import Language, load
from datamodel.constants import RELEVANT_EXTENSIONS
from util.http import get_from_xml

JIRA_ISSUE_URL = "https://issues.apache.org/jira/browse/"
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")


nlp = load("en_core_web_sm")

nlp = load("en_core_web_sm")


@@ -37,25 +35,20 @@ def extract_special_terms(description: str) -> Set[str]:
return tuple(result)


def extract_words_from_text(text: str) -> Set[str]:
"""Use spacy to extract "relevant words" from text"""
# Lemmatization
return set(
[
token.lemma_.casefold()
for token in nlp(text)
if token.pos_ in ("NOUN", "VERB", "PROPN") and len(token.lemma_) > 3
]
)
def extract_nouns_from_text(text: str) -> List[str]:
"""Use spacy to extract nouns from text"""
return [
token.text
for token in nlp(text)
if token.pos_ == "NOUN" and len(token.text) > 3
]


def find_similar_words(adv_words: Set[str], commit_msg: str, exclude: str) -> Set[str]:
def extract_similar_words(
adv_words: Set[str], commit_msg: str, blocklist: Set[str]
) -> List[str]:
"""Extract nouns from commit message that appears in the advisory text"""
commit_words = {
word for word in extract_words_from_text(commit_msg) if word not in exclude
}
return commit_words.intersection(adv_words)
# return [word for word in extract_words_from_text(commit_msg) if word in adv_words]
return [word for word in extract_nouns_from_text(commit_msg) if word in adv_words]


def extract_versions(text: str) -> List[str]:
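For reference, a minimal sketch of how the spaCy-based helpers above behave, assuming the en_core_web_sm model is installed (python -m spacy download en_core_web_sm). The sample sentences are illustrative and the local helper nouns() is a stand-in for extract_nouns_from_text; the exact tokens returned depend on the model's tagging:

from spacy import load

nlp = load("en_core_web_sm")


def nouns(text: str) -> list:
    # Same filter as extract_nouns_from_text: noun tokens longer than 3 characters
    return [t.text for t in nlp(text) if t.pos_ == "NOUN" and len(t.text) > 3]


advisory = "A crafted request lets remote attackers bypass authentication in the login filter."
commit_msg = "Fix authentication handling in the request filter"

adv_words = set(nouns(advisory))
# Mirrors extract_similar_words: nouns of the commit message that also occur in the advisory
shared = [w for w in nouns(commit_msg) if w in adv_words]
print(shared)  # e.g. ['authentication', 'request', 'filter'] -- exact output depends on the model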
22 changes: 13 additions & 9 deletions prospector/datamodel/nlp_test.py
@@ -1,18 +1,22 @@
import pytest

from .nlp import (
extract_affected_filenames,
extract_ghissue_references,
extract_jira_references,
find_similar_words,
extract_similar_words,
extract_special_terms,
)


def test_extract_similar_words():
commit_msg = "Is this an advisory message?"
adv_text = "This is an advisory description message"
similarities = find_similar_words(
set(adv_text.casefold().split()), commit_msg, "simola"
)
assert similarities.pop() == "message"
commit_msg = "This is a commit message"
adv_text = "This is an advisory text"
similarities = extract_similar_words(adv_text, commit_msg, set())
assert similarities.sort() == ["This"].sort()


@pytest.mark.skip(reason="Outdated")
def test_adv_record_path_extraction_no_real_paths():
result = extract_affected_filenames(ADVISORY_TEXT_1)

assert result == []


ADVISORY_TEXT_1 = """CXF supports (via JwtRequestCodeFilter) passing OAuth 2 parameters via a JWT token as opposed to query parameters (see: The OAuth 2.0 Authorization Framework: JWT Secured Authorization Request (JAR)). Instead of sending a JWT token as a "request" parameter, the spec also supports specifying a URI from which to retrieve a JWT token from via the "request_uri" parameter. CXF was not validating the "request_uri" parameter (apart from ensuring it uses "https) and was making a REST request to the parameter in the request to retrieve a token. This means that CXF was vulnerable to DDos attacks on the authorization server, as specified in section 10.4.1 of the spec. This issue affects Apache CXF versions prior to 3.4.3; Apache CXF versions prior to 3.3.10."""
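Note that in the rewritten test above, list.sort() returns None, so the final assertion compares None with None and always passes. A hedged sketch of a stricter check, assuming extract_similar_words takes a set of advisory words as its first argument (as its signature suggests) and that en_core_web_sm is available; the test name and sample strings are illustrative:

def test_extract_similar_words_shared_nouns():
    commit_msg = "Fix authentication bypass in the login filter"
    adv_words = {"authentication", "bypass", "filter"}
    similarities = extract_similar_words(adv_words, commit_msg, set())
    # Every reported word must come from the advisory vocabulary
    assert set(similarities) <= adv_words
    # The commit message and advisory share at least one noun
    # (spaCy typically tags "authentication" and "filter" as nouns)
    assert len(similarities) > 0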
2 changes: 2 additions & 0 deletions prospector/rules/helpers.py
@@ -1,6 +1,8 @@
from typing import Dict, Set

import pandas
from spacy import load
import spacy

from datamodel.advisory import AdvisoryRecord
from datamodel.commit import Commit