keywords extraction test using spacy related to #331, fixes #329
sacca97 committed Nov 7, 2022
1 parent 1868a96 commit c913a1c
Showing 6 changed files with 141 additions and 125 deletions.
45 changes: 32 additions & 13 deletions prospector/client/cli/prospector_client.py
@@ -75,19 +75,38 @@ def prospector( # noqa: C901
filter_extensions,
)

# obtain a repository object
repository = Git(repository_url, git_cache)

# retrieve the commit candidates
candidates = get_candidates(
advisory_record,
repository,
tag_interval,
version_interval,
time_limit_before,
time_limit_after,
limit_candidates,
)
with ConsoleWriter("Obtaining initial set of candidates") as writer:

# obtain a repository object
repository = Git(repository_url, git_cache)

# retrieve the commit candidates
candidates = get_candidates(
advisory_record,
repository,
tag_interval,
version_interval,
time_limit_before,
time_limit_after,
filter_extensions[0],
)
_logger.debug(f"Collected {len(candidates)} candidates")

if len(candidates) > limit_candidates:
_logger.error(
"Number of candidates exceeds %d, aborting." % limit_candidates
)
_logger.error(
"Possible cause: the backend might be unreachable or otherwise unable to provide details about the advisory."
)
writer.print(
f"Found {len(candidates)} candidates, too many to proceed.",
status=MessageStatus.ERROR,
)
writer.print("Please try running the tool again.")
sys.exit(-1)

writer.print(f"Found {len(candidates)} candidates")

with ExecutionTimer(
core_statistics.sub_collection("commit preprocessing")
32 changes: 11 additions & 21 deletions prospector/datamodel/advisory.py
Expand Up @@ -8,13 +8,14 @@

import requests
from pydantic import BaseModel, Field
import spacy

from log.logger import logger, pretty_log, get_level
from util.http import fetch_url

from .nlp import (
extract_affected_filenames,
extract_words_from_text,
extract_nouns_from_text,
extract_products,
extract_versions,
)
@@ -81,33 +82,22 @@ class AdvisoryRecord(BaseModel):
# self.nvd_rest_endpoint = nvd_rest_endpoint

def analyze(
self,
use_nvd: bool = False,
fetch_references: bool = False,
relevant_extensions: List[str] = [],
self, use_nvd: bool = False, fetch_references: bool = False, relevant_extensions: List[str] = []
):
self.from_nvd = use_nvd
if self.from_nvd:
self.get_advisory(self.vulnerability_id, self.nvd_rest_endpoint)

# Union of the lists; this also removes duplicates...
self.versions.extend(extract_versions(self.description))
self.versions = list(set(self.versions))
# = union_of(self.versions, extract_versions(self.description))
self.affected_products.extend(extract_products(self.description))
self.affected_products = list(set(self.affected_products))

# = union_of(
# self.affected_products, extract_products(self.description)
# )
self.versions = union_of(self.versions, extract_versions(self.description))
self.affected_products = union_of(
self.affected_products, extract_products(self.description)
)
# TODO: use a set where possible to speed up the rule application time
self.files.update(
extract_affected_filenames(self.description)
# TODO: this could be done on the words extracted from the description
self.paths.update(
extract_affected_filenames(self.description, relevant_extensions) # TODO: this could be done on the words extracted from the description
)
# print(self.files)

self.keywords.update(extract_words_from_text(self.description))
self.keywords.update(extract_nouns_from_text(self.description))

logger.debug("References: " + str(self.references))
self.references = [
@@ -237,7 +227,7 @@ def build_advisory_record(
advisory_record.keywords.update(advisory_keywords)

if len(modified_files) > 0:
advisory_record.files.update(modified_files)
advisory_record.paths.update(modified_files)

logger.debug(f"{advisory_record.keywords=}")
logger.debug(f"{advisory_record.files=}")
35 changes: 14 additions & 21 deletions prospector/datamodel/nlp.py
@@ -1,10 +1,8 @@
import os
import re
from typing import Dict, List, Set
import requests

# from util.http import extract_from_webpage, fetch_url, get_from_xml
from spacy import load
from typing import Dict, List, Set, Tuple
from util.http import extract_from_webpage, fetch_url
from spacy import Language, load
from datamodel.constants import RELEVANT_EXTENSIONS
from util.http import extract_from_webpage, get_from_xml

@@ -35,25 +33,20 @@ def extract_special_terms(description: str) -> Set[str]:
return tuple(result)


def extract_words_from_text(text: str) -> Set[str]:
"""Use spacy to extract "relevant words" from text"""
# Lemmatization
return set(
[
token.lemma_.casefold()
for token in nlp(text)
if token.pos_ in ("NOUN", "VERB", "PROPN") and len(token.lemma_) > 3
]
)
def extract_nouns_from_text(text: str) -> List[str]:
"""Use spacy to extract nouns from text"""
return [
token.text
for token in nlp(text)
if token.pos_ == "NOUN" and len(token.text) > 3
]


def find_similar_words(adv_words: Set[str], commit_msg: str, exclude: str) -> Set[str]:
def extract_similar_words(
adv_words: Set[str], commit_msg: str, blocklist: Set[str]
) -> List[str]:
"""Extract nouns from commit message that appears in the advisory text"""
commit_words = {
word for word in extract_words_from_text(commit_msg) if word not in exclude
}
return commit_words.intersection(adv_words)
# return [word for word in extract_words_from_text(commit_msg) if word in adv_words]
return [word for word in extract_nouns_from_text(commit_msg) if word in adv_words]
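For reference, a usage sketch of the two helpers above, assuming the module-level nlp = load("en_core_web_sm") that this hunk does not show; the exact tokens returned depend on the model:

adv_nouns = set(extract_nouns_from_text("A crafted request token bypasses validation"))
# plausibly {"request", "token", "validation"}
overlap = extract_similar_words(adv_nouns, "Validate the request token before use", set())
# plausibly ["request", "token"]; note the blocklist argument is accepted but unused in the code shown above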


def extract_versions(text: str) -> List[str]:
20 changes: 13 additions & 7 deletions prospector/datamodel/nlp_test.py
@@ -5,17 +5,23 @@
extract_ghissue_references,
extract_jira_references,
extract_affected_filenames,
find_similar_words,
extract_similar_words,
extract_special_terms,
)


def test_extract_similar_words():
commit_msg = "Is this an advisory message?"
adv_text = "This is an advisory description message"
similarities = find_similar_words(
set(adv_text.casefold().split()), commit_msg, "simola"
)
assert similarities.pop() == "message"
commit_msg = "This is a commit message"
adv_text = "This is an advisory text"
similarities = extract_similar_words(adv_text, commit_msg, set())
assert sorted(similarities) == sorted(["This"])  # list.sort() returns None, so comparing .sort() results would always pass


@pytest.mark.skip(reason="Outdated")
def test_adv_record_path_extraction_no_real_paths():
result = extract_affected_filenames(ADVISORY_TEXT_1)

assert result == []


ADVISORY_TEXT_1 = """CXF supports (via JwtRequestCodeFilter) passing OAuth 2 parameters via a JWT token as opposed to query parameters (see: The OAuth 2.0 Authorization Framework: JWT Secured Authorization Request (JAR)). Instead of sending a JWT token as a "request" parameter, the spec also supports specifying a URI from which to retrieve a JWT token from via the "request_uri" parameter. CXF was not validating the "request_uri" parameter (apart from ensuring it uses "https) and was making a REST request to the parameter in the request to retrieve a token. This means that CXF was vulnerable to DDos attacks on the authorization server, as specified in section 10.4.1 of the spec. This issue affects Apache CXF versions prior to 3.4.3; Apache CXF versions prior to 3.3.10."""
2 changes: 2 additions & 0 deletions prospector/rules/helpers.py
@@ -1,6 +1,8 @@
from typing import Dict, Set

import pandas
from spacy import load
import spacy

from datamodel.advisory import AdvisoryRecord
from datamodel.commit import Commit
132 changes: 69 additions & 63 deletions prospector/rules/rules.py
@@ -4,7 +4,7 @@

from datamodel.advisory import AdvisoryRecord
from datamodel.commit import Commit
from datamodel.nlp import find_similar_words
from datamodel.nlp import extract_similar_words
from rules.helpers import (
extract_commit_mentioned_in_linked_pages,
extract_security_keywords,
@@ -142,9 +142,16 @@ def apply(self, candidate: Commit, advisory_record: AdvisoryRecord):
return True
return False

def apply_rule_adv_keywords_in_msg(
candidate: Commit, advisory_record: AdvisoryRecord
) -> str:
"""Matches commits whose message contain any of the special "code tokens" extracted from the advisory."""
explanation_template = "The commit message includes the following keywords: {}"

class AdvKeywordsInMsg(Rule):
"""Matches commits whose message contain any of the keywords extracted from the advisory."""
matching_keywords = set(extract_similar_words(advisory_record.keywords, candidate.message, set()))
# matching_keywords = set(
# [kw for kw in advisory_record.keywords if kw in candidate.message]
# )

def apply(self, candidate: Commit, advisory_record: AdvisoryRecord):
matching_keywords = find_similar_words(
@@ -158,8 +165,30 @@ def apply(self, candidate: Commit, advisory_record: AdvisoryRecord):


# TODO: with proper filename and msg search this could be deprecated ?
class AdvKeywordsInDiffs(Rule):
"""Matches commits whose diffs contain any of the keywords extracted from the advisory."""
def apply_rule_adv_keywords_in_diff(
candidate: Commit, advisory_record: AdvisoryRecord
) -> str:
"""Matches commits whose diff contain any of the special "code tokens" extracted from the advisory."""
return None
# FIXME: this is hardcoded; read it from a "config" object passed to the rule function
skip_tokens = ["IO"]

explanation_template = "The commit diff includes the following keywords: {}"

matching_keywords = set(
[
kw
for kw in advisory_record.keywords
for diff_line in candidate.diff
if kw in diff_line and kw not in skip_tokens
]
)

if len(matching_keywords):
return explanation_template.format(", ".join(matching_keywords))

return None


def apply(self, candidate: Commit, advisory_record: AdvisoryRecord):
return False
@@ -290,61 +319,38 @@ def apply(self, candidate: Commit, advisory_record: AdvisoryRecord):

class SmallCommit(Rule):
"""Matches small commits (i.e., they modify a small number of contiguous lines of code)."""

def apply(self, candidate: Commit, _: AdvisoryRecord):
return False
if candidate.get_hunks() < 10: # 10
self.message = (
f"This commit modifies only {candidate.hunks} contiguous lines of code"
)
return True
return False


# TODO: implement properly
class CommitMentionedInReference(Rule):
"""Matches commits that are mentioned in any of the links contained in the advisory page."""

def apply(self, candidate: Commit, advisory_record: AdvisoryRecord):
if extract_commit_mentioned_in_linked_pages(candidate, advisory_record):
self.message = "A page linked in the advisory mentions this commit"

return True
return False


class CommitHasTwins(Rule):
def apply(self, candidate: Commit, _: AdvisoryRecord) -> bool:
if not Rule.lsh_index.is_empty():
# TODO: the twin search must be done at the beginning, in the raw commits

candidate.twins = Rule.lsh_index.query(decode_minhash(candidate.minhash))
candidate.twins.remove(candidate.commit_id)
# self.lsh_index.insert(candidate.commit_id, decode_minhash(candidate.minhash))
if len(candidate.twins) > 0:
self.message = (
f"This commit has one or more twins: {', '.join(candidate.twins)}"
)
return True
return False
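Rule.lsh_index and decode_minhash are defined outside this diff. A minimal sketch of the underlying twin-detection technique using the datasketch library (an assumption; the project may wrap it differently):

from datasketch import MinHash, MinHashLSH

def make_minhash(msg: str) -> MinHash:
    m = MinHash(num_perm=128)
    for token in msg.split():
        m.update(token.encode("utf8"))
    return m

lsh = MinHashLSH(threshold=0.8, num_perm=128)
lsh.insert("commit_a", make_minhash("fix buffer overflow in parser"))
# a near-identical message lands in the same buckets, so the stored key likely comes back
print(lsh.query(make_minhash("fix buffer overflow in the parser")))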


RULES = [
CveIdInMessage("CVE_ID_IN_MESSAGE", 20),
CommitMentionedInAdv("COMMIT_IN_ADVISORY", 20),
CrossReferencedJiraLink("CROSS_REFERENCED_JIRA_LINK", 20),
CrossReferencedGhLink("CROSS_REFERENCED_GH_LINK", 20),
CommitMentionedInReference("COMMIT_IN_REFERENCE", 9),
CveIdInLinkedIssue("CVE_ID_IN_LINKED_ISSUE", 9),
ChangesRelevantFiles("CHANGES_RELEVANT_FILES", 9),
AdvKeywordsInDiffs("ADV_KEYWORDS_IN_DIFFS", 5),
AdvKeywordsInFiles("ADV_KEYWORDS_IN_FILES", 5),
AdvKeywordsInMsg("ADV_KEYWORDS_IN_MSG", 5),
SecurityKeywordsInMsg("SEC_KEYWORDS_IN_MESSAGE", 5),
SecurityKeywordInLinkedGhIssue("SEC_KEYWORDS_IN_LINKED_GH", 5),
SecurityKeywordInLinkedJiraIssue("SEC_KEYWORDS_IN_LINKED_JIRA", 5),
ReferencesGhIssue("GITHUB_ISSUE_IN_MESSAGE", 2),
ReferencesJiraIssue("JIRA_ISSUE_IN_MESSAGE", 2),
SmallCommit("SMALL_COMMIT", 0),
CommitHasTwins("COMMIT_HAS_TWINS", 5),
]
return None
# unreachable code
MAX_HUNKS = 10
explanation_template = (
"This commit modifies only {} hunks (groups of contiguous lines of code)"
)

if candidate.hunk_count <= MAX_HUNKS:
return explanation_template.format(candidate.hunk_count)

return None


RULES = {
"CVE_ID_IN_COMMIT_MSG": Rule(apply_rule_cve_id_in_msg, 10),
"TOKENS_IN_DIFF": Rule(apply_rule_adv_keywords_in_diff, 7),
"TOKENS_IN_COMMIT_MSG": Rule(apply_rule_adv_keywords_in_msg, 5),
"TOKENS_IN_MODIFIED_PATHS": Rule(apply_rule_adv_keywords_in_paths, 10),
"SEC_KEYWORD_IN_COMMIT_MSG": Rule(apply_rule_security_keyword_in_msg, 5),
"GH_ISSUE_IN_COMMIT_MSG": Rule(apply_rule_references_ghissue, 2),
"JIRA_ISSUE_IN_COMMIT_MSG": Rule(apply_rule_references_jira_issue, 2),
"CHANGES_RELEVANT_FILE": Rule(apply_rule_changes_relevant_file, 8),
"COMMIT_IN_ADV": Rule(apply_rule_commit_mentioned_in_adv, 10),
"COMMIT_IN_REFERENCE": Rule(apply_rule_commit_mentioned_in_reference, 9),
"VULN_IN_LINKED_ISSUE": Rule(apply_rule_vuln_mentioned_in_linked_issue, 9),
"SEC_KEYWORD_IN_LINKED_GH": Rule(apply_rule_security_keyword_in_linked_gh, 5),
"SEC_KEYWORD_IN_LINKED_JIRA": Rule(apply_rule_security_keyword_in_linked_jira, 5),
"JIRA_ISSUE_IN_COMMIT_MSG_AND_ADV": Rule(
apply_rule_jira_issue_in_commit_msg_and_adv, 9
),
"GH_ISSUE_IN_COMMIT_MSG_AND_ADV": Rule(
apply_rule_gh_issue_in_commit_msg_and_adv, 9
),
"SMALL_COMMIT": Rule(apply_rule_small_commit, 0),
}
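The Rule wrapper pairing each apply function with a relevance weight is not part of this diff. A plausible minimal shape, inferred from the constructor calls above:

from typing import Callable, NamedTuple, Optional

class Rule(NamedTuple):
    # (candidate, advisory_record) -> explanation string, or None when the rule does not fire
    apply: Callable[..., Optional[str]]
    # weight added to a candidate's relevance score when the rule fires
    relevance: int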
