Skip to content

Commit

Permalink
Keyword extraction test using spaCy; related to #331, fixes #329
Browse files Browse the repository at this point in the history
  • Loading branch information
sacca97 authored and copernico committed Oct 10, 2022
1 parent bb6b693 commit 15e42f8
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 71 deletions.
6 changes: 3 additions & 3 deletions prospector/client/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ def main(argv): # noqa: C901
vulnerability_id = args.vulnerability_id
repository_url = args.repository
vuln_descr = args.descr
filter_extensions = args.filter_extensions
filter_extensions = args.filter_extensions.split(",")

# if no backend the filters on the advisory do not work
use_nvd = False
Expand Down Expand Up @@ -291,8 +291,8 @@ def main(argv): # noqa: C901
tag_interval=tag_interval,
filter_extensions=filter_extensions,
version_interval=version_interval,
modified_files=modified_files,
advisory_keywords=advisory_keywords,
modified_files=set(modified_files),
advisory_keywords=set(advisory_keywords),
time_limit_before=time_limit_before,
time_limit_after=time_limit_after,
use_nvd=use_nvd,
Expand Down
4 changes: 2 additions & 2 deletions prospector/client/cli/prospector_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def prospector( # noqa: C901
publication_date: str = "",
vuln_descr: str = "",
tag_interval: str = "",
filter_extensions: str = "",
filter_extensions: List[str] = [],
version_interval: str = "",
modified_files: Set[str] = set(),
advisory_keywords: Set[str] = set(),
Expand Down Expand Up @@ -90,7 +90,7 @@ def prospector( # noqa: C901
version_interval,
time_limit_before,
time_limit_after,
filter_extensions,
filter_extensions[0],
)
_logger.debug(f"Collected {len(candidates)} candidates")

Expand Down
30 changes: 13 additions & 17 deletions prospector/datamodel/advisory.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@

import requests
from pydantic import BaseModel, Field
import spacy

import log.util
from util.collection import union_of
from util.http import fetch_url

from .nlp import (
extract_affected_filenames,
extract_nouns_from_text,
extract_products,
extract_special_terms,
extract_versions,
Expand Down Expand Up @@ -74,14 +76,15 @@ class AdvisoryRecord(BaseModel):
nvd_rest_endpoint: str = LOCAL_NVD_REST_ENDPOINT
paths: Set[str] = Field(default_factory=set)
keywords: Set[str] = Field(default_factory=set)

# def __init__(self, vulnerability_id, repository_url, from_nvd, nvd_rest_endpoint):
# self.vulnerability_id = vulnerability_id
# self.repository_url = repository_url
# self.from_nvd = from_nvd
# self.nvd_rest_endpoint = nvd_rest_endpoint

def analyze(
self, use_nvd: bool = False, fetch_references=False, relevant_extensions=[]
self, use_nvd: bool = False, fetch_references: bool = False, relevant_extensions: List[str] = []
):
self.from_nvd = use_nvd
if self.from_nvd:
Expand All @@ -91,17 +94,12 @@ def analyze(
self.affected_products = union_of(
self.affected_products, extract_products(self.description)
)

# TODO: use a set where possible to speed up the rule application time
self.paths.update(
extract_affected_filenames(self.description, relevant_extensions)
extract_affected_filenames(self.description, relevant_extensions) # TODO: this could be done on the words extracted from the description
)
# self.paths = union_of(
# self.paths,
# extract_affected_filenames(self.description, relevant_extensions),
# )
self.keywords.update(extract_special_terms(self.description))
# self.keywords = union_of(self.keywords, extract_special_terms(self.description))

self.keywords.update(extract_nouns_from_text(self.description))

_logger.debug("References: " + str(self.references))
self.references = [
Expand Down Expand Up @@ -202,10 +200,10 @@ def build_advisory_record(
nvd_rest_endpoint: str,
fetch_references: bool,
use_nvd: bool,
publication_date,
advisory_keywords,
modified_files,
filter_extensions,
publication_date: str,
advisory_keywords: Set[str],
modified_files: Set[str],
filter_extensions: List[str],
) -> AdvisoryRecord:

advisory_record = AdvisoryRecord(
Expand All @@ -230,12 +228,10 @@ def build_advisory_record(
)

if len(advisory_keywords) > 0:
advisory_record.keywords += tuple(advisory_keywords)
# drop duplicates
advisory_record.keywords = list(set(advisory_record.keywords))
advisory_record.keywords.update(advisory_keywords)

if len(modified_files) > 0:
advisory_record.paths += modified_files
advisory_record.paths.update(modified_files)

_logger.debug(f"{advisory_record.keywords=}")
_logger.debug(f"{advisory_record.paths=}")
Expand Down
28 changes: 17 additions & 11 deletions prospector/datamodel/nlp.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
import re
from typing import Dict, List, Set, Tuple
from util.http import extract_from_webpage, fetch_url


from spacy import Language, load
from datamodel.constants import RELEVANT_EXTENSIONS

JIRA_ISSUE_URL = "https://issues.apache.org/jira/browse/"

nlp = load("en_core_web_sm")


def extract_special_terms(description: str) -> Tuple[str, ...]:
def extract_special_terms(description: str) -> Set[str]:
"""
Extract all words (space delimited) which presumably cannot be part of an natural language sentence.
These are usually code fragments and names of code entities, or paths.
"""

return ()
return set()
# TODO replace this with NLP implementation
# see, https://github.com/SAP/project-kb/issues/256#issuecomment-927639866
# noinspection PyUnreachableCode
Expand All @@ -28,15 +29,20 @@ def extract_special_terms(description: str) -> Tuple[str, ...]:
return tuple(result)


def extract_nouns_from_text(text: str) -> List[str]:
    """Use spacy to extract nouns from text.

    Runs the module-level spaCy pipeline over *text* and collects the
    surface form of every token tagged as a noun, keeping only tokens
    longer than 3 characters (filters out short, low-signal words).
    """
    nouns: List[str] = []
    for token in nlp(text):
        if token.pos_ == "NOUN" and len(token.text) > 3:
            nouns.append(token.text)
    return nouns


def extract_similar_words(
adv_text: str, commit_msg: str, blocklist: Set[str]
adv_words: Set[str], commit_msg: str, blocklist: Set[str]
) -> List[str]:
output = set()
for word in commit_msg.split():
if word in adv_text and word.casefold() not in blocklist:
output.add(word)

return list(output)
"""Extract nouns from commit message that appears in the advisory text"""
return [word for word in extract_nouns_from_text(commit_msg) if word in adv_words]


def extract_versions(text: str) -> List[str]:
Expand Down
38 changes: 6 additions & 32 deletions prospector/datamodel/test_nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,42 +4,16 @@
extract_cve_references,
extract_jira_references,
extract_affected_filenames,
extract_similar_words,
extract_special_terms,
)


def test_extract_special_terms():
description = (
"org.apache.http.conn.ssl.AbstractVerifier in Apache HttpComponents HttpClient "
"before 4.3.5 and HttpAsyncClient before 4.0.2 does not properly verify that the "
"server hostname matches a domain name in the subject's Common Name (CN) or "
"subjectAltName field of the X.509 certificate, which allows man-in-the-middle "
'attackers to spoof SSL servers via a "CN=" string in a field in the distinguished '
'name (DN) of a certificate, as demonstrated by the "foo,CN=www.apache.org" string in '
"the O field."
)

terms = extract_special_terms(description)

# TODO replace when NLP implementation is done
# see, https://github.com/SAP/project-kb/issues/256#issuecomment-927639866
assert terms == () or terms == (
"org.apache.http.conn.ssl.AbstractVerifier",
"HttpComponents",
"HttpClient",
"4.3.5",
"HttpAsyncClient",
"4.0.2",
"subject's",
"(CN)",
"subjectAltName",
"X.509",
"man-in-the-middle",
"SSL",
'"CN="',
"(DN)",
'"foo,CN=www.apache.org"',
)
def test_extract_similar_words():
    """extract_similar_words should return the words that appear in both
    the commit message and the advisory text ("This" is the only overlap)."""
    commit_msg = "This is a commit message"
    adv_text = "This is an advisory text"
    similarities = extract_similar_words(adv_text, commit_msg, set())
    # BUG FIX: list.sort() sorts in place and returns None, so the previous
    # `similarities.sort() == ["This"].sort()` compared None == None and
    # passed regardless of what the function returned. Compare a sorted
    # copy against the expected list so the assertion actually checks.
    # NOTE(review): if this now fails, the *expected value* needs revisiting
    # (the old assertion was vacuous, so it never validated it) — e.g. the
    # new noun-based implementation may not treat "This" as a noun.
    assert sorted(similarities) == ["This"]


@pytest.mark.skip(reason="Outdated")
Expand Down
2 changes: 2 additions & 0 deletions prospector/rules/helpers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from typing import Dict, Set

import pandas
from spacy import load
import spacy

from datamodel.advisory import AdvisoryRecord
from datamodel.commit import Commit
Expand Down
13 changes: 7 additions & 6 deletions prospector/rules/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from datamodel.advisory import AdvisoryRecord
from datamodel.commit import Commit
from datamodel.nlp import extract_similar_words
from rules.helpers import (
extract_commit_mentioned_in_linked_pages,
extract_references_vuln_id,
Expand Down Expand Up @@ -176,16 +177,16 @@ def apply_rule_changes_relevant_file(
return None


# TODO: this is empty now
def apply_rule_adv_keywords_in_msg(
candidate: Commit, advisory_record: AdvisoryRecord
) -> str:
"""Matches commits whose message contain any of the special "code tokens" extracted from the advisory."""
explanation_template = "The commit message includes the following keywords: {}"

matching_keywords = set(
[kw for kw in advisory_record.keywords if kw in candidate.message]
)
matching_keywords = set(extract_similar_words(advisory_record.keywords, candidate.message, set()))
# matching_keywords = set(
# [kw for kw in advisory_record.keywords if kw in candidate.message]
# )

if len(matching_keywords):
return explanation_template.format(", ".join(matching_keywords))
Expand All @@ -198,7 +199,7 @@ def apply_rule_adv_keywords_in_diff(
candidate: Commit, advisory_record: AdvisoryRecord
) -> str:
"""Matches commits whose diff contain any of the special "code tokens" extracted from the advisory."""

return None
# FIXME: this is hardcoded, read it from an "config" object passed to the rule function
skip_tokens = ["IO"]

Expand Down Expand Up @@ -394,7 +395,7 @@ def apply_rule_small_commit(candidate: Commit, advisory_record: AdvisoryRecord)
RULES = {
"CVE_ID_IN_COMMIT_MSG": Rule(apply_rule_cve_id_in_msg, 10),
"TOKENS_IN_DIFF": Rule(apply_rule_adv_keywords_in_diff, 7),
"TOKENS_IN_COMMIT_MSG": Rule(apply_rule_adv_keywords_in_msg, 10),
"TOKENS_IN_COMMIT_MSG": Rule(apply_rule_adv_keywords_in_msg, 5),
"TOKENS_IN_MODIFIED_PATHS": Rule(apply_rule_adv_keywords_in_paths, 10),
"SEC_KEYWORD_IN_COMMIT_MSG": Rule(apply_rule_security_keyword_in_msg, 5),
"GH_ISSUE_IN_COMMIT_MSG": Rule(apply_rule_references_ghissue, 2),
Expand Down

0 comments on commit 15e42f8

Please sign in to comment.