Skip to content

Commit

Permalink
Keyword extraction test using spaCy; related to #331, fixes #329
Browse files Browse the repository at this point in the history
  • Loading branch information
sacca97 authored and copernico committed Oct 10, 2022
1 parent bb6b693 commit 15e42f8
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 71 deletions.
6 changes: 3 additions & 3 deletions prospector/client/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ def main(argv): # noqa: C901
vulnerability_id = args.vulnerability_id
repository_url = args.repository
vuln_descr = args.descr
filter_extensions = args.filter_extensions
filter_extensions = args.filter_extensions.split(",")

# if no backend the filters on the advisory do not work
use_nvd = False
Expand Down Expand Up @@ -291,8 +291,8 @@ def main(argv): # noqa: C901
tag_interval=tag_interval,
filter_extensions=filter_extensions,
version_interval=version_interval,
modified_files=modified_files,
advisory_keywords=advisory_keywords,
modified_files=set(modified_files),
advisory_keywords=set(advisory_keywords),
time_limit_before=time_limit_before,
time_limit_after=time_limit_after,
use_nvd=use_nvd,
Expand Down
4 changes: 2 additions & 2 deletions prospector/client/cli/prospector_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def prospector( # noqa: C901
publication_date: str = "",
vuln_descr: str = "",
tag_interval: str = "",
filter_extensions: str = "",
filter_extensions: List[str] = [],
version_interval: str = "",
modified_files: Set[str] = set(),
advisory_keywords: Set[str] = set(),
Expand Down Expand Up @@ -90,7 +90,7 @@ def prospector( # noqa: C901
version_interval,
time_limit_before,
time_limit_after,
filter_extensions,
filter_extensions[0],
)
_logger.debug(f"Collected {len(candidates)} candidates")

Expand Down
30 changes: 13 additions & 17 deletions prospector/datamodel/advisory.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@

import requests
from pydantic import BaseModel, Field
import spacy

import log.util
from util.collection import union_of
from util.http import fetch_url

from .nlp import (
extract_affected_filenames,
extract_nouns_from_text,
extract_products,
extract_special_terms,
extract_versions,
Expand Down Expand Up @@ -74,14 +76,15 @@ class AdvisoryRecord(BaseModel):
nvd_rest_endpoint: str = LOCAL_NVD_REST_ENDPOINT
paths: Set[str] = Field(default_factory=set)
keywords: Set[str] = Field(default_factory=set)

# def __init__(self, vulnerability_id, repository_url, from_nvd, nvd_rest_endpoint):
# self.vulnerability_id = vulnerability_id
# self.repository_url = repository_url
# self.from_nvd = from_nvd
# self.nvd_rest_endpoint = nvd_rest_endpoint

def analyze(
self, use_nvd: bool = False, fetch_references=False, relevant_extensions=[]
self, use_nvd: bool = False, fetch_references: bool = False, relevant_extensions: List[str] = []
):
self.from_nvd = use_nvd
if self.from_nvd:
Expand All @@ -91,17 +94,12 @@ def analyze(
self.affected_products = union_of(
self.affected_products, extract_products(self.description)
)

# TODO: use a set where possible to speed up the rule application time
self.paths.update(
extract_affected_filenames(self.description, relevant_extensions)
extract_affected_filenames(self.description, relevant_extensions) # TODO: this could be done on the words extracted from the description
)
# self.paths = union_of(
# self.paths,
# extract_affected_filenames(self.description, relevant_extensions),
# )
self.keywords.update(extract_special_terms(self.description))
# self.keywords = union_of(self.keywords, extract_special_terms(self.description))

self.keywords.update(extract_nouns_from_text(self.description))

_logger.debug("References: " + str(self.references))
self.references = [
Expand Down Expand Up @@ -202,10 +200,10 @@ def build_advisory_record(
nvd_rest_endpoint: str,
fetch_references: bool,
use_nvd: bool,
publication_date,
advisory_keywords,
modified_files,
filter_extensions,
publication_date: str,
advisory_keywords: Set[str],
modified_files: Set[str],
filter_extensions: List[str],
) -> AdvisoryRecord:

advisory_record = AdvisoryRecord(
Expand All @@ -230,12 +228,10 @@ def build_advisory_record(
)

if len(advisory_keywords) > 0:
advisory_record.keywords += tuple(advisory_keywords)
# drop duplicates
advisory_record.keywords = list(set(advisory_record.keywords))
advisory_record.keywords.update(advisory_keywords)

if len(modified_files) > 0:
advisory_record.paths += modified_files
advisory_record.paths.update(modified_files)

_logger.debug(f"{advisory_record.keywords=}")
_logger.debug(f"{advisory_record.paths=}")
Expand Down
28 changes: 17 additions & 11 deletions prospector/datamodel/nlp.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
import re
from typing import Dict, List, Set, Tuple
from util.http import extract_from_webpage, fetch_url


from spacy import Language, load
from datamodel.constants import RELEVANT_EXTENSIONS

JIRA_ISSUE_URL = "https://issues.apache.org/jira/browse/"

nlp = load("en_core_web_sm")


def extract_special_terms(description: str) -> Tuple[str, ...]:
def extract_special_terms(description: str) -> Set[str]:
"""
Extract all words (space delimited) which presumably cannot be part of an natural language sentence.
These are usually code fragments and names of code entities, or paths.
"""

return ()
return set()
# TODO replace this with NLP implementation
# see, https://github.com/SAP/project-kb/issues/256#issuecomment-927639866
# noinspection PyUnreachableCode
Expand All @@ -28,15 +29,20 @@ def extract_special_terms(description: str) -> Tuple[str, ...]:
return tuple(result)


def extract_nouns_from_text(text: str) -> List[str]:
    """Use spacy to extract nouns from text.

    Runs the module-level spaCy pipeline over *text* and collects the
    surface form of every token tagged as a noun, keeping only tokens
    longer than 3 characters (filters out short, low-signal words).
    """
    nouns: List[str] = []
    for token in nlp(text):
        if token.pos_ == "NOUN" and len(token.text) > 3:
            nouns.append(token.text)
    return nouns


def extract_similar_words(
adv_text: str, commit_msg: str, blocklist: Set[str]
adv_words: Set[str], commit_msg: str, blocklist: Set[str]
) -> List[str]:
output = set()
for word in commit_msg.split():
if word in adv_text and word.casefold() not in blocklist:
output.add(word)

return list(output)
"""Extract nouns from commit message that appears in the advisory text"""
return [word for word in extract_nouns_from_text(commit_msg) if word in adv_words]


def extract_versions(text: str) -> List[str]:
Expand Down
38 changes: 6 additions & 32 deletions prospector/datamodel/test_nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,42 +4,16 @@
extract_cve_references,
extract_jira_references,
extract_affected_filenames,
extract_similar_words,
extract_special_terms,
)


def test_extract_special_terms():
description = (
"org.apache.http.conn.ssl.AbstractVerifier in Apache HttpComponents HttpClient "
"before 4.3.5 and HttpAsyncClient before 4.0.2 does not properly verify that the "
"server hostname matches a domain name in the subject's Common Name (CN) or "
"subjectAltName field of the X.509 certificate, which allows man-in-the-middle "
'attackers to spoof SSL servers via a "CN=" string in a field in the distinguished '
'name (DN) of a certificate, as demonstrated by the "foo,CN=www.apache.org" string in '
"the O field."
)

terms = extract_special_terms(description)

# TODO replace when NLP implementation is done
# see, https://github.com/SAP/project-kb/issues/256#issuecomment-927639866
assert terms == () or terms == (
"org.apache.http.conn.ssl.AbstractVerifier",
"HttpComponents",
"HttpClient",
"4.3.5",
"HttpAsyncClient",
"4.0.2",
"subject's",
"(CN)",
"subjectAltName",
"X.509",
"man-in-the-middle",
"SSL",
'"CN="',
"(DN)",
'"foo,CN=www.apache.org"',
)
def test_extract_similar_words():
    """extract_similar_words should return the words that appear in both
    the commit message and the advisory text ("This" is the only overlap)."""
    commit_msg = "This is a commit message"
    adv_text = "This is an advisory text"
    similarities = extract_similar_words(adv_text, commit_msg, set())
    # BUG FIX: list.sort() sorts in place and returns None, so the previous
    # `similarities.sort() == ["This"].sort()` compared None == None and
    # passed regardless of what the function returned. Compare a sorted
    # copy against the expected list so the assertion actually checks.
    # NOTE(review): if this now fails, the *expected value* needs revisiting
    # (the old assertion was vacuous, so it never validated it) — e.g. the
    # new noun-based implementation may not treat "This" as a noun.
    assert sorted(similarities) == ["This"]


@pytest.mark.skip(reason="Outdated")
Expand Down
2 changes: 2 additions & 0 deletions prospector/rules/helpers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from typing import Dict, Set

import pandas
from spacy import load
import spacy

from datamodel.advisory import AdvisoryRecord
from datamodel.commit import Commit
Expand Down
13 changes: 7 additions & 6 deletions prospector/rules/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from datamodel.advisory import AdvisoryRecord
from datamodel.commit import Commit
from datamodel.nlp import extract_similar_words
from rules.helpers import (
extract_commit_mentioned_in_linked_pages,
extract_references_vuln_id,
Expand Down Expand Up @@ -176,16 +177,16 @@ def apply_rule_changes_relevant_file(
return None


# TODO: this is empty now
def apply_rule_adv_keywords_in_msg(
candidate: Commit, advisory_record: AdvisoryRecord
) -> str:
"""Matches commits whose message contain any of the special "code tokens" extracted from the advisory."""
explanation_template = "The commit message includes the following keywords: {}"

matching_keywords = set(
[kw for kw in advisory_record.keywords if kw in candidate.message]
)
matching_keywords = set(extract_similar_words(advisory_record.keywords, candidate.message, set()))
# matching_keywords = set(
# [kw for kw in advisory_record.keywords if kw in candidate.message]
# )

if len(matching_keywords):
return explanation_template.format(", ".join(matching_keywords))
Expand All @@ -198,7 +199,7 @@ def apply_rule_adv_keywords_in_diff(
candidate: Commit, advisory_record: AdvisoryRecord
) -> str:
"""Matches commits whose diff contain any of the special "code tokens" extracted from the advisory."""

return None
# FIXME: this is hardcoded, read it from an "config" object passed to the rule function
skip_tokens = ["IO"]

Expand Down Expand Up @@ -394,7 +395,7 @@ def apply_rule_small_commit(candidate: Commit, advisory_record: AdvisoryRecord)
RULES = {
"CVE_ID_IN_COMMIT_MSG": Rule(apply_rule_cve_id_in_msg, 10),
"TOKENS_IN_DIFF": Rule(apply_rule_adv_keywords_in_diff, 7),
"TOKENS_IN_COMMIT_MSG": Rule(apply_rule_adv_keywords_in_msg, 10),
"TOKENS_IN_COMMIT_MSG": Rule(apply_rule_adv_keywords_in_msg, 5),
"TOKENS_IN_MODIFIED_PATHS": Rule(apply_rule_adv_keywords_in_paths, 10),
"SEC_KEYWORD_IN_COMMIT_MSG": Rule(apply_rule_security_keyword_in_msg, 5),
"GH_ISSUE_IN_COMMIT_MSG": Rule(apply_rule_references_ghissue, 2),
Expand Down

0 comments on commit 15e42f8

Please sign in to comment.