keywords extraction test using spacy related to #331, fixes #329
sacca97 committed Nov 7, 2022
1 parent 1868a96 commit c913a1c
Showing 6 changed files with 141 additions and 125 deletions.
45 changes: 32 additions & 13 deletions prospector/client/cli/prospector_client.py
@@ -75,19 +75,38 @@ def prospector( # noqa: C901
filter_extensions,
)

# obtain a repository object
repository = Git(repository_url, git_cache)

# retrieve the commit candidates
candidates = get_candidates(
advisory_record,
repository,
tag_interval,
version_interval,
time_limit_before,
time_limit_after,
limit_candidates,
)
with ConsoleWriter("Obtaining initial set of candidates") as writer:

# obtain a repository object
repository = Git(repository_url, git_cache)

# retrieve the commit candidates
candidates = get_candidates(
advisory_record,
repository,
tag_interval,
version_interval,
time_limit_before,
time_limit_after,
filter_extensions[0],
)
_logger.debug(f"Collected {len(candidates)} candidates")

if len(candidates) > limit_candidates:
_logger.error(
"Number of candidates exceeds %d, aborting." % limit_candidates
)
_logger.error(
"Possible cause: the backend might be unreachable or otherwise unable to provide details about the advisory."
)
writer.print(
f"Found {len(candidates)} candidates, too many to proceed.",
status=MessageStatus.ERROR,
)
writer.print("Please try running the tool again.")
sys.exit(-1)

writer.print(f"Found {len(candidates)} candidates")

with ExecutionTimer(
core_statistics.sub_collection("commit preprocessing")
32 changes: 11 additions & 21 deletions prospector/datamodel/advisory.py
Expand Up @@ -8,13 +8,14 @@

import requests
from pydantic import BaseModel, Field
import spacy

from log.logger import logger, pretty_log, get_level
from util.http import fetch_url

from .nlp import (
extract_affected_filenames,
extract_words_from_text,
extract_nouns_from_text,
extract_products,
extract_versions,
)
@@ -81,33 +82,22 @@ class AdvisoryRecord(BaseModel):
# self.nvd_rest_endpoint = nvd_rest_endpoint

def analyze(
self,
use_nvd: bool = False,
fetch_references: bool = False,
relevant_extensions: List[str] = [],
self, use_nvd: bool = False, fetch_references: bool = False, relevant_extensions: List[str] = []
):
self.from_nvd = use_nvd
if self.from_nvd:
self.get_advisory(self.vulnerability_id, self.nvd_rest_endpoint)

# Union of the lists; this also removes duplicates...
self.versions.extend(extract_versions(self.description))
self.versions = list(set(self.versions))
# = union_of(self.versions, extract_versions(self.description))
self.affected_products.extend(extract_products(self.description))
self.affected_products = list(set(self.affected_products))

# = union_of(
# self.affected_products, extract_products(self.description)
# )
self.versions = union_of(self.versions, extract_versions(self.description))
self.affected_products = union_of(
self.affected_products, extract_products(self.description)
)
# TODO: use a set where possible to speed up the rule application time
self.files.update(
extract_affected_filenames(self.description)
# TODO: this could be done on the words extracted from the description
self.paths.update(
extract_affected_filenames(self.description, relevant_extensions) # TODO: this could be done on the words extracted from the description
)
# print(self.files)

self.keywords.update(extract_words_from_text(self.description))
self.keywords.update(extract_nouns_from_text(self.description))

logger.debug("References: " + str(self.references))
self.references = [
@@ -237,7 +227,7 @@ def build_advisory_record(
advisory_record.keywords.update(advisory_keywords)

if len(modified_files) > 0:
advisory_record.files.update(modified_files)
advisory_record.paths.update(modified_files)

logger.debug(f"{advisory_record.keywords=}")
logger.debug(f"{advisory_record.files=}")
35 changes: 14 additions & 21 deletions prospector/datamodel/nlp.py
@@ -1,10 +1,8 @@
import os
import re
from typing import Dict, List, Set
import requests

# from util.http import extract_from_webpage, fetch_url, get_from_xml
from spacy import load
from typing import Dict, List, Set, Tuple
from util.http import extract_from_webpage, fetch_url
from spacy import Language, load
from datamodel.constants import RELEVANT_EXTENSIONS
from util.http import extract_from_webpage, get_from_xml

@@ -35,25 +33,20 @@ def extract_special_terms(description: str) -> Set[str]:
return tuple(result)


def extract_words_from_text(text: str) -> Set[str]:
"""Use spacy to extract "relevant words" from text"""
# Lemmatization
return set(
[
token.lemma_.casefold()
for token in nlp(text)
if token.pos_ in ("NOUN", "VERB", "PROPN") and len(token.lemma_) > 3
]
)
def extract_nouns_from_text(text: str) -> List[str]:
"""Use spacy to extract nouns from text"""
return [
token.text
for token in nlp(text)
if token.pos_ == "NOUN" and len(token.text) > 3
]


def find_similar_words(adv_words: Set[str], commit_msg: str, exclude: str) -> Set[str]:
def extract_similar_words(
adv_words: Set[str], commit_msg: str, blocklist: Set[str]
) -> List[str]:
"""Extract nouns from commit message that appears in the advisory text"""
commit_words = {
word for word in extract_words_from_text(commit_msg) if word not in exclude
}
return commit_words.intersection(adv_words)
# return [word for word in extract_words_from_text(commit_msg) if word in adv_words]
return [word for word in extract_nouns_from_text(commit_msg) if word in adv_words]
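For reference, a usage sketch of the two helpers above, assuming the module-level nlp = load("en_core_web_sm") that this hunk does not show; the exact tokens returned depend on the model:

adv_nouns = set(extract_nouns_from_text("A crafted request token bypasses validation"))
# plausibly {"request", "token", "validation"}
overlap = extract_similar_words(adv_nouns, "Validate the request token before use", set())
# plausibly ["request", "token"]; note the blocklist argument is accepted but unused in the code shown above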


def extract_versions(text: str) -> List[str]:
20 changes: 13 additions & 7 deletions prospector/datamodel/nlp_test.py
@@ -5,17 +5,23 @@
extract_ghissue_references,
extract_jira_references,
extract_affected_filenames,
find_similar_words,
extract_similar_words,
extract_special_terms,
)


def test_extract_similar_words():
commit_msg = "Is this an advisory message?"
adv_text = "This is an advisory description message"
similarities = find_similar_words(
set(adv_text.casefold().split()), commit_msg, "simola"
)
assert similarities.pop() == "message"
commit_msg = "This is a commit message"
adv_text = "This is an advisory text"
similarities = extract_similar_words(adv_text, commit_msg, set())
assert sorted(similarities) == sorted(["This"])  # list.sort() returns None, so comparing .sort() results would always pass


@pytest.mark.skip(reason="Outdated")
def test_adv_record_path_extraction_no_real_paths():
result = extract_affected_filenames(ADVISORY_TEXT_1)

assert result == []


ADVISORY_TEXT_1 = """CXF supports (via JwtRequestCodeFilter) passing OAuth 2 parameters via a JWT token as opposed to query parameters (see: The OAuth 2.0 Authorization Framework: JWT Secured Authorization Request (JAR)). Instead of sending a JWT token as a "request" parameter, the spec also supports specifying a URI from which to retrieve a JWT token from via the "request_uri" parameter. CXF was not validating the "request_uri" parameter (apart from ensuring it uses "https) and was making a REST request to the parameter in the request to retrieve a token. This means that CXF was vulnerable to DDos attacks on the authorization server, as specified in section 10.4.1 of the spec. This issue affects Apache CXF versions prior to 3.4.3; Apache CXF versions prior to 3.3.10."""
2 changes: 2 additions & 0 deletions prospector/rules/helpers.py
@@ -1,6 +1,8 @@
from typing import Dict, Set

import pandas
from spacy import load
import spacy

from datamodel.advisory import AdvisoryRecord
from datamodel.commit import Commit
132 changes: 69 additions & 63 deletions prospector/rules/rules.py
@@ -4,7 +4,7 @@

from datamodel.advisory import AdvisoryRecord
from datamodel.commit import Commit
from datamodel.nlp import find_similar_words
from datamodel.nlp import extract_similar_words
from rules.helpers import (
extract_commit_mentioned_in_linked_pages,
extract_security_keywords,
@@ -142,9 +142,16 @@ def apply(self, candidate: Commit, advisory_record: AdvisoryRecord):
return True
return False

def apply_rule_adv_keywords_in_msg(
candidate: Commit, advisory_record: AdvisoryRecord
) -> str:
"""Matches commits whose message contain any of the special "code tokens" extracted from the advisory."""
explanation_template = "The commit message includes the following keywords: {}"

class AdvKeywordsInMsg(Rule):
"""Matches commits whose message contain any of the keywords extracted from the advisory."""
matching_keywords = set(extract_similar_words(advisory_record.keywords, candidate.message, set()))
# matching_keywords = set(
# [kw for kw in advisory_record.keywords if kw in candidate.message]
# )

def apply(self, candidate: Commit, advisory_record: AdvisoryRecord):
matching_keywords = find_similar_words(
@@ -158,8 +165,30 @@ def apply(self, candidate: Commit, advisory_record: AdvisoryRecord):


# TODO: with proper filename and msg search this could be deprecated ?
class AdvKeywordsInDiffs(Rule):
"""Matches commits whose diffs contain any of the keywords extracted from the advisory."""
def apply_rule_adv_keywords_in_diff(
candidate: Commit, advisory_record: AdvisoryRecord
) -> str:
"""Matches commits whose diff contain any of the special "code tokens" extracted from the advisory."""
return None
# FIXME: this is hardcoded; read it from a "config" object passed to the rule function
skip_tokens = ["IO"]

explanation_template = "The commit diff includes the following keywords: {}"

matching_keywords = set(
[
kw
for kw in advisory_record.keywords
for diff_line in candidate.diff
if kw in diff_line and kw not in skip_tokens
]
)

if len(matching_keywords):
return explanation_template.format(", ".join(matching_keywords))

return None


def apply(self, candidate: Commit, advisory_record: AdvisoryRecord):
return False
@@ -290,61 +319,38 @@ def apply(self, candidate: Commit, advisory_record: AdvisoryRecord):

class SmallCommit(Rule):
"""Matches small commits (i.e., they modify a small number of contiguous lines of code)."""

def apply(self, candidate: Commit, _: AdvisoryRecord):
return False
if candidate.get_hunks() < 10: # 10
self.message = (
f"This commit modifies only {candidate.hunks} contiguous lines of code"
)
return True
return False


# TODO: implement properly
class CommitMentionedInReference(Rule):
"""Matches commits that are mentioned in any of the links contained in the advisory page."""

def apply(self, candidate: Commit, advisory_record: AdvisoryRecord):
if extract_commit_mentioned_in_linked_pages(candidate, advisory_record):
self.message = "A page linked in the advisory mentions this commit"

return True
return False


class CommitHasTwins(Rule):
def apply(self, candidate: Commit, _: AdvisoryRecord) -> bool:
if not Rule.lsh_index.is_empty():
# TODO: the twin search must be done at the beginning, in the raw commits

candidate.twins = Rule.lsh_index.query(decode_minhash(candidate.minhash))
candidate.twins.remove(candidate.commit_id)
# self.lsh_index.insert(candidate.commit_id, decode_minhash(candidate.minhash))
if len(candidate.twins) > 0:
self.message = (
f"This commit has one or more twins: {', '.join(candidate.twins)}"
)
return True
return False
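Rule.lsh_index and decode_minhash are defined outside this diff. A minimal sketch of the underlying twin-detection technique using the datasketch library (an assumption; the project may wrap it differently):

from datasketch import MinHash, MinHashLSH

def make_minhash(msg: str) -> MinHash:
    m = MinHash(num_perm=128)
    for token in msg.split():
        m.update(token.encode("utf8"))
    return m

lsh = MinHashLSH(threshold=0.8, num_perm=128)
lsh.insert("commit_a", make_minhash("fix buffer overflow in parser"))
# a near-identical message lands in the same buckets, so the stored key likely comes back
print(lsh.query(make_minhash("fix buffer overflow in the parser")))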


RULES = [
CveIdInMessage("CVE_ID_IN_MESSAGE", 20),
CommitMentionedInAdv("COMMIT_IN_ADVISORY", 20),
CrossReferencedJiraLink("CROSS_REFERENCED_JIRA_LINK", 20),
CrossReferencedGhLink("CROSS_REFERENCED_GH_LINK", 20),
CommitMentionedInReference("COMMIT_IN_REFERENCE", 9),
CveIdInLinkedIssue("CVE_ID_IN_LINKED_ISSUE", 9),
ChangesRelevantFiles("CHANGES_RELEVANT_FILES", 9),
AdvKeywordsInDiffs("ADV_KEYWORDS_IN_DIFFS", 5),
AdvKeywordsInFiles("ADV_KEYWORDS_IN_FILES", 5),
AdvKeywordsInMsg("ADV_KEYWORDS_IN_MSG", 5),
SecurityKeywordsInMsg("SEC_KEYWORDS_IN_MESSAGE", 5),
SecurityKeywordInLinkedGhIssue("SEC_KEYWORDS_IN_LINKED_GH", 5),
SecurityKeywordInLinkedJiraIssue("SEC_KEYWORDS_IN_LINKED_JIRA", 5),
ReferencesGhIssue("GITHUB_ISSUE_IN_MESSAGE", 2),
ReferencesJiraIssue("JIRA_ISSUE_IN_MESSAGE", 2),
SmallCommit("SMALL_COMMIT", 0),
CommitHasTwins("COMMIT_HAS_TWINS", 5),
]
return None
# unreachable code
MAX_HUNKS = 10
explanation_template = (
"This commit modifies only {} hunks (groups of contiguous lines of code)"
)

if candidate.hunk_count <= MAX_HUNKS:
return explanation_template.format(candidate.hunk_count)

return None


RULES = {
"CVE_ID_IN_COMMIT_MSG": Rule(apply_rule_cve_id_in_msg, 10),
"TOKENS_IN_DIFF": Rule(apply_rule_adv_keywords_in_diff, 7),
"TOKENS_IN_COMMIT_MSG": Rule(apply_rule_adv_keywords_in_msg, 5),
"TOKENS_IN_MODIFIED_PATHS": Rule(apply_rule_adv_keywords_in_paths, 10),
"SEC_KEYWORD_IN_COMMIT_MSG": Rule(apply_rule_security_keyword_in_msg, 5),
"GH_ISSUE_IN_COMMIT_MSG": Rule(apply_rule_references_ghissue, 2),
"JIRA_ISSUE_IN_COMMIT_MSG": Rule(apply_rule_references_jira_issue, 2),
"CHANGES_RELEVANT_FILE": Rule(apply_rule_changes_relevant_file, 8),
"COMMIT_IN_ADV": Rule(apply_rule_commit_mentioned_in_adv, 10),
"COMMIT_IN_REFERENCE": Rule(apply_rule_commit_mentioned_in_reference, 9),
"VULN_IN_LINKED_ISSUE": Rule(apply_rule_vuln_mentioned_in_linked_issue, 9),
"SEC_KEYWORD_IN_LINKED_GH": Rule(apply_rule_security_keyword_in_linked_gh, 5),
"SEC_KEYWORD_IN_LINKED_JIRA": Rule(apply_rule_security_keyword_in_linked_jira, 5),
"JIRA_ISSUE_IN_COMMIT_MSG_AND_ADV": Rule(
apply_rule_jira_issue_in_commit_msg_and_adv, 9
),
"GH_ISSUE_IN_COMMIT_MSG_AND_ADV": Rule(
apply_rule_gh_issue_in_commit_msg_and_adv, 9
),
"SMALL_COMMIT": Rule(apply_rule_small_commit, 0),
}
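The Rule wrapper pairing each apply function with a relevance weight is not part of this diff. A plausible minimal shape, inferred from the constructor calls above:

from typing import Callable, NamedTuple, Optional

class Rule(NamedTuple):
    # (candidate, advisory_record) -> explanation string, or None when the rule does not fire
    apply: Callable[..., Optional[str]]
    # weight added to a candidate's relevance score when the rule fires
    relevance: int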
