Keyword extraction test using spaCy; related to #331, fixes #329
sacca97 authored and copernico committed Dec 12, 2022
1 parent d2d2b79 commit 3e8efca
Showing 7 changed files with 225 additions and 179 deletions.
38 changes: 22 additions & 16 deletions prospector/client/cli/main.py
@@ -42,13 +42,19 @@ def main(argv): # noqa: C901
logger.setLevel(config.log_level)
logger.info(f"Global log level set to {get_level(string=True)}")

if config.cve_id is None:
logger.error("No vulnerability id was specified. Cannot proceed.")
console.print(
"No vulnerability id was specified. Cannot proceed.",
status=MessageStatus.ERROR,
)
return
nvd_rest_endpoint = configuration.get("nvd_rest_endpoint", "") # default ???

backend = args.backend or configuration.get("backend", DEFAULT_BACKEND) # ???

use_backend = args.use_backend

if args.ping:
return ping_backend(backend, log.config.level < logging.INFO)

vulnerability_id = args.vulnerability_id
repository_url = args.repository
vuln_descr = args.descr
filter_extensions = args.filter_extensions.split(",")

# if args.get("ping"):
# return ping_backend(backend, get_level() < logging.INFO)
@@ -70,15 +76,15 @@ def main(argv): # noqa: C901
logger.debug(f"time-limit after: {time_limit_after}")

results, advisory_record = prospector(
vulnerability_id=config.cve_id,
repository_url=config.repository,
publication_date=config.pub_date,
vuln_descr=config.description,
tag_interval=config.tag_interval,
filter_extensions=config.filter_extensions.split(","),
version_interval=config.version_interval,
modified_files=set(config.modified_files.split(",")),
advisory_keywords=set(config.keywords.split(",")),
vulnerability_id=vulnerability_id,
repository_url=repository_url,
publication_date=publication_date,
vuln_descr=vuln_descr,
tag_interval=tag_interval,
filter_extensions=filter_extensions,
version_interval=version_interval,
modified_files=set(modified_files),
advisory_keywords=set(advisory_keywords),
time_limit_before=time_limit_before,
time_limit_after=time_limit_after,
use_nvd=config.use_nvd,
45 changes: 32 additions & 13 deletions prospector/client/cli/prospector_client.py
@@ -72,19 +72,38 @@ def prospector( # noqa: C901
filter_extensions,
)

# obtain a repository object
repository = Git(repository_url, git_cache)

# retrieve of commit candidates
candidates = get_candidates(
advisory_record,
repository,
tag_interval,
version_interval,
time_limit_before,
time_limit_after,
limit_candidates,
)
with ConsoleWriter("Obtaining initial set of candidates") as writer:

# obtain a repository object
repository = Git(repository_url, git_cache)

# retrieve of commit candidates
candidates = get_candidates(
advisory_record,
repository,
tag_interval,
version_interval,
time_limit_before,
time_limit_after,
filter_extensions[0],
)
_logger.debug(f"Collected {len(candidates)} candidates")

if len(candidates) > limit_candidates:
_logger.error(
"Number of candidates exceeds %d, aborting." % limit_candidates
)
_logger.error(
"Possible cause: the backend might be unreachable or otherwise unable to provide details about the advisory."
)
writer.print(
f"Found {len(candidates)} candidates, too many to proceed.",
status=MessageStatus.ERROR,
)
writer.print("Please try running the tool again.")
sys.exit(-1)

writer.print(f"Found {len(candidates)} candidates")

with ExecutionTimer(
core_statistics.sub_collection("commit preprocessing")
118 changes: 63 additions & 55 deletions prospector/datamodel/advisory.py
@@ -4,12 +4,19 @@
from urllib.parse import urlparse

import requests
from dateutil.parser import isoparse
from pydantic import BaseModel, Field
import spacy

from log.logger import get_level, logger, pretty_log
from util.http import fetch_url

from .nlp import extract_affected_filenames, extract_products, extract_words_from_text
from .nlp import (
extract_affected_filenames,
extract_nouns_from_text,
extract_products,
extract_special_terms,
extract_versions,
)

ALLOWED_SITES = [
"github.com",
Expand Down Expand Up @@ -38,58 +45,58 @@


LOCAL_NVD_REST_ENDPOINT = "http://localhost:8000/nvd/vulnerabilities/"
NVD_REST_ENDPOINT = "https://services.nvd.nist.gov/rest/json/cves/2.0"
NVD_API_KEY = os.getenv("NVD_API_KEY", "")


class AdvisoryRecord:
"""The advisory record captures all relevant information on the vulnerability advisory"""

def __init__(
self,
cve_id: str,
description: str = "",
published_timestamp: int = 0,
last_modified_timestamp: int = 0,
references: List[str] = None,
references_content: List[str] = None,
affected_products: List[str] = None,
versions: List[Tuple[str, str]] = None,
files: Set[str] = None,
keywords: Set[str] = None,
):
self.cve_id = cve_id
self.description = description
self.published_timestamp = published_timestamp
self.last_modified_timestamp = last_modified_timestamp
self.references = references or list()
self.references_content = references_content or list()
self.affected_products = affected_products or list()
self.versions = versions or list()
self.files = files or set()
self.keywords = keywords or set()
NVD_REST_ENDPOINT = "https://services.nvd.nist.gov/rest/json/cves/2.0?cveId="


# TODO: refactor and clean
class AdvisoryRecord(BaseModel):
"""
The advisory record captures all relevant information on the vulnerability advisory
"""

vulnerability_id: str
repository_url: str = ""
published_timestamp: int = 0
last_modified_timestamp: int = 0
references: List[str] = Field(default_factory=list)
references_content: List[str] = Field(default_factory=list)
affected_products: List[str] = Field(default_factory=list)
description: Optional[str] = ""
preprocessed_vulnerability_description: str = ""
relevant_tags: List[str] = None
versions: List[str] = Field(default_factory=list)
from_nvd: bool = False
nvd_rest_endpoint: str = LOCAL_NVD_REST_ENDPOINT
paths: Set[str] = Field(default_factory=set)
keywords: Set[str] = Field(default_factory=set)

# def __init__(self, vulnerability_id, repository_url, from_nvd, nvd_rest_endpoint):
# self.vulnerability_id = vulnerability_id
# self.repository_url = repository_url
# self.from_nvd = from_nvd
# self.nvd_rest_endpoint = nvd_rest_endpoint

def analyze(
self,
fetch_references: bool = False,
self, use_nvd: bool = False, fetch_references: bool = False, relevant_extensions: List[str] = []
):
self.versions = [
version for version in self.versions if version[0] != version[1]
]
# self.versions.extend(extract_versions(self.description))
# self.versions = list(set(self.versions))

self.affected_products.extend(extract_products(self.description))
self.affected_products = list(set(self.affected_products))

# TODO: this could be done on the words extracted from the description
self.files.update(extract_affected_filenames(self.description))

self.keywords.update(extract_words_from_text(self.description))
self.versions = union_of(self.versions, extract_versions(self.description))
self.affected_products = union_of(
self.affected_products, extract_products(self.description)
)
# TODO: use a set where possible to speed up the rule application time
self.paths.update(
extract_affected_filenames(self.description, relevant_extensions) # TODO: this could be done on the words extracted from the description
)

logger.debug("References: " + str(self.references))
# TODO: misses something because of subdomains not considered e.g. lists.apache.org
self.keywords.update(extract_nouns_from_text(self.description))

_logger.debug("References: " + str(self.references))
self.references = [
r
for r in self.references
@@ -156,15 +163,16 @@ def get_from_local(vuln_id: str, nvd_rest_endpoint: str = LOCAL_NVD_REST_ENDPOIN


def build_advisory_record(
cve_id: str,
description: str = None,
nvd_rest_endpoint: str = None,
fetch_references: bool = False,
use_nvd: bool = True,
publication_date: str = None,
advisory_keywords: Set[str] = None,
modified_files: Set[str] = None,
filter_extensions: List[str] = None,
vulnerability_id: str,
repository_url: str,
vuln_descr: str,
nvd_rest_endpoint: str,
fetch_references: bool,
use_nvd: bool,
publication_date: str,
advisory_keywords: Set[str],
modified_files: Set[str],
filter_extensions: List[str],
) -> AdvisoryRecord:

advisory_record = AdvisoryRecord(
@@ -187,11 +195,11 @@ def build_advisory_record(
isoparse(publication_date).timestamp()
)

if advisory_keywords and len(advisory_keywords) > 0:
if len(advisory_keywords) > 0:
advisory_record.keywords.update(advisory_keywords)

if modified_files and len(modified_files) > 0:
advisory_record.files.update(modified_files)
if len(modified_files) > 0:
advisory_record.paths.update(modified_files)

logger.debug(f"{advisory_record.keywords=}")
logger.debug(f"{advisory_record.files=}")
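A side note on the pydantic conversion above: list- and set-valued fields are declared with Field(default_factory=...) so every AdvisoryRecord instance gets its own fresh container rather than a shared default. A minimal sketch of that pattern, using a hypothetical trimmed-down model (not the full AdvisoryRecord) and illustrative CVE ids:

from typing import List, Set

from pydantic import BaseModel, Field


class Record(BaseModel):
    vulnerability_id: str
    references: List[str] = Field(default_factory=list)  # fresh list per instance
    keywords: Set[str] = Field(default_factory=set)  # fresh set per instance


first = Record(vulnerability_id="CVE-0000-0001")  # illustrative id, not a real advisory
first.keywords.update({"authentication", "bypass"})

second = Record(vulnerability_id="CVE-0000-0002")
print(first.keywords)   # {'authentication', 'bypass'}
print(second.keywords)  # set() -- defaults are not shared between instances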
39 changes: 16 additions & 23 deletions prospector/datamodel/nlp.py
@@ -1,19 +1,17 @@
import os
import re
from typing import Dict, List, Set

import requests

# from util.http import extract_from_webpage, fetch_url, get_from_xml
from spacy import load

from typing import Dict, List, Set, Tuple
from util.http import extract_from_webpage, fetch_url
from spacy import Language, load
from datamodel.constants import RELEVANT_EXTENSIONS
from util.http import get_from_xml

JIRA_ISSUE_URL = "https://issues.apache.org/jira/browse/"
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")


nlp = load("en_core_web_sm")

nlp = load("en_core_web_sm")


@@ -37,25 +35,20 @@ def extract_special_terms(description: str) -> Set[str]:
return tuple(result)


def extract_words_from_text(text: str) -> Set[str]:
"""Use spacy to extract "relevant words" from text"""
# Lemmatization
return set(
[
token.lemma_.casefold()
for token in nlp(text)
if token.pos_ in ("NOUN", "VERB", "PROPN") and len(token.lemma_) > 3
]
)
def extract_nouns_from_text(text: str) -> List[str]:
"""Use spacy to extract nouns from text"""
return [
token.text
for token in nlp(text)
if token.pos_ == "NOUN" and len(token.text) > 3
]


def find_similar_words(adv_words: Set[str], commit_msg: str, exclude: str) -> Set[str]:
def extract_similar_words(
adv_words: Set[str], commit_msg: str, blocklist: Set[str]
) -> List[str]:
"""Extract nouns from commit message that appears in the advisory text"""
commit_words = {
word for word in extract_words_from_text(commit_msg) if word not in exclude
}
return commit_words.intersection(adv_words)
# return [word for word in extract_words_from_text(commit_msg) if word in adv_words]
return [word for word in extract_nouns_from_text(commit_msg) if word in adv_words]


def extract_versions(text: str) -> List[str]:
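For reference, a minimal sketch of how the spaCy-based helpers above behave, assuming the en_core_web_sm model is installed (python -m spacy download en_core_web_sm). The sample sentences are illustrative and the local helper nouns() is a stand-in for extract_nouns_from_text; the exact tokens returned depend on the model's tagging:

from spacy import load

nlp = load("en_core_web_sm")


def nouns(text: str) -> list:
    # Same filter as extract_nouns_from_text: noun tokens longer than 3 characters
    return [t.text for t in nlp(text) if t.pos_ == "NOUN" and len(t.text) > 3]


advisory = "A crafted request lets remote attackers bypass authentication in the login filter."
commit_msg = "Fix authentication handling in the request filter"

adv_words = set(nouns(advisory))
# Mirrors extract_similar_words: nouns of the commit message that also occur in the advisory
shared = [w for w in nouns(commit_msg) if w in adv_words]
print(shared)  # e.g. ['authentication', 'request', 'filter'] -- exact output depends on the model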
22 changes: 13 additions & 9 deletions prospector/datamodel/nlp_test.py
@@ -1,18 +1,22 @@
import pytest

from .nlp import (
extract_affected_filenames,
extract_ghissue_references,
extract_jira_references,
find_similar_words,
extract_similar_words,
extract_special_terms,
)


def test_extract_similar_words():
commit_msg = "Is this an advisory message?"
adv_text = "This is an advisory description message"
similarities = find_similar_words(
set(adv_text.casefold().split()), commit_msg, "simola"
)
assert similarities.pop() == "message"
commit_msg = "This is a commit message"
adv_text = "This is an advisory text"
similarities = extract_similar_words(adv_text, commit_msg, set())
assert similarities.sort() == ["This"].sort()


@pytest.mark.skip(reason="Outdated")
def test_adv_record_path_extraction_no_real_paths():
result = extract_affected_filenames(ADVISORY_TEXT_1)

assert result == []


ADVISORY_TEXT_1 = """CXF supports (via JwtRequestCodeFilter) passing OAuth 2 parameters via a JWT token as opposed to query parameters (see: The OAuth 2.0 Authorization Framework: JWT Secured Authorization Request (JAR)). Instead of sending a JWT token as a "request" parameter, the spec also supports specifying a URI from which to retrieve a JWT token from via the "request_uri" parameter. CXF was not validating the "request_uri" parameter (apart from ensuring it uses "https) and was making a REST request to the parameter in the request to retrieve a token. This means that CXF was vulnerable to DDos attacks on the authorization server, as specified in section 10.4.1 of the spec. This issue affects Apache CXF versions prior to 3.4.3; Apache CXF versions prior to 3.3.10."""
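Note that in the rewritten test above, list.sort() returns None, so the final assertion compares None with None and always passes. A hedged sketch of a stricter check, assuming extract_similar_words takes a set of advisory words as its first argument (as its signature suggests) and that en_core_web_sm is available; the test name and sample strings are illustrative:

def test_extract_similar_words_shared_nouns():
    commit_msg = "Fix authentication bypass in the login filter"
    adv_words = {"authentication", "bypass", "filter"}
    similarities = extract_similar_words(adv_words, commit_msg, set())
    # Every reported word must come from the advisory vocabulary
    assert set(similarities) <= adv_words
    # The commit message and advisory share at least one noun
    # (spaCy typically tags "authentication" and "filter" as nouns)
    assert len(similarities) > 0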
2 changes: 2 additions & 0 deletions prospector/rules/helpers.py
@@ -1,6 +1,8 @@
from typing import Dict, Set

import pandas
from spacy import load
import spacy

from datamodel.advisory import AdvisoryRecord
from datamodel.commit import Commit