In [21]:
import requests
from bs4 import BeautifulSoup
import re
import spacy

nlp = spacy.load("en_core_web_sm")


url = "https://nvd.nist.gov/vuln/search/results?form_type=Basic&results_type=overview&query=apache&search_type=all&isCpeNameSearch=false"

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")


cve_links = soup.find_all("a", {"data-testid": lambda value: value and value.startswith("vuln-detail-link")})
cve_ids = [link.text.strip() for link in cve_links[:20]]
cvss_cell = row.select_one("td:nth-child(3)")
if cvss_cell:
    cvss_a_tag = cvss_cell.find("a")
    if cvss_a_tag:
        cvss_value = cvss_a_tag.get_text(strip=True)
    else:
        cvss_em_tag = cvss_cell.find("em")
        cvss_value = cvss_em_tag.next_sibling.strip() if cvss_em_tag and cvss_em_tag.next_sibling else "(no CVSS score)"
else:
    cvss_value = "(no CVSS score)"


paragraphs = soup.select('#row > table > tbody > tr > td:nth-child(2) > p')[:20]
next_words = []
fixed_versions = []
affected_versions = []
next_products = []

version_pattern = r"(\d+\.\d+\.\d+[-\.]?[A-Za-z0-9]*)(?:\s*(?:through|to)\s*(\d+\.\d+\.\d+[-\.]?[A-Za-z0-9]*))?"

def extract_versions(text):
    """Extract version numbers, including ranges (e.g., '1.10.0 through 1.27.0')."""
    return re.findall(version_pattern, text)

def format_version_range(versions):
    """Format version ranges to use 'to' between versions and 'and' between ranges."""
    formatted_ranges = []

    for version in versions:

        version = version.replace('through', 'to')

        formatted_ranges.append(version)

    return ' and '.join(formatted_ranges)


results = []

for i, paragraph in enumerate(paragraphs):

    if i >= len(cve_ids):
        break


    affected_versions = []
    fixed_versions = []

    text_content = paragraph.get_text(strip=True)
    words = text_content.split()


    doc = nlp(text_content)


    for sent in doc.sents:

        if "affects" in sent.text.lower():
            affected_versions += extract_versions(sent.text)


        if "upgrade" in sent.text.lower():
            fixed_versions += extract_versions(sent.text)


    if 'Apache' in words:
        apache_index = words.index('Apache')
        if apache_index + 1 < len(words):
            next_word = words[apache_index + 1]
        else:
            next_word = None
    else:
        next_word = None

    next_words.append(next_word)
    if next_word:
        next_products.append(next_word)
    else:
        next_products.append(None)
    formatted_affected_versions = format_version_range([f"{match[0]} to {match[1]}" if match[1] else match[0] for match in affected_versions])
    formatted_fixed_versions = format_version_range([f"{match[0]} to {match[1]}" if match[1] else match[0] for match in fixed_versions])
    results.append({
        "CVE ID": cve_ids[i],
        "Product": next_word,
        "Affected Version": formatted_affected_versions if affected_versions else "(no affected versions)",
        "Corrected Version": formatted_fixed_versions if fixed_versions else "(no fixed versions)",
        "CVSS Score": cvss_value

    })
import pandas as pd
df = pd.DataFrame(results)

df



Unnamed: 0,CVE ID,Product,Affected Version,Corrected Version,CVSS Score
0,CVE-2024-51569,NimBLE.,1.7.0.,1.8.0,(not available)
1,CVE-2024-47250,NimBLE.,1.7.0.,1.8.0,(not available)
2,CVE-2024-47249,NimBLE.,1.7.0.,1.8.0,(not available)
3,CVE-2024-47248,NimBLE.,1.7.0.,1.8.0,(not available)
4,CVE-2024-45719,Answer.,1.4.0.,1.4.1,(not available)
5,CVE-2024-52067,NiFi,(no affected versions),(no fixed versions),(not available)
6,CVE-2024-31141,Kafka,2.3.0 to 3.5.2 and 3.6.2 and 3.7.0. and 3.8.0,2.3.0 to 3.5.2 and 3.6.2 and 3.7.0. and 3.8.0,(not available)
7,CVE-2024-52318,Tomcat.,11.0.0 and 10.1.31 and 9.0.96.,11.0.1 and 10.1.32 and 9.0.97,(not available)
8,CVE-2024-52317,Tomcat.,11.0.0-M23 to 11.0.0-M26 and 10.1.27 to 10.1.3...,11.0.0 and 10.1.31 and 9.0.96,(not available)
9,CVE-2024-52316,Tomcat.,11.0.0-M1 to 11.0.0-M26 and 10.1.0-M1 to 10.1....,11.0.0 and 10.1.31 and 9.0.96,(not available)


In [20]:
import requests
from bs4 import BeautifulSoup
import spacy
import pandas as pd

nlp = spacy.load("en_core_web_sm")


url = "https://nvd.nist.gov/vuln/search/results?form_type=Basic&results_type=overview&query=apache&search_type=all&isCpeNameSearch=false"

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")


cve_links = soup.find_all("a", {"data-testid": lambda value: value and value.startswith("vuln-detail-link")})
cve_ids = [link.text.strip() for link in cve_links[:20]]


cvss_scores = []
for row in soup.select('tr'):
    cvss_cell = row.select_one("td:nth-child(3)")
    if cvss_cell:
        cvss_value = cvss_cell.get_text(strip=True)
        if not cvss_value or "not available" in cvss_value.lower():
            cvss_scores.append("(no CVSS score)")
        else:
            cvss_scores.append(cvss_value)
    else:
        cvss_scores.append("(no CVSS score)")
paragraphs = soup.select('#row > table > tbody > tr > td:nth-child(2) > p')[:20]
def extract_versions(text):
    """Use SpaCy to extract version numbers from the text."""
    doc = nlp(text)
    versions = []

    for ent in doc.ents:
        if ent.label_ == "CARDINAL" and any(char.isdigit() for char in ent.text):
            versions.append(ent.text)
    return versions


def split_versions_based_on_keywords(text):
    """Split versions into affected and fixed based on keywords."""
    affected_versions = []
    fixed_versions = []
    doc = nlp(text)

    for sent in doc.sents:

        if "affects" in sent.text.lower():
            affected_versions += extract_versions(sent.text)

        if "upgrade" in sent.text.lower() or "upgrading" in sent.text.lower():
            fixed_versions += extract_versions(sent.text)

    return affected_versions, fixed_versions


results = []

for i, paragraph in enumerate(paragraphs):

    if i >= len(cve_ids) or i >= len(cvss_scores):
        break
    text_content = paragraph.get_text(strip=True)

    affected_versions, fixed_versions = split_versions_based_on_keywords(text_content)


    if 'Apache' in text_content:
        words = text_content.split()
        apache_index = words.index('Apache')
        next_word = words[apache_index + 1] if apache_index + 1 < len(words) else None
    else:
        next_word = None
    if affected_versions == fixed_versions:
        affected_versions = [f"before {ver}" for ver in affected_versions]

    results.append({
        "CVE ID": cve_ids[i],
        "Product": next_word if next_word else "(no product found)",
        "Affected Version": ", ".join(affected_versions) if affected_versions else "(no affected versions)",
        "Fixed Version": ", ".join(fixed_versions) if fixed_versions else "(no fixed versions)",
        "CVSS Score": cvss_scores[i] if cvss_scores[i] else "(no CVSS score)"
    })

df = pd.DataFrame(results)

print(df.to_string(index=False))


        CVE ID    Product                         Affected Version           Fixed Version      CVSS Score
CVE-2024-51569    NimBLE.                                    1.7.0                   1.8.0 (no CVSS score)
CVE-2024-47250    NimBLE.                                    1.7.0                   1.8.0 (no CVSS score)
CVE-2024-47249    NimBLE.                                    1.7.0                   1.8.0 (no CVSS score)
CVE-2024-47248    NimBLE.                                    1.7.0                   1.8.0 (no CVSS score)
CVE-2024-45719    Answer.                                    1.4.0                   1.4.1 (no CVSS score)
CVE-2024-52067       NiFi                   (no affected versions)           2.0.0, 1.28.1 (no CVSS score)
CVE-2024-31141      Kafka before 2.3.0, before 3.7.0, before 3.8.0     2.3.0, 3.7.0, 3.8.0 (no CVSS score)
CVE-2024-52318    Tomcat.                                   11.0.0 11.0.1, 10.1.32, 9.0.97 (no CVSS score)
CVE-2024-52317    Tomcat.     11.0.0-