## Imports and constants

In [1]:
import json
import os
import re
import subprocess
import time
from datetime import datetime
from urllib.parse import quote

import pymongo
import requests

In [None]:
# Load the runtime settings (API keys and MongoDB endpoints) from config.json
with open('config.json', 'r') as file:
    config = json.loads(file.read())

# Expose each required setting as a module-level constant; a missing key
# raises KeyError here, before any request is attempted
(
    api_key_nist,
    api_key_cvedetails,
    mongo_uri,
    mongo_vulns_key,
    mongo_vulns_url,
) = [
    config[setting]
    for setting in (
        'api_key_nist',
        'api_key_cvedetails',
        'mongo_uri',
        'mongo_vulns_key',
        'mongo_vulns_url',
    )
]

In [15]:
# Connect with the URI loaded from config.json (mongo_uri, set in the
# configuration cell). Credentials must not be hard-coded in the notebook;
# any password that was previously embedded here should be rotated.
client = pymongo.MongoClient(mongo_uri)

# Select the database
db = client["FinalProject"]

# Collection that stores the software documents
softwares_clt = db["Softwares"]

## Functions

In [3]:
def get_mac_address(os_name):
    """
    Return an adapter identifier for the current machine.

    Args:
        os_name: "Windows" or "Linux"; any other value is unsupported.

    Returns:
        str: on Linux, the first colon-separated MAC address found in the
            output of `ifconfig | grep ether`; on Windows, the text between
            the first "{" and the last "}" of the `getmac` output.
        None: when the OS is unsupported, the command fails or nothing matches.
    """
    try:
        if os_name == "Windows":
            output = subprocess.check_output("getmac", shell=True)
            # NOTE(review): this captures the brace-delimited part of the
            # getmac output, not a hex MAC pattern - confirm this is the
            # identifier callers expect.
            match = re.search(r"\{.*\}", str(output))
            if match is None:
                print("Error: no adapter identifier found in getmac output")
                return None
            return match.group(0)[1:-1]

        if os_name == "Linux":
            # The pipe requires a shell; the command string is constant.
            output = subprocess.check_output("ifconfig | grep ether", shell=True)
            match = re.search(r"\w\w:\w\w:\w\w:\w\w:\w\w:\w\w", str(output))
            if match is None:
                print("Error: no MAC address found in ifconfig output")
                return None
            return match.group(0)

        # Explicitly reject unknown systems instead of relying on an
        # unbound-variable error being swallowed below.
        print(f"Error: unsupported operating system: {os_name}")
        return None
    except (subprocess.SubprocessError, OSError) as e:
        print(f"Error: {e}")
        return None

In [4]:
def restructure_cvss_metrics(infoVulnJSON, metricVersion):
    """
    Build an organized dict with the CVSS metrics of a vulnerability.

    Only the first entry of infoVulnJSON is used, and the input is left
    unmodified (the detailed data is copied, not aliased).

    Args:
        infoVulnJSON: list of metric entries; entry 0 must contain 'cvssData',
            'exploitabilityScore', 'impactScore' and 'type'
        metricVersion: CVSS version the metrics were calculated with
            (31 or 30 for CVSS v3.x; any other value is handled like v2)

    Returns:
        dict with the keys baseScore, baseSeverity, exploitabilityScore,
        impactScore, type and cvssDetailedData (the remaining 'cvssData'
        fields, without the values already exposed at the top level)

    Raises:
        KeyError / IndexError: if an expected field or the first entry is missing
    """
    metric = infoVulnJSON[0]
    cvss_data = metric['cvssData']

    # For v3.x the baseSeverity lives inside cvssData; otherwise it is a
    # sibling of cvssData on the metric entry itself.
    is_v3 = metricVersion in (30, 31)
    severity_source = cvss_data if is_v3 else metric
    duplicated_keys = {'baseScore', 'baseSeverity'} if is_v3 else {'baseScore'}

    return {
        'baseScore': cvss_data['baseScore'],
        'baseSeverity': severity_source['baseSeverity'],
        # Remaining metrics are located in the same place for every version
        'exploitabilityScore': metric['exploitabilityScore'],
        'impactScore': metric['impactScore'],
        'type': metric['type'],
        # Copy without the repeated info so the caller's dict is not mutated
        'cvssDetailedData': {
            key: value
            for key, value in cvss_data.items()
            if key not in duplicated_keys
        },
    }

In [14]:
def _summarize_vulnerability(vulnerability):
    """Extract the fields stored per CVE from one NVD 'cve' record."""
    metrics = vulnerability.get('metrics', {})
    cvssMetricInfo = {}
    # Use the newest CVSS version available for this CVE
    for metricKey, metricVersion in (('cvssMetricV31', 31),
                                     ('cvssMetricV30', 30),
                                     ('cvssMetricV2', 2)):
        if metricKey in metrics:
            cvssMetricInfo = restructure_cvss_metrics(metrics[metricKey], metricVersion)
            break

    # Guard against a missing or empty descriptions list
    descriptions = vulnerability.get('descriptions') or []
    description = descriptions[0]['value'] if descriptions else 'No description available'

    return {
        "CVE_ID": vulnerability['id'],
        "description": description,
        "vulnStatus": vulnerability.get('vulnStatus', "No status provided"),
        "metrics": cvssMetricInfo,
        "publishedDate": vulnerability['published'],
        "lastModifiedDate": vulnerability['lastModified'],
        "lastUpdate": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }


def get_software_vulnerabilities(softwareName, startIndex = 0):
    """
    Obtain the info provided by services.nvd.nist.gov about the vulnerabilities
    of a software, then store the relevant info in a list of JSONs

    Args:
        softwareName: name of the software to search for vulnerabilities
        startIndex: index of the first result requested from the API. When the
            first response reports more than 2000 results, the function calls
            itself once with totalResults - 2000 to read the final results.

    Returns:
        list: relevant info about the last 10 vulnerabilities of the fetched
            page (or fewer if the page holds fewer), last element first.
            Empty when no vulnerability is found.
        None: when the request fails or the response is not JSON.
    """
    print('Software a pasar = '+ softwareName)
    # Percent-encode the name so characters such as '#', '+' or spaces cannot
    # corrupt the query string
    endPoint = (
        'https://services.nvd.nist.gov/rest/json/cves/2.0'
        f'?keywordSearch={quote(softwareName, safe="")}&startIndex={startIndex}'
    )

    headers = {
        'apiKey': api_key_nist
    }
    try:
        infoVuln = requests.get(endPoint, headers=headers, timeout=30)
    except requests.RequestException as e:
        print(f'Error requesting the vulnerabilities: {e}')
        return None

    # if the response is not a JSON, print the raw response and return
    try:
        infoVulnJSON = infoVuln.json()
    except ValueError:
        print('Error, the response was: ')
        print(infoVuln.text)
        return None

    # list for store the last 10 vulnerabilities found or less if the total is less than 10
    vulnerabilities = []
    # if there is no vulnerability found, return an empty list
    totalResults = infoVulnJSON.get('totalResults', 0)
    if totalResults == 0:
        return vulnerabilities
    # More results than this code expects in one page: fetch the final ones
    # and return that result instead of processing the first page
    if totalResults > 2000 and startIndex == 0:
        return get_software_vulnerabilities(softwareName, totalResults - 2000)

    print("total Results ", totalResults)

    # loop over the last 10 entries of the page, starting from the last element
    entries = infoVulnJSON.get('vulnerabilities', [])
    endLoop = max(len(entries) - 10, 0)
    print("endloop ", endLoop)
    for i in range(len(entries) - 1, endLoop - 1, -1):
        vulnerability = entries[i]['cve']
        print("requesting vulnerability: "+ str(i) +" " + vulnerability['id'] )
        print(json.dumps(vulnerability, indent=2))

        vulnerabilities.append(_summarize_vulnerability(vulnerability))
        # sleep for 0.5 seconds to avoid the request limit
        # NOTE(review): no request is issued inside this loop; the pause only
        # slows the overall pace between calls - kept as in the original
        time.sleep(0.5)

    return vulnerabilities

In [7]:
def insert_software(software):
    """
    Store a single software document in the Softwares collection.

    Any exception raised while inserting or reporting is caught and printed,
    so the function never raises.

    Args:
        software: dict to insert; its 'name' value is used in the success message
    """
    try:
        softwares_clt.insert_one(software)
        print("Software {} inserted in the database".format(software['name']))
    except Exception as err:
        print("Error: " + str(err))

In [8]:
# Maps the accented vowels handled by the cleaner to their plain equivalents
_ACCENT_TABLE = str.maketrans('\u00f3\u00e9\u00e1\u00ed\u00fa', 'oeaiu')


def _clean_software_name(name):
    """Return the normalized form of one non-empty software name."""
    # Remove content in parentheses (including the parentheses)
    name = re.sub(r'\([^)]*\)', '', name)
    # Remove content in '' (including the quotes)
    name = re.sub(r'\'.*\'', '', name)
    # Remove architecture tags; done before '-' is replaced so that
    # '32-bit' / '64-bit' still match
    name = re.sub(r'x64|x86|32-bit|64-bit|X86|X64', '', name)
    name = name.replace('-', ' ')
    # Collapse runs of whitespace into a single space
    name = re.sub(r'\s+', ' ', name)
    name = name.translate(_ACCENT_TABLE)
    name = re.sub(r'&{1,2}', 'and', name)
    return name.strip()


def fix_software_names(machines_data):
    """
    Fix the software names in machines_data. For the Name field of each
    software in a machine's softwareData list, the clean process is:
    - Drop the software from the list if its name is None or empty
    - Remove content in parentheses (including parentheses)
    - Remove content in '' (including '')
    - Remove the architecture information (x64, x86, X64, X86, 32-bit, 64-bit)
    - Replace - with space
    - Collapse repeated whitespace
    - Replace \u00f3, \u00e9, \u00e1, \u00ed, \u00fa with o, e, a, i, u
    - Replace && or & with and
    - Remove initial and final spaces

    The data is modified in place: each softwareData list keeps its identity
    and the same object passed in is returned.

    Args: machines_data: list of machines, each with a 'softwareData' list
    Returns: the same machines_data object with the fixed data
    """
    for machine in machines_data:
        kept = []
        for software in machine['softwareData']:
            name = software['Name']
            if name is None or name == '':
                continue
            software['Name'] = _clean_software_name(name)
            kept.append(software)
        # Rebuild the list instead of calling remove() while iterating, which
        # skipped the entry following every removed one
        machine['softwareData'][:] = kept
    return machines_data


In [9]:
def verify_software_in_db(softwareName, machineId):
    """
    Check whether a software document exists and already lists the machine.

    Args: softwareName: value matched against the "name" field of the document
          machineId: id searched in the document's associatedMachines field
    Returns: str, one of:
          "SOFTWARE NOT FOUND" - no document has that name
          "MACHINE FOUND"      - the document already contains machineId
          "SOFTWARE FOUND"     - the document exists without machineId
          "ERROR"              - an exception was raised during the lookup
    """
    try:
        document = softwares_clt.find_one({"name": softwareName})
        if document is not None:
            if machineId not in document['associatedMachines']:
                return "SOFTWARE FOUND"
            print("Already associated machine in the software")
            return "MACHINE FOUND"
        return "SOFTWARE NOT FOUND"
    except Exception as err:
        print(f'Error in verify_software_in_db: {err}')
        return "ERROR"

In [10]:
def add_machine_in_software(softwareName, machineId):
    """
    Append a machine id to the associatedMachines field of a software document.

    Any exception raised by the update is caught and printed.

    Args: softwareName: "name" value of the document to update
          machineId: id pushed into the associatedMachines field
    """
    selector = {"name": softwareName}
    change = {"$push": {"associatedMachines": machineId}}
    try:
        softwares_clt.update_one(selector, change)
        print("Machine associated in the software")
    except Exception as err:
        print(f'Error associating the machine Id into the software document: {err}')

In [11]:
def get_vulns_data():
    """
    Request the softwareByIP documents from the MongoDB endpoint configured in
    mongo_vulns_url. If the request fails, does not answer with status 200 or
    does not return JSON, the data is read from the temporary file
    data_temp.json instead.

    Returns: the decoded JSON response, or the decoded content of data_temp.json
    """
    # Serialize the body with json.dumps so it is always valid JSON (the
    # previous hand-written literal ended with a stray quote character)
    payload = json.dumps({
        "collection": "softwareByIP",
        "database": "vulnsData",
        "dataSource": "Cluster0",
        "filter": {}
    })
    headers = {
      'Content-Type': 'application/ejson',
      'apiKey': mongo_vulns_key
    }
    try:
        response = requests.post(mongo_vulns_url, headers=headers, data=payload, timeout=30)
        if response.status_code == 200:
            documents = response.json()
            print("Data received from MongoDB")
            return documents
    except (requests.RequestException, ValueError) as e:
        # Connection problems, timeouts and non-JSON bodies use the fallback
        print(f"Request to MongoDB failed: {e}")

    print("Error, data received from temp file")
    with open('data_temp.json', 'r') as file:
        return json.load(file)


## Execution

In [12]:
def main():
    """
    Main function to process the data and store the software and vulnerabilities in the database

    Steps: retrieve the machines data, normalize the software names and, for
    every software of every machine, either associate the machine with the
    existing software document or create a new document with its vulnerabilities.
    """
    try:
        # 1. Retrieve the data from the MongoDB
        data = get_vulns_data()
    except Exception as e:
        print(f"Failed to retrieve or process data: {e}")
        # Nothing to process: stop here instead of failing on an unbound name
        return

    # 2. Fix the document format
    if 'documents' in data:
        data = data['documents']
    # 3. Fix the software names
    data = fix_software_names(data)
    for machine in data:
        for software in machine['softwareData']:
            # 4. Verify if the software is already in Softwares collection in MongoDB
            # and if the machineID is already associated
            verifySW = verify_software_in_db(software['Name'], machine['id'])
            if verifySW == "SOFTWARE FOUND":
                add_machine_in_software(software['Name'], machine['id'])
            elif verifySW == "SOFTWARE NOT FOUND":
                # 5. Get the vulnerabilities for each software
                # NOTE(review): this can be None when the lookup fails; the
                # document is still inserted with that value - confirm intended
                vulnerabilities = get_software_vulnerabilities(software['Name'])
                # 6. Creating the final software JSON
                softwareJSON = {
                    "name": software['Name'],
                    "version": software['Version'],
                    "installDate": software['InstallDate'],
                    "associatedMachines": [machine['id']],
                    "vulnerabilities": vulnerabilities
                }
                # 7. Insert the software in the Softwares collection
                insert_software(softwareJSON)
            # "MACHINE FOUND" and "ERROR" need no further action
    
            

In [None]:
# Start the program: run main() only when this file is executed directly,
# not when it is imported as a module
if __name__ == "__main__":
    main()