In [1]:
import fitz
import re
import json
import os

In [24]:
def cleanText(text):
    """Remove two fixed text patterns from ``text`` and trim the result.

    The patterns, matched case-insensitively and with ``.`` allowed to span
    newlines, are:

    * everything from "Dirección de Seguridad" up to and including the next
      "Página<any char><whitespace><n> de <m>" marker (optional trailing dot);
    * the "This document is for GBM Internal Use ..." notice.

    Args:
        text: String to clean.

    Returns:
        ``text`` with every occurrence of the patterns removed and leading and
        trailing whitespace stripped.
    """
    unwantedText = re.compile(
        r"Direcci\u00f3n de Seguridad.*?P\u00e1gina.\s+\d+\s+de\s+\d+\.?"
        r"|This document is for GBM Internal Use\.? Total or partial reproduction is prohibited\.?",
        re.IGNORECASE | re.DOTALL,
    )
    withoutUnwanted = unwantedText.sub("", text)
    return withoutUnwanted.strip()

In [18]:
def _firstGroup(pattern, text):
    """Return the stripped first capture group of ``pattern`` in ``text``, or None when there is no match."""
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else None


def getVulnerabilities(pdfPath, serversToSearch, startPage=10):
    """Extract vulnerability entries that mention given servers from a PDF and save them as JSON.

    Pages are scanned from ``startPage`` onwards. A page is considered only if
    its text contains any of ``serversToSearch`` (case-insensitive substring
    match) and a "Description" heading. When the Description/Recommendation
    block is not complete on that page, the following pages are appended until
    one containing a "Links" line is found.

    The result is written next to the PDF, using the PDF path with its
    extension replaced by ``.json``. Each entry is stored under the key
    ``Vulnerability<N>`` with the fields Page (1-based), Title, Criticality,
    AffectedAssets, Description and Recommendation. All values are stored as
    strings, so a field that could not be extracted is saved as the text
    ``"None"``.

    Args:
        pdfPath: Path of the PDF report to read.
        serversToSearch: Iterable of server names to look for in the page text.
        startPage: Zero-based index of the first page to scan (default 10).

    Returns:
        None. The JSON file is the output; a confirmation line is printed.
    """
    # Regex patterns used to locate each field in the extracted page text.
    titlePattern = r"\n([^\n]*\n[^\n]*)\nCVSS"
    criticalityPattern = r"Criticality\s+(?:CVSS:[^\n]+(?:\n\d+\.\d+)?)?\s*(Critical|High|Medium|Low)"
    affectedAssetsPattern = r"Affected Assets\s+([\s\S]*?)\nDescription"
    descriptionPattern = r"Description\s+([\s\S]*?)\nRecommendation"
    recommendationPattern = r"Recommendation\s+([\s\S]*?)\nLinks"
    descriptionPatternAux = r"Description\s+([\s\S]*)"
    linksPatternAux = "([\\s\\S]*)\nLinks"

    # Lower-case the server names once instead of once per page.
    serversLower = [server.lower() for server in serversToSearch]

    targetPage = 0  # Pages before this index were already consumed by a multi-page entry
    data = {}  # Dictionary that becomes the JSON document
    vulnerabilityNumber = 1

    # The context manager guarantees the PDF handle is released, even on error.
    with fitz.open(pdfPath) as pdf:
        for pageNumber in range(startPage, pdf.page_count):

            # Skip pages until the assigned target is reached.
            if pageNumber < targetPage:
                continue

            text = pdf.load_page(pageNumber).get_text()
            textLower = text.lower()
            if not any(server in textLower for server in serversLower):
                continue

            title = _firstGroup(titlePattern, text)
            if title is not None:
                title = cleanText(title)
            criticality = _firstGroup(criticalityPattern, text)
            affectedAssets = _firstGroup(affectedAssetsPattern, text)

            # Everything after the "Description" heading on this page.
            descriptionTail = _firstGroup(descriptionPatternAux, text)
            if descriptionTail is None:
                continue
            textAux = "Description\n" + descriptionTail

            # Last page whose text was used for this entry. Initialised here so it
            # is always defined, even when the entry is on the final page.
            lastPageRead = pageNumber

            # Some descriptions span more than one page. Only look at the following
            # pages when this page does not already hold the whole
            # Recommendation ... Links block; otherwise the scan would run on to a
            # later "Links" page and the pages in between would be skipped.
            if not re.search(recommendationPattern, textAux):
                for pageNumberAux in range(pageNumber + 1, pdf.page_count):
                    lastPageRead = pageNumberAux
                    nextText = pdf.load_page(pageNumberAux).get_text()

                    linksMatch = re.search(linksPatternAux, nextText)
                    if linksMatch:
                        textAux += "\n" + linksMatch.group(1).strip() + "\nLinks"
                        break
                    textAux += "\n" + nextText

            description = _firstGroup(descriptionPattern, textAux)
            if description is not None:
                description = cleanText(description)

            recommendation = _firstGroup(recommendationPattern, textAux)
            if recommendation is not None:
                recommendation = cleanText(recommendation)

            data[f"Vulnerability{vulnerabilityNumber}"] = {
                "Page" : f"{pageNumber+1}",
                "Title": f"{title}",
                "Criticality": f"{criticality}",
                "AffectedAssets": f"{affectedAssets}",
                "Description": f"{description}",
                "Recommendation": f"{recommendation}"
            }

            vulnerabilityNumber += 1
            # Resume at the last page read (it is processed again, not skipped).
            targetPage = lastPageRead

    # Swap the extension on the full path so a bare file name stays in the
    # current directory instead of resolving to "/<name>.json".
    jsonPath = os.path.splitext(pdfPath)[0] + ".json"
    with open(jsonPath, "w") as file:
        json.dump(data, file, indent=4)
        print("Documento generado")

In [25]:
# Report selection. Leave exactly one pdfPath and one serversToSearch active.
# The grouping below is inferred from the file and server names - confirm
# before switching reports.

# SPEI report
# pdfPath = "C:/Users/43263/Documents/Python/Enero2025PRB/InfraSec-EscaneoMensual_SPEI_Enero2025.pdf"
# serversToSearch = ["GBMSPEIDRIVE3", "GBMSPEIDRIVE"]

# SWIFT report
# pdfPath = "C:/Users/43263/Documents/Python/Enero2025PRB/InfraSec-EscaneoMensual_SWIFT_Enero2025.pdf"
# serversToSearch = ["GBMSWIFT01", "GBMSWIFTAC01", "GBMDBSWIFT01"]

# Canales Electrónicos report (active)
serversToSearch = ["AWCLEARINGS"]
pdfPath = "C:/Users/43263/Documents/Python/Enero2025PRB/InfraSec-EscaneoMensual_Canales_Electrónicos_Enero2025.pdf"

# NOTE(review): the path is absolute and machine-specific; adjust it when
# running the notebook elsewhere.
getVulnerabilities(pdfPath=pdfPath, serversToSearch=serversToSearch)

Documento generado
