# Iterating over Requirement-JSON and associating Toolings

In [None]:
%%time

from IPython.display import clear_output

! pip install -qq -U tiktoken
! pip install -qq -U pypdf
! pip install -qq -U faiss-gpu

! pip install sentence_transformers==2.2.2
! pip install -qq -U InstructorEmbedding

! pip install -qq -U transformers 
! pip install -qq -U accelerate

! pip install flash-attn

clear_output()

In [None]:
%%time

! pip install -qq -U bitsandbytes

clear_output()

In [None]:
import json
import re
import warnings
warnings.filterwarnings("ignore")
import gc
import torch
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)

json_path = '/kaggle/input/bsi-itgs-json/recordsBDEW.json'

class JSONLoader:
    def __init__(self, file_path: str):
        self.file_path = file_path

    def load(self):
        try:
            # Lesen der ersten 3000 Zeichen zur Überprüfung
            with open(self.file_path, 'r', encoding='utf-8-sig') as file:
                sample_text = file.read(3000)
                #print(sample_text)
                file.seek(0)
                
                # Bereinigung der Datei und Laden des JSON
                cleaned_data = ""
                for line in file:
                    # Entferne ungültige Steuerzeichen (zum Beispiel unescaped backslashes)
                    cleaned_line = re.sub(r'[\x00-\x1F\x7F]', '', line)
                    cleaned_data += cleaned_line
                data = json.loads(cleaned_data)   
            print(f"Datei '{self.file_path}' erfolgreich geladen.")
            return data
        except Exception as e:
            print(f"Fehler beim Laden der Datei: {e}")
            return None
    
loader = JSONLoader(json_path)
documents = loader.load()

In [None]:
if documents:
    for i, requirement in enumerate(documents[:5]):
        print(f"Anforderung {i + 1}:")
        print(f"ID: {requirement.get('c.id')}")
        print(f"Title: {requirement.get('c.title')}")
        print(f"Statement: {requirement.get('c.statement')}")
        print("-" * 40)

else:
    print("Die Daten konnten nicht geladen werden.")

In [None]:
model_name = 'microsoft/Phi-3-mini-128k-instruct'

In [None]:
def build_model(model_repo = model_name):

    print('\nDownloading model: ', model_repo, '\n\n')

    ### tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_repo)

    ### quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type = "nf4",
        bnb_4bit_compute_dtype = torch.float16,
        bnb_4bit_use_double_quant = True,
    )        

    ### model
    model = AutoModelForCausalLM.from_pretrained(
        model_repo,
        quantization_config = bnb_config,
        device_map = 'auto',
        low_cpu_mem_usage = True,
        trust_remote_code = True,
    )

    return tokenizer, model

In [None]:
%%time

tokenizer, model = build_model(model_repo = model_name)

clear_output()

In [None]:
model.eval()

---
## Beispiele einbinden 
(1x richtig, 1x falsch) damit Modell lernt - in Prompt == ***Few-Shot Leaning***

0-2-4 (Beispiele) für BA nutzen und abwägen wie sehr sich Performance/Genauigkeit verbessert

Tendenz untersuchen zwischen Anzahl max_tokens und Genauigkeit??




#### **Notebook vorbereiten, damit man nach dem Theorie-Teil gut in den Methodik-Teil wieder einsteigen kann!**

---

In [None]:
%%time


terminators = [
    tokenizer.eos_token_id,
    tokenizer.bos_token_id
]

# Pipeline für Textgenerierung laden
text_generation = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    eos_token_id = terminators,
    max_new_tokens = 2000,
    temperature = 0.1,
)

# Funktion zur Generierung von Prompts und zur Überprüfung der Anforderungen
def check_requirements_against_tools(documents, tools):
    
    results = []
    
    requirements = documents[:6]

    for requirement in requirements:
        prompt = f"""
            You are an expert assistant with extensive knowledge in evaluating tools for specific requirements.
            Your task is to assess whether the given tools can be used to test the specified requirement.

            Please evaluate the tools based on the information provided:

            Tools: {tools}

            Evaluate if any of the tools can be used to test the requirement.
            Choose only the most suitable tool for each requirement. 
            If none of the tools are suitable, use 'none' for all fields.
            If the requirement is about documenting or planning something, also use 'none' for all fields of the tool.

            Here are some examples to guide your evaluation:

            Example 1:
            {{
                "title": "Protection Against Denial-of-Service Attacks (H)",
                "id": "APP.3.2.A18",
                "statement": "A web server SHOULD be monitored continuously. Furthermore, safeguards that prevent or at least mitigate DDoS attacks SHOULD be defined and implemented.",
                "tools": [
                    {{
                        "name": "Hping-three",
                        "description": "Hping-three is an advanced open-source network tool used for crafting and sending custom TCP/IP packets. It is widely utilized by network administrators, security professionals, and ethical hackers for testing and auditing network security.",
                        "reason": "Hping-three can simulate DDoS attacks to test and validate the effectiveness of mitigation strategies.",
                        "applicability": "Strongly suitable"
                    }}
                ]
            }}

            Example 2:
            {{
                "title": "Appointing Contact Persons [Central Administration] (S)",
                "id": "APP.3.2.A20",
                "statement": "An organisation that maintains extensive websites SHOULD designate a contact person for them. Processes, procedures, and persons responsible for problems or security incidents SHOULD be specified. The organisation SHOULD post a contact option on its website that enables external parties to report security issues to the organisation. The organisation SHOULD define processes for handling external security reports.",
                "tools": [
                    {{
                        "name": "none",
                        "description": "none",
                        "reason": "none",
                        "applicability": "none"
                    }}
                ]
            }}

            Note: Any requirement related to documenting or planning tasks should automatically have 'none' for the tool, as there is no tool available to test such requirements.
            
            Example 3:
            {{
                "title": "Analysis of Log Data",
                "id": "APP.3.6.A15",
                "statement": "DNS server log data SHOULD be checked regularly. The DNS server log data SHOULD be evaluated regularly. At a minimum, the following security-relevant events SHOULD be evaluated: number of DNS requests, number of errors in DNS requests, errors in extension mechanisms for DNS (EDNS), expiring zones, failed zone transfers, changes in the ratio of errors to DNS requests",
                "tools": [
                    {{
                        "name": "Wireshark",
                        "description": "Wireshark is a widely-used, open-source network protocol analyzer that allows users to capture and interactively analyze data traffic flowing through a network in real-time. It provides deep visibility into the network, making it possible to diagnose issues, troubleshoot network problems, and inspect the details of network communications.",
                        "reason": "Wireshark can analyze network traffic and provide insights into DNS requests and errors, though it is not specifically designed for DNS server log analysis.",
                        "applicability": "Moderately suitable"
                    }}
                ]
            }}
            
            Example 4:
            {{
                "title": "Secure Basic Configuration of a DNS Server",
                "id": "APP.3.6.A4",
                "statement": "A resolving DNS server MUST be configured to only accept requests from the internal network. If a resolving DNS server sends requests, it MUST use random source ports. If DNS servers delivering forged domain information are known, the resolving DNS server MUST be prevented from sending requests to these DNS servers. An advertising DNS server MUST be configured to always handle requests from the Internet iteratively. It MUST be ensured that DNS zone transfers between primary and secondary DNS servers function appropriately. Zone transfers MUST be configured so that they are only possible between primary and secondary DNS servers. Zone transfers MUST be limited to certain IP addresses. The version of the DNS server product used MUST be hidden.",
                "tools": [
                    {{
                        "name": "Nmap",
                        "description": "Nmap is a versatile open-source tool for network discovery and security auditing. It identifies active devices, open ports, and services, and can detect operating systems and software versions, making it essential for network administrators and security experts.",
                        "reason": "Nmap can be used to check for open ports and services on a DNS server, which is relevant to ensuring that the DNS server’s configuration is secure. However, it does not directly address all aspects of DNS server configuration.",
                        "applicability": "Moderately suitable"
                    }}
                ]
            }}



            Now, evaluate the following requirement:

            Requirement:
            ID: {requirement['c.id']}
            Title: {requirement['c.title']}
            Statement: {requirement['c.statement']}

            Answer:
        """
        
        # Text generieren
        output = text_generation(prompt, num_return_sequences=1)
        
        # Ergebnisse speichern
        results.append(output[0]['generated_text'])

    return results

# Beispiel für die Nutzung der Funktion
tools = [
    {"name": "Nmap","description": "Nmap is a versatile open-source tool for network discovery and security auditing. It identifies active devices, open ports, and services, and can detect operating systems and software versions, making it essential for network administrators and security experts."},
    {"name": "OpenVAS","description": "OpenVAS is a comprehensive open-source tool for vulnerability scanning and management. It identifies and assesses security vulnerabilities in networks and systems, making it a key tool for security audits."},
    {"name": "Wireshark","description": "Wireshark is a widely-used, open-source network protocol analyzer that allows users to capture and interactively analyze data traffic flowing through a network in real-time. It provides deep visibility into the network, making it possible to diagnose issues, troubleshoot network problems, and inspect the details of network communications."},
    {"name": "hping-three","description": "Hping-three is an advanced open-source network tool used for crafting and sending custom TCP/IP packets. It is widely utilized by network administrators, security professionals, and ethical hackers for testing and auditing network security. Hping-three is known for its versatility in manipulating packet headers, enabling users to generate custom network traffic for various testing purposes."},
    {"name": "iPerf","description": "iPerf is a robust, open-source tool designed for measuring and analyzing network bandwidth performance. It is widely used by network administrators, engineers, and researchers to conduct precise and controlled tests of network throughput, helping users identify potential bottlenecks and optimize network performance."},
    {"name": "Hydra","description": "Hydra is a powerful, open-source password-cracking tool used for conducting brute-force attacks on various network services and applications. It is known for its speed, flexibility, and extensive protocol support, making it essential for penetration testing and assessing password security."},
    {"name": "OpenSSL","description": "OpenSSL is a widely-used, open-source toolkit that provides robust cryptographic functionality for securing data communications over networks. It supports SSL and TLS protocols, enabling encrypted connections between clients and servers, and offers tools for cryptographic operations including key generation and certificate management."},
    {"name": "SELinux","description": "SELinux (Security-Enhanced Linux) is a powerful security module integrated into the Linux kernel that provides advanced access control mechanisms. It enforces Mandatory Access Control (MAC) policies, which enhance system security by restricting actions based on security labels assigned to files, processes, and resources."},
    {"name": "systemctl","description": "systemctl is a command-line utility for managing and interacting with the systemd system and service manager in Linux environments. It is used to control system services, manage system states, and monitor system performance."}
]

results = check_requirements_against_tools(documents, tools)

# Ergebnisse anzeigen
for result in results:
    print(result)
    print("#####################################################################################################")
    print("#####################################################################################################")
    print("#####################################################################################################")

- 6 Anforderungen mit 9x Tools = 9min 29s (FSL-0)
- 6 Anforderungen mit 9x Tools = 11min 42s (FSL-2)
- 6 Anforderungen mit 9x Tools = 14min 47s (FSL-4)

#### stats
- 1x Anforderung + 1x Tool (normale Länge) = 1min 1s | 1min
- 1x Anforderung + 1x Tool (2x Länge) = 2min 16s | 2min 17s
- 1x Anforderung + 1x Tool (0,5x Länge) = 2min 17s | 2min 17s
---
- 1x Anforderung und 2x Tools = 48,5s
- 1x Anforderung und 1x Tool = 1min 14s
- 2x Anforderung und 1x Tool = 3min 30s
- 2x Anforderung und 2x Tools = 1min 50s

---

# RAG ANSATZ

In [1]:
%%time

from IPython.display import clear_output

! pip install -qq -U langchain
! pip install -qq -U langchain_community
! pip install -qq -U langchain_experimental
! pip install -qq -U langchain-huggingface

! pip install -qq -U tiktoken
! pip install -qq -U pypdf
! pip install -qq -U faiss-gpu

! pip install sentence_transformers==2.2.2
! pip install -qq -U InstructorEmbedding

! pip install -qq -U transformers 
! pip install -qq -U accelerate
! pip install -qq -U bitsandbytes

! pip install flash-attn

clear_output()

CPU times: user 3.01 s, sys: 739 ms, total: 3.75 s
Wall time: 3min 41s


In [2]:
%%time

import warnings
warnings.filterwarnings("ignore")

import os
import glob
import textwrap
import time
import gc

import langchain

### loaders
from langchain.document_loaders import TextLoader, DirectoryLoader, PyPDFLoader

### splits
from langchain.text_splitter import RecursiveCharacterTextSplitter, RecursiveJsonSplitter

### prompts
from langchain import PromptTemplate

### vector stores
from langchain_community.vectorstores import FAISS

### models
from langchain.llms import HuggingFacePipeline
from langchain_huggingface import HuggingFacePipeline as HuggFacePipeline
from langchain.embeddings import HuggingFaceInstructEmbeddings

### retrievers
from langchain.chains import RetrievalQA

import torch

import transformers
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)

clear_output()

CPU times: user 11.3 s, sys: 1.73 s, total: 13 s
Wall time: 18.9 s


In [3]:
print('langchain:', langchain.__version__)
print('torch:', torch.__version__)
print('transformers:', transformers.__version__)

langchain: 0.2.16
torch: 2.1.2
transformers: 4.44.2


In [4]:
class CFG:
    DEBUG = False
    
    # LLM
    model_name = 'microsoft/Phi-3-mini-128k-instruct'
    temperature = 0.1
    top_p = 0.90
    repetition_penalty = 1.15
    max_len = 15000
    max_new_tokens = 5000

    # splitting
    split_chunk_size = 2600
    split_overlap = 300
    
    # embeddings
    embeddings_model_repo = 'BAAI/bge-base-en-v1.5'

    # similar passages
    k = 4
    
    # paths
    JSONs_path = '/kaggle/input/bsi-itgs-json/recordsBDEW_with_tools_FSL_4.json'
    Embeddings_path =  '/kaggle/input/bsi-itgs-json'
    Output_folder = './bsi-itgs-vectordb-json'

In [5]:
import json
import re

json_path = '/kaggle/input/bsi-itgs-json/recordsBDEW_with_tools_FSL_4.json'

class JSONLoader:
    def __init__(self, file_path: str):
        self.file_path = file_path

    def load(self):
        try:
            # Lesen der ersten 3000 Zeichen zur Überprüfung
            with open(self.file_path, 'r', encoding='utf-8-sig') as file:
                sample_text = file.read(3000)
                # print(sample_text)
                file.seek(0)
                
                # Bereinigung der Datei und Laden des JSON
                cleaned_data = ""
                for line in file:
                    # Entferne ungültige Steuerzeichen (zum Beispiel unescaped backslashes)
                    cleaned_line = re.sub(r'[\x00-\x1F\x7F]', '', line)
                    cleaned_data += cleaned_line
                data = json.loads(cleaned_data)   
            print(f"Datei '{self.file_path}' erfolgreich geladen.")
            return data
        except Exception as e:
            print(f"Fehler beim Laden der Datei: {e}")
            return None
    
loader = JSONLoader(json_path)
documents = loader.load()

Datei '/kaggle/input/bsi-itgs-json/recordsBDEW_with_tools_FSL_4.json' erfolgreich geladen.


In [6]:
# Funktion zum Erstellen eines Chunks für jedes JSON-Objekt
def split_into_individual_chunks(data):
    # Erstelle einen Chunk für jedes JSON-Objekt
    chunks = [[item] for item in data]
    return chunks

# Beispielhafte Verwendung der Funktion mit deinen geladenen Daten
chunks = split_into_individual_chunks(documents)

print(f'We have created {len(chunks)} chunks.')

# Ausgabe der ersten 3 Chunks
print("\nErste 3 Chunks:")
for i, chunk in enumerate(chunks[:3]):
    print(f"\nChunk {i+1}:")
    print(chunk)
print("#################################################################################")
    
# ------------------------------------------------------------------
# schauen ob der längste Chunk ordnungsgemäß getrennt wurde,
# oder ob Teile vom nächsten mit drin sind / was fehlt
# ------------------------------------------------------------------
def find_chunk_by_id(chunks, target_id):
    for chunk in chunks:
        if isinstance(chunk, list):
            for item in chunk:
                if isinstance(item, dict) and item.get('id') == target_id:
                    return chunk
    return None

target_id = '4.6.1'

chunk_with_target_id = find_chunk_by_id(chunks, target_id)
if chunk_with_target_id:
    print(f"\nChunk mit ID {target_id}:")
    print(chunk_with_target_id)
else:
    print(f"Kein Chunk gefunden mit ID {target_id}.")


We have created 35 chunks.

Erste 3 Chunks:

Chunk 1:
[{'title': 'Secure System Architecture', 'id': '4.1.1', 'statement': 'ISO/IEC 27002:2013 / 27019:2017: 9.4.1, 13.1.3, 14.2.5, 14.2.7, 17.2.1 Individual components and the entire system shall be designed and developed to support secure operations. Secure system design principles include: Security by design: The entire system and its individual components are designed on the basis of and with a focus on security. Deliberate attacks and unauthorised actions are explicitly taken into account while any repercussions arising from a security event are minimised by the system’s inherent design. Minimal need-to-know principle: Each component and each user is only assigned the rights they need to execute a desired action. Applications and network services, for example, are not run under administrator privileges, but only with the bare minimum of required system access rights. Defence-in-depth principle: Security risks are not tackled via sing

In [7]:
from langchain_core.documents import Document

# Erstelle die Dokumente
documents = []
for chunk in chunks:
    for inner_chunk in chunk:
        # Erstelle den erweiterten page_content
        tools_str = '\n'.join([
            f"Name: {tool['name']}; Description: {tool['description']}; Reason: {tool['reason']}; Applicability: {tool['applicability']}"
            for tool in inner_chunk.get('tools', [])
        ])
        
        page_content = (
            f"ID: {inner_chunk.get('id', '')}; "
            f"Title: {inner_chunk.get('title', '')}; "
            f"Statement: {inner_chunk.get('statement', '')}; "
            f"Tool: {tools_str}"
        )

        documents.append(Document(
            page_content=page_content,
            metadata={}  # Keine zusätzlichen Metadaten erforderlich, da sie im page_content enthalten sind
        ))

# Überprüfe die ersten paar Dokumente
print("Beispielhafte Dokumente nach der Umwandlung:")
for i, doc in enumerate(documents[:3]):
    print(f"Dokument {i}:")
    print(f"Content:\n\n{doc.page_content}")
    print(f"Metadata: {doc.metadata}")
    print("")

Beispielhafte Dokumente nach der Umwandlung:
Dokument 0:
Content:

ID: 4.1.1; Title: Secure System Architecture; Statement: ISO/IEC 27002:2013 / 27019:2017: 9.4.1, 13.1.3, 14.2.5, 14.2.7, 17.2.1 Individual components and the entire system shall be designed and developed to support secure operations. Secure system design principles include: Security by design: The entire system and its individual components are designed on the basis of and with a focus on security. Deliberate attacks and unauthorised actions are explicitly taken into account while any repercussions arising from a security event are minimised by the system’s inherent design. Minimal need-to-know principle: Each component and each user is only assigned the rights they need to execute a desired action. Applications and network services, for example, are not run under administrator privileges, but only with the bare minimum of required system access rights. Defence-in-depth principle: Security risks are not tackled via sing

In [8]:
%%time

### we create the embeddings if they do not already exist in the input folder
if not os.path.exists(CFG.Embeddings_path + '/embeddings'):
    
    print('Creating embeddings...\n\n')

    ### download embeddings model
    embeddings = HuggingFaceInstructEmbeddings(
        model_name = CFG.embeddings_model_repo,
        model_kwargs = {"device": "cuda"}
    )

    ### create embeddings and DB
    vectordb = FAISS.from_documents(
        documents = documents, 
        embedding = embeddings
    )

    vectordb.save_local(f"{CFG.Output_folder}/output") # save in output folder

clear_output()

CPU times: user 3.53 s, sys: 2.27 s, total: 5.81 s
Wall time: 20.6 s


In [9]:
%%time

embeddings = HuggingFaceInstructEmbeddings(
    model_name = CFG.embeddings_model_repo,
    model_kwargs = {"device": "cuda"}
)

vectordb = FAISS.load_local(
    CFG.Output_folder + '/output',
    embeddings,
    allow_dangerous_deserialization = True,
)

clear_output()

CPU times: user 107 ms, sys: 11.6 ms, total: 119 ms
Wall time: 118 ms


In [10]:
%%time

### test if vector DB was loaded correctly
vectordb.similarity_search('Network')

CPU times: user 133 ms, sys: 40.5 ms, total: 173 ms
Wall time: 183 ms


[Document(page_content='ID: 4.8.1; Title: Back-up: Concept, Method, Documentation, Testing; Statement: ISO/IEC 27002:2013 / 27019:2017: 12.1.1, 12.3.1 Documented and tested procedures for data back-up and recovery of the individual components resp. the entire system and the respective configurations shall exist. There shall be the possibility for central back-up of the configuration parameters of distributed components. After relevant system updates, the documentation and procedures shall be updated and retested accordingly.; Tool: Name: none; Description: none; Reason: none; Applicability: none'),
 Document(page_content='ID: 4.7.2; Title: Secure Update Processes; Statement: ISO/IEC 27002:2013 / 27019:2017: 12.5.1, 14.2.2, 14.2.3, 14.2.7, 14.2.9 The provision and installation of updates, extensions and patches needs to occur according to a defined process and in coordination with the client.; Tool: Name: none; Description: none; Reason: none; Applicability: none'),
 Document(page_conte

In [11]:
def build_model(model_repo = CFG.model_name):

    print('\nDownloading model: ', model_repo, '\n\n')

    ### tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_repo)

    ### quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type = "nf4",
        bnb_4bit_compute_dtype = torch.float16,
        bnb_4bit_use_double_quant = True,
    )        

    ### model
    model = AutoModelForCausalLM.from_pretrained(
        model_repo,
        quantization_config = bnb_config,
        device_map = 'auto',
        low_cpu_mem_usage = True,
        trust_remote_code = True,
    )

    return tokenizer, model

In [12]:
%%time

tokenizer, model = build_model(model_repo = CFG.model_name)

clear_output()

CPU times: user 18 s, sys: 15.8 s, total: 33.8 s
Wall time: 45.9 s


In [13]:
gc.collect()

51

In [14]:
model.eval()

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3LongRoPEScaledRotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear4bit(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_fe

In [15]:
%%time

from langchain_huggingface import HuggingFacePipeline

terminators = [
    tokenizer.eos_token_id,
    tokenizer.bos_token_id
]


### hugging face pipeline
pipe = pipeline(
    task = "text-generation",
    
    model = model,
    
    tokenizer = tokenizer,
    eos_token_id = terminators,
    
    do_sample = True,
    max_new_tokens = CFG.max_new_tokens,
    
    
    temperature = CFG.temperature,

)
#    top_p = CFG.top_p,
#    repetition_penalty = CFG.repetition_penalty,

### langchain pipeline
llm = HuggingFacePipeline(pipeline = pipe)

CPU times: user 2.44 ms, sys: 0 ns, total: 2.44 ms
Wall time: 2.2 ms


In [16]:
prompt_template = """
You are an expert in cybersecurity. Your task is to respond exclusively to the question provided and nothing else.
Given the question and context, evaluate all relevant requirements. 
Based on this evaluation, create a detailed test plan outlining how these requirements can be tested using the specified tools.
Make sure to include details from the requirement such as the title, ID, statement, and available tools.
If you don't know the answer, don't make up an answer.

Question: {question}

Context: {context}

Answer:
"""

PROMPT = PromptTemplate(
    template = prompt_template, 
    input_variables = ["context", "question"]
)

In [17]:
retriever = vectordb.as_retriever(
    search_type = "similarity",
    search_kwargs = {"k": CFG.k}
)

In [18]:
qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff",
    retriever = retriever, 
    chain_type_kwargs = {"prompt": PROMPT},
    return_source_documents = False,
    verbose = False
)

In [19]:
%%time

import textwrap
import time

def process_llm_response(llm_response):
    ans = llm_response['result']
    
    # Construct the sources_used string with IDs
    #sources_used = ' \n'.join(
    #    [
    #        f"ID: {source.metadata.get('id', 'unknown')}"
    #        for source in llm_response['source_documents']
    #    ]
    #)
    
    #ans = ans + '\n\nSources: \n' + sources_used

    pattern = ""
    index = ans.find(pattern)
    if index != -1:
        ans = ans[index + len(pattern):]    
    
    return ans.strip()

def llm_ans(query):
    start = time.time()
    
    llm_response = qa_chain.invoke(query)
    ans = process_llm_response(llm_response)
    
    end = time.time()

    time_elapsed = int(round(end - start, 0))
    time_elapsed_str = f'\n\nTime elapsed: {time_elapsed} s'
    return ans + time_elapsed_str

CPU times: user 10 µs, sys: 0 ns, total: 10 µs
Wall time: 13.8 µs


### GAP-Analyse

## Szenario 1

In [23]:
query = "How can I make sure only the bare minimum of services and devices are connected to my network?"
result = llm_ans(query)
clear_output()
print(result)

You are an expert in cybersecurity. Your task is to respond exclusively to the question provided and nothing else.
Given the question and context, evaluate all relevant requirements. 
Based on this evaluation, create a detailed test plan outlining how these requirements can be tested using the specified tools.
Make sure to include details from the requirement such as the title, ID, statement, and available tools.
If you don't know the answer, don't make up an answer.

Question: How can I make sure only the bare minimum of services and devices are connected to my network?

Context: ID: 4.4.4; Title: Secure Remote Access; Statement: ISO/IEC 27002:2013 / 27019:2017: 9.1.2, 9.4.1, 9.4.2 • a) It shall be possible to administrate, maintain and configure all components via an out-of-band network, e. g. via local access, a serial port, a network or direct control of the input devices (KVM). • b) Any remote access shall take place via centrally administrated access servers that are under contro

Expected: Nmap

## Szenario 2

In [21]:
query = "To enhance my companies security, I need to make sure there are no current vulnerabilities. How would I do this?"
result = llm_ans(query)
clear_output()
print(result)

You are an expert in cybersecurity. Your task is to respond exclusively to the question provided and nothing else.
Given the question and context, evaluate all relevant requirements. 
Based on this evaluation, create a detailed test plan outlining how these requirements can be tested using the specified tools.
Make sure to include details from the requirement such as the title, ID, statement, and available tools.
If you don't know the answer, don't make up an answer.

Question: To enhance my companies security, I need to make sure there are no current vulnerabilities. How would I do this?

Context: ID: 4.7.2; Title: Secure Update Processes; Statement: ISO/IEC 27002:2013 / 27019:2017: 12.5.1, 14.2.2, 14.2.3, 14.2.7, 14.2.9 The provision and installation of updates, extensions and patches needs to occur according to a defined process and in coordination with the client.; Tool: Name: none; Description: none; Reason: none; Applicability: none

ID: 4.1.10; Title: Documentation Requirements;

Expected: OpenVAS

## Szenario 3

In [24]:
query = "How can I test if my system is resistant against common attacks like the DoS-attacks?"
result = llm_ans(query)
clear_output()
print(result)

You are an expert in cybersecurity. Your task is to respond exclusively to the question provided and nothing else.
Given the question and context, evaluate all relevant requirements. 
Based on this evaluation, create a detailed test plan outlining how these requirements can be tested using the specified tools.
Make sure to include details from the requirement such as the title, ID, statement, and available tools.
If you don't know the answer, don't make up an answer.

Question: How can I test if my system is resistant against common attacks like the DoS-attacks?

Context: ID: 4.8.1; Title: Back-up: Concept, Method, Documentation, Testing; Statement: ISO/IEC 27002:2013 / 27019:2017: 12.1.1, 12.3.1 Documented and tested procedures for data back-up and recovery of the individual components resp. the entire system and the respective configurations shall exist. There shall be the possibility for central back-up of the configuration parameters of distributed components. After relevant system

Expected: hping-three

### ICS-Security-Kompendium

## Szenario 4

In [25]:
query = "How do I make sure that only encrypted data is transmitted in my network?"
result = llm_ans(query)
clear_output()
print(result)

You are an expert in cybersecurity. Your task is to respond exclusively to the question provided and nothing else.
Given the question and context, evaluate all relevant requirements. 
Based on this evaluation, create a detailed test plan outlining how these requirements can be tested using the specified tools.
Make sure to include details from the requirement such as the title, ID, statement, and available tools.
If you don't know the answer, don't make up an answer.

Question: How do I make sure that only encrypted data is transmitted in my network?

Context: ID: 4.1.5; Title: Encryption of Sensitive Data; Statement: ISO/IEC 27002:2013 / 27019:2017: 10.1.1, 12.4.2, 13.1.2, 18.1.3, 18.1.4 Confidential data shall only be stored resp. transmitted encrypted.; Tool: Name: Wireshark; Description: Wireshark is a widely-used, open-source network protocol analyzer that allows users to capture and interactively analyze data traffic flowing through a network in real-time. It provides deep visibi

Expected: Wireshark / OpenSSL

## Szenario 5

In [27]:
query = "I am not sure if every device that is communicating in my network, is in the networkplan. How can I ensure I am not missing any devices in my networkplan?"
result = llm_ans(query)
clear_output()
print(result)

You are an expert in cybersecurity. Your task is to respond exclusively to the question provided and nothing else.
Given the question and context, evaluate all relevant requirements. 
Based on this evaluation, create a detailed test plan outlining how these requirements can be tested using the specified tools.
Make sure to include details from the requirement such as the title, ID, statement, and available tools.
If you don't know the answer, don't make up an answer.

Question: I am not sure if every device that is communicating in my network, is in the networkplan. How can I ensure I am not missing any devices in my networkplan?

Context: ID: 4.4.3; Title: Documentation of Network Structure and Configuration; Statement: ISO/IEC 27002:2013 / 27019:2017: 8.1.1 The following shall be documented: network design and configuration; all physical, virtual and logical network connections and the employed protocols, IP addresses and ports; and any network perimeters that are part of the system 

Expected: Nmap / Wireshark