# 1. INSERT DATA INTO MONGO DB
## Converting the raw data into a JSON format and then inserting it into the MongoDB database


In [1]:
from dotenv import load_dotenv
import os
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from pathlib import Path
import json

print("Starting the script")
env_path = Path('.env')
load_dotenv()
uri = os.getenv("MONGO_URI")
print(uri)


client = MongoClient(uri, server_api=ServerApi('1'))
db = client["pinewheel"]
collection = db["network_scans"] 
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
    
    # Load my JSON data
    with open('data.json', 'r') as f:
        data = json.load(f)

    # Insert data into MongoDB
    collection.insert_one(data)
except Exception as e:
    print(e)
    
    

Starting the script
mongodb+srv://nimishkumar0305:nimish0305@pinewheel.jtw34.mongodb.net/?retryWrites=true&w=majority&appName=pinewheel
Pinged your deployment. You successfully connected to MongoDB!


# 2. BUILD A GRAPH FROM MONGO DB
## Creating a graph from the data in MongoDB and then storing it in a ChromaDB Vector database


In [39]:
import numpy as np
import networkx as nx
import chromadb
from dotenv import load_dotenv
import os
from pathlib import Path
from pymongo import MongoClient
from pymongo.server_api import ServerApi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import time

# Load environment variables
env_path = Path('.env')
load_dotenv()

# MongoDB URI from .env
uri = os.getenv("MONGO_URI")

# MongoDB connection
client = MongoClient(uri, server_api=ServerApi('1'))
db = client["pinewheel"]
collection = db["network_scans"]

# Initialize Chroma DB client
client_chroma = chromadb.Client()
collection_chroma = client_chroma.get_or_create_collection(
    "scan_graph", 
    embedding_function=chromadb.embedding_functions.SklearnEmbeddingFunction(svd, vectorizer)
)


# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Example corpus to fit the vectorizer (e.g., text from your graph nodes)
sample_corpus = [
    "New scan report detected.",
    "Nmap scan results indicate open ports.",
    "FFUF scan identified hidden directories.",
    "Network vulnerability detected in scan."
]
# Fit the vectorizer on the corpus
vectorizer.fit(sample_corpus)  # Fitting the vectorizer

# Initialize TruncatedSVD
num_features = len(vectorizer.get_feature_names_out())
n_components = min(300, num_features)
svd = TruncatedSVD(n_components=n_components)

# Fit the SVD model with the entire TF-IDF matrix
tfidf_matrix = vectorizer.transform(sample_corpus)  # Transform the sample corpus
svd.fit(tfidf_matrix)  # Fit SVD on the transformed TF-IDF matrix

# Initialize NetworkX graph
graph = nx.Graph()

# Rate limit parameters for the Free Plan
MAX_RPM = 3  # Requests per minute
request_counter = 0
last_request_time = time.time()

# Helper function to manage rate limits
def rate_limit_guard():
    global request_counter, last_request_time
    current_time = time.time()
    time_diff = current_time - last_request_time

    if time_diff < 60:
        if request_counter >= MAX_RPM:
            time_to_wait = 60 - time_diff
            print(f"Rate limit hit. Sleeping for {time_to_wait:.2f} seconds...")
            time.sleep(time_to_wait)
            request_counter = 0

    request_counter += 1
    last_request_time = current_time

# Helper function to get text embeddings using TF-IDF and SVD
def get_text_embedding(text):
    rate_limit_guard()
    
    # Transform the text using the fitted TF-IDF vectorizer
    tfidf_matrix = vectorizer.transform([text])
    
    # Transform the TF-IDF matrix using the already fitted SVD model
    embedding = svd.transform(tfidf_matrix)  # Apply the fitted SVD to reduce dimensionality
    
    # Flatten the 2D array to a 1D array for storage
    return embedding.flatten()

# Graph and MongoDB data ingestion functions remain unchanged
def add_or_update_node(graph, node_data):
    node_id = node_data.get("host") or node_data.get("url")
    if graph.has_node(node_id):
        print(f"Node {node_id} exists, updating...")
        graph.nodes[node_id].update(node_data)
    else:
        print(f"Node {node_id} does not exist, creating...")
        graph.add_node(node_id, **node_data)

def add_or_update_edge(graph, node1, node2, edge_data):
    if graph.has_edge(node1, node2):
        print(f"Edge between {node1} and {node2} exists, updating...")
        graph[node1][node2].update(edge_data)
    else:
        print(f"Edge between {node1} and {node2} does not exist, creating...")
        graph.add_edge(node1, node2, **edge_data)

def ingest_data_from_mongo():
    scans = collection.find()
    for scan in scans:
        if "nmap_scan_1" in scan:
            nmap_data = scan["nmap_scan_1"]
            add_or_update_node(graph, {
                "host": nmap_data["host"],
                "status": nmap_data["status"],
                "latency": nmap_data["latency"],
                "ports": nmap_data["ports"]
            })
            for port in nmap_data["ports"]:
                add_or_update_edge(graph, nmap_data["host"], port["port"], {
                    "protocol": port["protocol"],
                    "state": port["state"],
                    "service": port["service"]
                })
        
        if "ffuf_scan" in scan:
            ffuf_data = scan["ffuf_scan"]
            add_or_update_node(graph, {
                "url": ffuf_data["url"],
                "status_filter": ffuf_data["status_filter"],
                "results": ffuf_data["results"]
            })
            for result in ffuf_data["results"]:
                add_or_update_edge(graph, ffuf_data["url"], result["url"], {
                    "status": result["status"],
                    "size": result["size"],
                    "duration": result["duration"]
                })

def handle_text_data(text):
    # Get the text embedding
    embedding = get_text_embedding(text)
    
    # Store the text node in Chroma DB
    collection_chroma.add(
        ids=[text],  # Pass the `id` as a separate string argument
        embeddings=[embedding],  # Pass the embedding as a list
        metadatas=[{"text": text}]  # Metadata should be in a list
    )
    
    # Return stored text data for graph usage
    return {"text": text, "embedding": embedding}

def check_and_update_existing_nodes(new_data, node_id):
    if node_id in graph:
        existing_node_data = graph.nodes[node_id]
        for key, value in new_data.items():
            if key not in existing_node_data or existing_node_data[key] != value:
                existing_node_data[key] = value
        print(f"Node {node_id} updated.")
    else:
        graph.add_node(node_id, **new_data)
        print(f"Node {node_id} added.")

def update_graph_with_new_data(scan_data):
    for scan in scan_data:
        if "nmap_scan_1" in scan:
            nmap_data = scan["nmap_scan_1"]
            node_id = nmap_data["host"]
            new_data = {
                "status": nmap_data["status"],
                "latency": nmap_data["latency"],
                "ports": nmap_data["ports"]
            }
            check_and_update_existing_nodes(new_data, node_id)

def build_and_update_graph():
    ingest_data_from_mongo()
    text_data = handle_text_data("New scan report detected")
    scan_data = [
        {"nmap_scan_1": {"host": "10.10.10.1", "status": "up", "latency": 20, "ports": [{"port": 80, "protocol": "tcp", "state": "open", "service": "http"}]}},
        {"ffuf_scan": {"url": "http://example.com", "status_filter": "200", "results": [{"url": "/admin", "status": 200, "size": 512, "duration": 1.5}]}}
    ]
    update_graph_with_new_data(scan_data)

# Execute graph build and update process
build_and_update_graph()

AttributeError: module 'chromadb' has no attribute 'embedding_functions'

# SAVING GRAPH TO MongoDB
## Store nodes in a nodes collection and edges in an edges collection.


In [13]:
node_collection = db["graph_nodes"]  # Collection for nodes
edge_collection = db["graph_edges"]  # Collection for edges

# Check if collections are created
print("Available collections:", db.list_collection_names())

Available collections: ['network_scans']


In [14]:
# Saving the graph nodes and edges to MongoDB
def save_graph_to_mongo():
    for node_id, node_data in graph.nodes(data=True):
        node_collection.update_one(
            {"node_id": node_id},  # Use node_id as a unique identifier
            {"$set": {"node_id": node_id, **node_data}},  # Save the node's data
            upsert=True  # If node doesn't exist, insert it
        )
    
    for node1, node2, edge_data in graph.edges(data=True):
        edge_collection.update_one(
            {"source": node1, "target": node2},  # Use source and target as identifiers
            {"$set": {"source": node1, "target": node2, **edge_data}},  # Save the edge's data
            upsert=True  # If edge doesn't exist, insert it
        )
    print("Graph saved to MongoDB")

# Save the graph to MongoDB
save_graph_to_mongo()
print("SAVED GRAPH TO MONGO")

Graph saved to MongoDB
SAVED GRAPH TO MONGO


In [40]:
# Verifying the saved nodes and edges in MongoDB
def verify_saved_graph():
    # Fetch all nodes from MongoDB
    nodes = node_collection.find()
    print("Nodes in MongoDB:")
    for node in nodes:
        print(node)

    # Fetch all edges from MongoDB
    edges = edge_collection.find()
    print("\nEdges in MongoDB:")
    for edge in edges:
        print(edge)

# Verify the saved graph
verify_saved_graph()

Nodes in MongoDB:
{'_id': ObjectId('673362f400d1c230affbf3b6'), 'node_id': '10.10.11.248', 'host': '10.10.11.248', 'latency': '0.057s', 'ports': [{'port': 22, 'protocol': 'tcp', 'state': 'open', 'service': 'ssh', 'reason': 'syn-ack'}, {'port': 80, 'protocol': 'tcp', 'state': 'open', 'service': 'http', 'reason': 'syn-ack'}, {'port': 389, 'protocol': 'tcp', 'state': 'open', 'service': 'ldap', 'reason': 'syn-ack'}, {'port': 443, 'protocol': 'tcp', 'state': 'open', 'service': 'https', 'reason': 'syn-ack'}, {'port': 5667, 'protocol': 'tcp', 'state': 'open', 'service': 'unknown', 'reason': 'syn-ack'}], 'status': 'up'}
{'_id': ObjectId('673362f400d1c230affbf3fc'), 'node_id': 22}
{'_id': ObjectId('673362f400d1c230affbf48c'), 'node_id': 80}
{'_id': ObjectId('673362f400d1c230affbf4bc'), 'node_id': 389}
{'_id': ObjectId('673362f400d1c230affbf4ea'), 'node_id': 443}
{'_id': ObjectId('673362f400d1c230affbf503'), 'node_id': 5667}
{'_id': ObjectId('673362f400d1c230affbf510'), 'node_id': 'https://nagio

# LOAD GRAPH FROM MongoDB

In [41]:
# Loading the graph from MongoDB
def load_graph_from_mongo():
    # Initialize a new NetworkX graph
    graph = nx.Graph()
    
    # Load nodes from MongoDB
    nodes = node_collection.find()
    for node in nodes:
        graph.add_node(node['node_id'], **{key: value for key, value in node.items() if key != 'node_id'})
    
    # Load edges from MongoDB
    edges = edge_collection.find()
    for edge in edges:
        graph.add_edge(edge['source'], edge['target'], **{key: value for key, value in edge.items() if key not in ['source', 'target']})
    
    print("Graph loaded from MongoDB")
    return graph

# Example usage to load graph from MongoDB
graph = load_graph_from_mongo()

Graph loaded from MongoDB


# INFERENCE PIPELINE


In [45]:
import numpy as np
import networkx as nx
import chromadb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import time

# Initialize Chroma DB client
client_chroma = chromadb.Client()
collection_chroma = client_chroma.get_collection("scan_graph")

# Initialize NetworkX graph (assuming you've already created this earlier)
graph = nx.Graph()

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Function to fit the vectorizer on a corpus (or graph nodes' text data)
def fit_vectorizer_on_corpus(corpus):
    vectorizer.fit(corpus)

# Function to embed text data using TF-IDF
def get_text_embedding(text):
    # Transform the text using TF-IDF vectorizer
    return vectorizer.transform([text]).toarray()

# Function to create a text representation for each node
def create_node_text_representation(node):
    # Combine node_id and port info into a single string for the node
    node_id = node.get("node_id", "")
    ports = node.get("ports", [])
    port_info = ", ".join([f"{port['protocol']} port {port['port']} (state: {port['state']})" for port in ports])
    
    # Create a text representation by combining node_id and port info
    return f"Node {node_id}: {port_info}"

# Function to extract relevant context from the graph based on cosine similarity
def extract_relevant_context(query, top_n=3):
    # Embed the query using TF-IDF
    query_embedding = get_text_embedding(query)

    # Get embeddings of all nodes in the graph (we assume node data contains text)
    node_embeddings = {}
    for node in graph.nodes(data=True):
        node_text = create_node_text_representation(node[1])  # Create text from node attributes
        node_embeddings[node[0]] = get_text_embedding(node_text)

    # Calculate cosine similarities between the query and each node's text embedding
    similarities = {}
    for node_id, embedding in node_embeddings.items():
        similarity = cosine_similarity(query_embedding, embedding)
        similarities[node_id] = similarity[0][0]

    # Sort nodes by similarity and pick the top N most similar nodes
    top_nodes = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:top_n]

    # Extract the text of the most similar nodes as the context
    context = " ".join([create_node_text_representation(graph.nodes[node_id]) for node_id, _ in top_nodes])

    return context

# Function to generate the answer based on the relevant context from the graph
def generate_answer_with_graph(query):
    # Step 1: Extract relevant context using the graph
    context = extract_relevant_context(query)

    # Step 2: Use the context to generate an answer
    # (In this case, we're just returning the context directly)
    answer = context

    return answer

# Example Query
query = "What ports are open on the target server?"

# Sample corpus of text from which to fit the vectorizer
sample_corpus = [
    "New scan report detected.",
    "Nmap scan results indicate open ports.",
    "FFUF scan identified hidden directories.",
    "Network vulnerability detected in scan."
    # Add more sample text to fit the vectorizer
]

# Fit the vectorizer on the sample corpus
fit_vectorizer_on_corpus(sample_corpus)

# Example of adding nodes to the graph with meaningful text data
graph.add_node(1, node_id="10.10.11.248", ports=[{'port': 22, 'protocol': 'tcp', 'state': 'open', 'service': 'ssh'}, {'port': 80, 'protocol': 'tcp', 'state': 'open', 'service': 'http'}])
graph.add_node(2, node_id="22", ports=[{'port': 22, 'protocol': 'tcp', 'state': 'open', 'service': 'ssh'}])
graph.add_node(3, node_id="80", ports=[{'port': 80, 'protocol': 'tcp', 'state': 'open', 'service': 'http'}])
graph.add_node(4, node_id="443", ports=[{'port': 443, 'protocol': 'tcp', 'state': 'open', 'service': 'https'}])

# Benchmark the time taken to generate an answer
start_time = time.time()

# Get the answer from the graph-based RAG pipeline
answer = generate_answer_with_graph(query)

end_time = time.time()

# Time taken for inference
inference_time = end_time - start_time

# Print results
print(f"Generated Answer: {answer}")
print(f"Inference Time: {inference_time:.4f} seconds")

Generated Answer: Node 10.10.11.248: tcp port 22 (state: open), tcp port 80 (state: open) Node 22: tcp port 22 (state: open) Node 80: tcp port 80 (state: open)
Inference Time: 0.0024 seconds
