In [7]:
import os
import json
from langchain_core.documents import Document
from mem0 import MemoryClient
from utils.utils import *

In [8]:
# Both API keys must already be present in the process environment.
# Writing os.environ[name] = os.environ.get(name) back onto itself is a no-op
# when the variable is set and raises an opaque
# "TypeError: str expected, not NoneType" when it is not, so check explicitly
# and report exactly which variables are missing.
_missing_keys = [
    key for key in ("MEM0_API_KEY", "OPENAI_API_KEY") if key not in os.environ
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
    )

# MemoryClient is constructed without arguments; presumably it picks up
# MEM0_API_KEY from the environment -- TODO confirm against the mem0 docs.
mem0_client = MemoryClient()

In [9]:
# Presumably the mem0 collection name (inferred from the variable name only);
# NOTE(review): no cell visible in this notebook section reads it -- confirm it is still needed.
mem0_collection: str = "mitre_attack"

In [10]:
def initialize_attack_db(attack_data_path):
    """Parse a MITRE ATT&CK JSON bundle into documents and matching ids.

    Every entry of the bundle's ``objects`` list whose ``type`` is
    ``"attack-pattern"`` and whose first external reference has an
    ``external_id`` starting with ``"T"`` becomes one ``Document``. Objects
    with missing, empty or null reference data are skipped instead of
    raising. Nothing is written to a vector store here; the function only
    builds the documents.

    Args:
        attack_data_path: Path to the ATT&CK JSON bundle on disk.

    Returns:
        A two-element list ``[documents, ids]``. ``documents`` holds the
        ``Document`` objects; ``ids`` is the parallel list of string ids.
        Each id is the object's index in the bundle's ``objects`` list, so
        ids are not contiguous (non-technique objects leave gaps).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """

    print("Loading MITRE ATT&CK data...")

    # Read as UTF-8 explicitly rather than relying on the platform default
    # encoding, which is not UTF-8 on every system.
    with open(attack_data_path, "r", encoding="utf-8") as f:
        attack_data = json.load(f)

    documents = []
    ids = []

    for i, technique in enumerate(attack_data.get("objects", [])):

        if technique.get("type") == "attack-pattern":

            # `or [{}]` covers both a missing key and a present-but-empty
            # list; a plain .get(..., [{}]) default would still hit
            # IndexError on `[]`.
            references = technique.get("external_references") or [{}]
            # `or ""` guards against an explicit null external_id.
            technique_id = references[0].get("external_id") or ""

            # Only ids starting with "T" are kept; anything else is skipped.
            if not technique_id.startswith("T"):
                continue

            name = technique.get("name", "")
            description = technique.get("description", "")
            tactics = [
                phase["phase_name"] for phase in technique.get("kill_chain_phases", [])
            ]
            platforms = technique.get("x_mitre_platforms", [])
            detection = technique.get("x_mitre_detection", "")
            data_sources = technique.get("x_mitre_data_sources", [])

            # clean_text is not defined in this notebook (it comes from a
            # star import); its exact normalisation is not visible here.
            name = clean_text(name)
            description = clean_text(description)
            detection = clean_text(detection)

            content = f"""
                # {technique_id}: {name}

                ## Description
                {description}

                ## Tactics
                {', '.join(tactics)}

                ## Platforms
                {', '.join(platforms)}

                ## Detection
                {detection}

                ## Data Sources
                {', '.join(data_sources)}
                """

            doc = Document(
                page_content=content,
                metadata={
                    "type": "MITRE_ATTACK",
                    "technique_id": technique_id,
                    "name": name,
                    "tactics": tactics,
                },
                id=str(i),
            )

            documents.append(doc)
            ids.append(str(i))

    return [documents, ids]

In [None]:
from services.ollama_service import *

# Parse the ATT&CK bundle; `documents` and `ids` are parallel lists and are
# reused by the storage step further down.
documents, ids = initialize_attack_db("mitre_data/enterprise-attack.json")
print(f"Processed {len(documents)} MITRE ATT&CK techniques")

# Embed every document. generate_embeddings is not defined in this notebook;
# presumably it is provided by one of the star imports -- TODO confirm.
print("Generating embeddings...")
embeddings = generate_embeddings(documents)

Loading MITRE ATT&CK data...
Processed 799 MITRE ATT&CK techniques
Generating embeddings...


In [12]:
from services.qdrant_service import *
from qdrant_client import QdrantClient

# Actually perform the environment check: without it a missing QDRANT_URL is
# passed on as url=None and the client would not target the intended server
# (NOTE(review): qdrant-client appears to fall back to a local default in
# that case -- confirm against its documentation).
qdrant_url = os.environ.get("QDRANT_URL")
if not qdrant_url:
    raise RuntimeError("QDRANT_URL environment variable is not set")

# The API key is left optional; it is forwarded as-is (possibly None).
qdrant_api_key = os.environ.get("QDRANT_API_KEY")

qdrant_client = QdrantClient(
    url=qdrant_url,
    api_key=qdrant_api_key,
)

# Store in Qdrant
print("Storing in Qdrant...")

# The ids arrive as strings and are converted to integers before storage.
# Re-running this cell is safe because int() accepts values that are already int.
ids = [int(x) for x in ids]  # Convert IDs to integers

# store_in_qdrant is not defined in this notebook (presumably from
# services.qdrant_service); the last argument is the target collection name.
store_in_qdrant(documents, embeddings, ids, qdrant_client, 'mitre-attack-nomic')

print("MITRE ATT&CK vector database initialized successfully!")

Storing in Qdrant...
Successfully stored 799 MITRE ATT&CK techniques in Qdrant
MITRE ATT&CK vector database initialized successfully!
