In [14]:
import os
import json
from langchain_core.documents import Document
from mem0 import MemoryClient
from utils.utils import *

In [15]:
# Both API keys must already be present in the environment. The previous
# self-assignment (os.environ[k] = os.environ.get(k)) was a no-op when the key
# was set and raised an opaque TypeError when it was not, because environment
# values must be strings. Fail fast with a message that names what is missing.
_missing_api_keys = [
    key for key in ("MEM0_API_KEY", "OPENAI_API_KEY") if key not in os.environ
]
if _missing_api_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_api_keys)
    )

# NOTE(review): MemoryClient() is built without arguments, so it presumably
# picks up MEM0_API_KEY from the environment itself -- confirm against mem0 docs.
mem0_client = MemoryClient()

In [16]:
# Collection name intended for the mem0 store.
# NOTE(review): not referenced by any other cell visible in this notebook --
# confirm it is still needed.
mem0_collection: str = "mitre_attack"

In [23]:
def initialize_attack_db(attack_data_path: str, id_offset: int = 799) -> list:
    """Build documents and point IDs from a MITRE ATT&CK STIX JSON bundle.

    Reads the bundle at ``attack_data_path``, keeps every ``attack-pattern``
    object whose first external reference carries an ID starting with ``"T"``,
    and renders each one into a markdown-style ``Document``.

    Args:
        attack_data_path: Path to the ATT&CK JSON file (read as UTF-8).
        id_offset: Value added to an object's position in the bundle's
            ``objects`` array to form its ID. Defaults to 799, the offset this
            notebook has always used.

    Returns:
        A two-element list ``[documents, ids]``. ``ids[k]`` is the string ID of
        ``documents[k]``. IDs are derived from the index within *all* bundle
        objects (not just the kept techniques), so they are unique but not
        guaranteed to be contiguous.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """

    print("Loading MITRE ATT&CK data...")

    # Specify encoding='utf-8' to handle potential unicode characters
    with open(attack_data_path, "r", encoding="utf-8") as f:
        attack_data = json.load(f)

    documents = []
    ids = []

    for i, technique in enumerate(attack_data.get("objects", [])):
        if technique.get("type") == "attack-pattern":

            # `or [{}]` covers both a missing key and a present-but-empty list;
            # indexing [0] on an empty list used to raise IndexError.
            references = technique.get("external_references") or [{}]
            # `or ""` guards against an explicit null external_id.
            technique_id = references[0].get("external_id") or ""

            if not technique_id.startswith("T"):
                continue

            name = technique.get("name", "")
            description = technique.get("description", "")
            # Skip phases that carry no phase_name instead of raising KeyError.
            tactics = [
                phase["phase_name"]
                for phase in technique.get("kill_chain_phases") or []
                if phase.get("phase_name")
            ]
            platforms = technique.get("x_mitre_platforms") or []
            detection = technique.get("x_mitre_detection", "")
            data_sources = technique.get("x_mitre_data_sources") or []

            name = clean_text(name)
            description = clean_text(description)
            detection = clean_text(detection)

            # NOTE: the template's leading indentation is part of page_content.
            # It is kept byte-for-byte so stored text/embeddings do not change.
            content = f"""
                # {technique_id}: {name}

                ## Description
                {description}

                ## Tactics
                {', '.join(tactics)}

                ## Platforms
                {', '.join(platforms)}

                ## Detection
                {detection}

                ## Data Sources
                {', '.join(data_sources)}
                """

            # Compute the ID once so the Document and the ids list cannot drift.
            doc_id = str(i + id_offset)

            doc = Document(
                page_content=content,
                metadata={
                    "type": "MITRE_ATTACK",
                    "technique_id": technique_id,
                    "name": name,
                    "tactics": tactics,
                },
                id=doc_id,
            )

            documents.append(doc)
            ids.append(doc_id)

    return [documents, ids]

In [24]:
from services.ollama_service import *

# Parse the ICS ATT&CK bundle into documents and their matching IDs.
documents, ids = initialize_attack_db("mitre_data/ics-attack.json")
print(f"Processed {len(documents)} MITRE ATT&CK techniques")

# Embed every parsed document; `embeddings` is consumed by the storage cell.
print("Generating embeddings...")
embeddings = generate_embeddings(documents)

Loading MITRE ATT&CK data...
Processed 95 MITRE ATT&CK techniques
Generating embeddings...


In [19]:
# Debug aid: dump the generated IDs. initialize_attack_db returns them as strings.
print(ids)

['893', '894', '895', '896', '897', '898', '899', '900', '901', '902', '903', '904', '905', '906', '907', '908', '909', '910', '911', '912', '913', '914', '915', '916', '917', '918', '919', '920', '921', '922', '923', '924', '925', '926', '927', '928', '929', '930', '931', '932', '933', '934', '935', '936', '937', '938', '939', '940', '941', '942', '943', '944', '945', '946', '947', '948', '949', '950', '951', '952', '953', '954', '955', '956', '957', '958', '959', '960', '961', '962', '963', '964', '965', '966', '967', '968', '969', '970', '971', '972', '973', '974', '975', '976', '977', '978', '979', '980', '981', '982', '983', '984', '985', '986', '987']


In [25]:
from services.qdrant_service import *
# Qdrant connection settings (QDRANT_URL, QDRANT_API_KEY) are read from the environment below.
from qdrant_client import QdrantClient

# Connection settings come from the environment so no credentials live in the notebook.
qdrant_url = os.environ.get("QDRANT_URL")
qdrant_api_key = os.environ.get("QDRANT_API_KEY")

# Fail fast with a clear message instead of handing url=None to QdrantClient.
# The API key stays optional and is passed through unchanged (possibly None).
if not qdrant_url:
    raise RuntimeError("QDRANT_URL environment variable is not set")

qdrant_client = QdrantClient(
    url=qdrant_url,
    api_key=qdrant_api_key,
)

# Store in Qdrant
print("Storing in Qdrant...")

# The IDs were generated as strings; convert them to integers before storing.
# int() also accepts ints, so re-running this cell is harmless.
ids = [int(x) for x in ids]  # Convert IDs to integers

store_in_qdrant(documents, embeddings, ids, qdrant_client, 'mitre-attack-nomic')

print("MITRE ATT&CK vector database initialized successfully!")

Storing in Qdrant...
Successfully stored 95 MITRE ATT&CK techniques in Qdrant
MITRE ATT&CK vector database initialized successfully!
