In [19]:
!pip freeze

aiofiles==24.1.0
aiohappyeyeballs==2.4.4
aiohttp==3.11.11
aiosignal==1.3.2
albucore==0.0.13
albumentations==1.4.10
annotated-types==0.7.0
anyascii==0.3.2
anyio==4.7.0
astor==0.8.1
asttokens==3.0.0
attrs==25.1.0
babel==2.17.0
backoff==2.2.1
bcrypt==5.0.0
beautifulsoup4==4.13.1
bibtexparser==2.0.0b8
black==25.1.0
Brotli==1.1.0
build==1.3.0
cachetools==5.5.2
cattrs==24.1.2
cerebras_cloud_sdk==1.50.1
certifi==2024.12.14
cffi==1.17.1
charset-normalizer==3.4.1
chromadb==1.3.4
click==8.1.8
coloredlogs==15.0.1
comm==0.2.2
contourpy==1.3.1
coremltools==8.2
courlan==1.3.2
cssselect2==0.7.0
cycler==0.12.1
Cython==3.0.12
dataclasses-json==0.6.7
dateparser==1.2.0
debugpy==1.8.11
decorator==5.1.1
defusedxml==0.7.1
distro==1.9.0
durationpy==0.10
executing==2.1.0
filelock==3.17.0
fire==0.7.0
flatbuffers==25.2.10
fonttools==4.56.0
frozenlist==1.5.0
fsspec==2025.2.0
git-filter-repo==2.47.0
google-api-core==2.24.2
google-auth==2.40.1
google-cloud-core==2.4.3
google-cloud-storage==3.1.0
google-cloud-visio

In [4]:
import logging
import pickle
from pathlib import Path

import torch
from langchain_classic.retrievers import EnsembleRetriever
from langchain_classic.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_classic.retrievers.document_compressors import CrossEncoderReranker
from langchain_chroma import Chroma
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain_community.retrievers import BM25Retriever
from langchain_huggingface import HuggingFaceEmbeddings

In [5]:
# Root logging configuration for the notebook: INFO level, with each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger shared by the helper functions defined in later cells.
logger = logging.getLogger(__name__)

In [6]:
def load_split_documents(splits_path: Path) -> list:
    """Read the pickled, pre-split documents that feed the BM25 retriever.

    Args:
        splits_path: Location of the pickle file to read.

    Returns:
        Whatever object the pickle file contains (expected to be a list of
        documents).

    Note:
        Unpickling can execute arbitrary code, so only point this at files
        produced by a trusted source.
    """
    logger.info("Loading split documents from %s", splits_path)
    with splits_path.open("rb") as handle:
        documents = pickle.load(handle)
    return documents

In [7]:
def create_dense_retriever(
    persist_dir: Path,
    collection_name: str,
    model_name: str = "Alibaba-NLP/gte-modernbert-base",
    k: int = 10,
):
    """Open an existing on-disk Chroma collection and expose it as a retriever.

    Args:
        persist_dir: Directory holding the persisted Chroma store.
        collection_name: Name of the collection to open inside that store.
        model_name: Hugging Face model used to embed queries.
        k: Number of documents the retriever is asked to return per query.

    Returns:
        The retriever produced by ``vectorstore.as_retriever`` with
        ``search_kwargs={"k": k}``.

    Raises:
        FileNotFoundError: If ``persist_dir`` is not an existing directory.
    """
    # Fail fast on a wrong path: Chroma would otherwise initialise a brand-new,
    # empty store there and the retriever would silently return nothing.
    persist_dir = Path(persist_dir)
    if not persist_dir.is_dir():
        raise FileNotFoundError(f"Chroma persist directory not found: {persist_dir}")

    logger.info("Loading Chroma vectorstore from %s", persist_dir)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        # NOTE(review): trust_remote_code allows code shipped with the model
        # repository to run at load time; confirm it is required for this model.
        model_kwargs={"device": device, "trust_remote_code": True},
        encode_kwargs={"normalize_embeddings": True},
    )

    vectorstore = Chroma(
        collection_name=collection_name,
        embedding_function=embeddings,
        persist_directory=str(persist_dir),
    )

    # `_collection` is a private attribute of the LangChain wrapper; it is used
    # here only to report how many entries were loaded.
    doc_count = vectorstore._collection.count()
    logger.info("Vectorstore loaded with %d documents", doc_count)
    if doc_count == 0:
        logger.warning(
            "Collection %r in %s is empty; dense retrieval will return no results",
            collection_name,
            persist_dir,
        )
    return vectorstore.as_retriever(search_kwargs={"k": k})

In [8]:
def create_bm25_retriever(documents: list, k: int = 10) -> BM25Retriever:
    """Build a BM25 (sparse, keyword-based) retriever over ``documents``.

    Args:
        documents: Documents to index; must contain at least one item.
        k: Number of documents the retriever is asked to return per query.

    Returns:
        The retriever built by ``BM25Retriever.from_documents``.

    Raises:
        ValueError: If ``documents`` is empty.
    """
    # An empty corpus cannot be indexed; report that here instead of letting
    # the failure surface as an opaque error from inside the library.
    if len(documents) == 0:
        raise ValueError("Cannot create BM25 retriever from an empty document list")

    logger.info("Creating BM25 retriever from %d documents", len(documents))
    return BM25Retriever.from_documents(documents, k=k)

In [9]:
def create_hybrid_retriever(
    dense_retriever,
    sparse_retriever,
    weights: tuple[float, float] = (0.5, 0.5),
) -> EnsembleRetriever:
    """Fuse a dense and a sparse retriever into one weighted ensemble.

    Args:
        dense_retriever: Retriever placed first in the ensemble.
        sparse_retriever: Retriever placed second in the ensemble.
        weights: ``(dense, sparse)`` weights, passed to the ensemble in the
            same order as the retrievers.

    Returns:
        An ``EnsembleRetriever`` over ``[dense_retriever, sparse_retriever]``.
    """
    w_dense = weights[0]
    w_sparse = weights[1]
    logger.info("Creating hybrid retriever with weights: dense=%.2f, sparse=%.2f", w_dense, w_sparse)

    members = [dense_retriever, sparse_retriever]
    return EnsembleRetriever(retrievers=members, weights=list(weights))

In [12]:
def create_reranked_retriever(
    base_retriever,
    reranker_model: str = "BAAI/bge-reranker-v2-m3",
    top_n: int = 5,
) -> ContextualCompressionRetriever:
    """Put a cross-encoder reranking stage on top of ``base_retriever``.

    Args:
        base_retriever: Retriever whose results are handed to the reranker.
        reranker_model: Hugging Face cross-encoder model name.
        top_n: Passed to ``CrossEncoderReranker`` as ``top_n``.

    Returns:
        A ``ContextualCompressionRetriever`` combining the base retriever with
        the cross-encoder reranker.
    """
    logger.info("Creating reranker with model %s, top_n=%d", reranker_model, top_n)

    reranker = CrossEncoderReranker(
        model=HuggingFaceCrossEncoder(model_name=reranker_model),
        top_n=top_n,
    )
    return ContextualCompressionRetriever(
        base_retriever=base_retriever,
        base_compressor=reranker,
    )

In [10]:
def retrieve(retriever, query: str) -> list:
    """Run ``query`` through ``retriever`` and return what it yields.

    Args:
        retriever: Object exposing ``invoke(query)``.
        query: Query text; only its first 50 characters are written to the log.

    Returns:
        The result of ``retriever.invoke(query)``.
    """
    query_preview = query[:50]
    logger.info("Retrieving documents for query: %s", query_preview)
    documents = retriever.invoke(query)
    return documents

In [13]:
# --- Configuration -----------------------------------------------------------
chroma_dir = Path("./chroma_db")              # persisted Chroma store
splits_file = Path("./split_documents.pkl")   # pickled split documents for BM25
collection_name = "malpedia_vectors"

dense_k = 10         # documents requested from the dense retriever
sparse_k = 10        # documents requested from the BM25 retriever
rerank_top_n = 5     # documents kept by the reranking stage
dense_weight = 0.5   # ensemble weight for the dense retriever
sparse_weight = 0.5  # ensemble weight for the BM25 retriever
preview_chars = 200  # maximum characters of page content shown per result

if torch.cuda.is_available():
    # Let PyTorch pick faster float32 matmul kernels when a GPU is present.
    torch.set_float32_matmul_precision("high")
    logger.info("Using CUDA device")

# --- Build the pipeline: dense + sparse -> ensemble -> reranker ---------------
split_docs = load_split_documents(splits_file)

dense_retriever = create_dense_retriever(
    persist_dir=chroma_dir,
    collection_name=collection_name,
    k=dense_k,
)

sparse_retriever = create_bm25_retriever(split_docs, k=sparse_k)

hybrid_retriever = create_hybrid_retriever(
    dense_retriever,
    sparse_retriever,
    weights=(dense_weight, sparse_weight),
)

final_retriever = create_reranked_retriever(
    hybrid_retriever,
    top_n=rerank_top_n,
)

# --- Run one example query and show the results -------------------------------
query = "What techniques does Emotet use for persistence?"
results = retrieve(final_retriever, query)

print(f"\n{'='*80}")
print(f"Query: {query}")
print(f"Retrieved {len(results)} documents after reranking")
print(f"{'='*80}\n")

for i, doc in enumerate(results, 1):
    content = doc.page_content
    # Collapse runs of whitespace so embedded newlines do not break the indented
    # listing, and add the ellipsis only when the content was actually cut.
    preview = " ".join(content[:preview_chars].split())
    suffix = "..." if len(content) > preview_chars else ""

    print(f"[{i}] Source: {doc.metadata.get('title', 'Unknown')}")
    print(f"    URL: {doc.metadata.get('url', 'N/A')}")
    print(f"    Content preview: {preview}{suffix}")
    print()



2025-12-30 15:43:34,102 - INFO - Using CUDA device
2025-12-30 15:43:34,103 - INFO - Loading split documents from split_documents.pkl
2025-12-30 15:43:34,428 - INFO - Loading Chroma vectorstore from chroma_db
2025-12-30 15:43:34,429 - INFO - Load pretrained SentenceTransformer: Alibaba-NLP/gte-modernbert-base
2025-12-30 15:43:38,242 - INFO - Vectorstore loaded with 42799 documents
2025-12-30 15:43:38,243 - INFO - Creating BM25 retriever from 42799 documents
2025-12-30 15:43:41,196 - INFO - Creating hybrid retriever with weights: dense=0.50, sparse=0.50
2025-12-30 15:43:41,288 - INFO - Creating reranker with model BAAI/bge-reranker-v2-m3, top_n=5
2025-12-30 15:43:42,328 - INFO - Use pytorch device: cuda:0
2025-12-30 15:43:42,719 - INFO - Retrieving documents for query: What techniques does Emotet use for persistence?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: What techniques does Emotet use for persistence?
Retrieved 5 documents after reranking

[1] Source: {Emotet launches major new spam campaign}
    URL: https://www.welivesecurity.com/2018/11/09/emotet-launches-major-new-spam-campaign/
    Content preview: week after adding a new email content harvesting module, and following a period of low activity, the malicious actors behind Emotet have launched a new, large-scale spam campaign. What is Emotet
Emote...

[2] Source: {How to Respond to Emotet Infection (FAQ)}
    URL: https://blogs.jpcert.or.jp/en/2019/12/emotetfaq.html
    Content preview: 3. Check auto-start settings
Emotet has several methods for maintaining persistence such as setting auto-start registry keys, saving the payload into Startup folder, etc. Check the following settings ...

[3] Source: {Awaiting the Inevitable Return of Emotet}
    URL: https://www.hornetsecurity.com/en/security-information/awaiting-the-inevitable-return-of-emotet/
    Content preview: Please 