In [2]:

# Multi-Level RAG with dataset-aware routing

import os
import time
import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
import hdbscan
import pickle

DATASET_FOLDER = "/workspace/FCAPS/dataset/Cleaned datasets"
RAG_FOLDER = "/workspace/FCAPS/RAG"
os.makedirs(RAG_FOLDER, exist_ok=True)

# Load embedding model
def select_embedding_model(model_name):
    """Build a SentenceTransformer for a model in the sentence-transformers namespace.

    Args:
        model_name: Short model name (e.g. "all-MiniLM-L6-v2"); it is prefixed
            with "sentence-transformers/" to form the identifier that is
            handed to ``SentenceTransformer``.

    Returns:
        The ``SentenceTransformer`` instance constructed from that identifier.
    """
    model_id = f"sentence-transformers/{model_name}"
    return SentenceTransformer(model_id)

model = select_embedding_model("all-MiniLM-L6-v2")
# Ask the model for its output width instead of hard-coding it, so swapping in
# a different embedding model cannot leave the FAISS index with a mismatched
# dimension (the value is 384 for all-MiniLM-L6-v2).
embedding_dim = model.get_sentence_embedding_dimension()
# HNSW graph index over the raw vectors; 32 is the HNSW connectivity parameter (M).
index = faiss.IndexHNSWFlat(embedding_dim, 32)

# Accumulators filled by the ingest loop:
metadata = []        # one (dataset file, FAISS row id, dataset label) tuple per log
embeddings_all = []  # one embedding matrix per dataset, stacked later for clustering
log_mapping = {}     # FAISS row id -> (dataset file, row within file, log text)

# Dataset-level encoding
label_encoder = LabelEncoder()
# os.listdir returns entries in arbitrary order; sorting keeps the ingest order,
# and therefore the FAISS row ids written to disk, reproducible across runs.
dataset_names = sorted(f for f in os.listdir(DATASET_FOLDER) if f.endswith(".csv"))
dataset_labels = label_encoder.fit_transform(dataset_names)
dataset_name_to_label = dict(zip(dataset_names, dataset_labels))

# FAISS row id that the first log of the next dataset will receive.
start_idx = 0

# Ingest logs and encode
for file in dataset_names:
    dataset_path = os.path.join(DATASET_FOLDER, file)
    df = pd.read_csv(dataset_path)
    log_texts = df['Content'].astype(str).tolist()

    if not log_texts:
        # Nothing to embed: skip so the encode/normalise/add steps below never
        # receive an empty batch.
        print(f"Skipped {file}: no logs")
        continue

    embeddings = model.encode(log_texts, convert_to_numpy=True)
    # Scale every row to unit length. The norm is clamped away from zero so an
    # all-zero embedding stays zero instead of turning into NaNs, which would
    # otherwise end up in both the FAISS index and the clustering input.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    embeddings /= np.maximum(norms, 1e-12)

    index.add(embeddings)
    embeddings_all.append(embeddings)

    # The label is the same for every row of this file; look it up once.
    label = dataset_name_to_label[file]
    for i, log in enumerate(log_texts):
        # Rows are added to the index in order, so the global row id is the
        # running offset plus the position within this file.
        faiss_idx = start_idx + i
        log_mapping[faiss_idx] = (file, i, log)
        metadata.append((file, faiss_idx, label))

    start_idx += len(log_texts)
    print(f"Processed {file}: {len(log_texts)} logs")

# Save FAISS index
faiss.write_index(index, os.path.join(RAG_FOLDER, "logs.index"))
# Write the row-id -> (dataset, row, text) mapping inside a context manager so
# the file handle is flushed and closed even if pickling fails part-way.
with open(os.path.join(RAG_FOLDER, "log_mapping.pkl"), "wb") as mapping_file:
    pickle.dump(log_mapping, mapping_file)

# Save metadata: one row per ingested log, in the order the logs were collected
metadata_df = pd.DataFrame(metadata, columns=["dataset", "faiss_index", "dataset_id"])
metadata_df.to_csv(os.path.join(RAG_FOLDER, "metadata.csv"), index=False)

# Cluster logs using HDBSCAN
if not embeddings_all:
    # np.vstack would raise the same exception type with a less helpful message.
    raise ValueError("No embeddings were ingested; nothing to cluster.")
all_embeddings = np.vstack(embeddings_all)
clusterer = hdbscan.HDBSCAN(min_cluster_size=50, metric="euclidean")
cluster_labels = clusterer.fit_predict(all_embeddings)
# HDBSCAN assigns the label -1 to points it treats as noise; keep only the
# points that were placed in a real cluster for the centroid computation.
valid_mask = cluster_labels != -1
valid_embeddings = all_embeddings[valid_mask]
valid_clusters = cluster_labels[valid_mask]

# Save cluster metadata (noise rows keep their -1 label in the CSV).
# The slice trims metadata_df to the number of labels before the column is
# attached; it is a no-op when both already have the same length.
metadata_df = metadata_df.iloc[:len(cluster_labels)].copy()
metadata_df['cluster'] = cluster_labels
metadata_df.to_csv(os.path.join(RAG_FOLDER, "metadata_with_clusters.csv"), index=False)

# Compute and save centroids: row k of the saved array is the mean embedding
# of the k-th cluster id in ascending order (np.unique returns sorted ids).
centroids = [
    valid_embeddings[valid_clusters == cluster_id].mean(axis=0)
    for cluster_id in np.unique(valid_clusters)
]
if centroids:
    centroid_matrix = np.vstack(centroids)
    centroid_message = "Cluster centroids computed and saved."
else:
    # Every point was labelled noise. np.vstack([]) would raise, so persist an
    # empty (0, dim) array instead of crashing after the clustering work is done.
    centroid_matrix = np.empty((0, all_embeddings.shape[1]), dtype=all_embeddings.dtype)
    centroid_message = "No clusters found; saved an empty centroid array."
np.save(os.path.join(RAG_FOLDER, "cluster_centroids.npy"), centroid_matrix)
print(centroid_message)


Processed Android_2k_cleaned.csv: 2000 logs
Processed Hadoop_2k_cleaned.csv: 2000 logs
Processed HDFS_2k_cleaned.csv: 2000 logs
Processed Linux_2k_cleaned.csv: 1849 logs
Processed Openstack_2k_cleaned.csv: 2000 logs




Cluster centroids computed and saved.
