In [None]:
import json
import pandas as pd
from tqdm import tqdm
from io import StringIO
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from google.cloud import storage

# --- Configuration ---------------------------------------------------------
# Where the input lives in Google Cloud Storage.
bucket_name = "arize_rag"  # replace with the bucket you actually use
folder_name = "netflow_data/"  # prefix inside the bucket that holds the CSV files

# Local output names.
output_faiss_dir = "faiss_network_logs"
OUTPUT_FILE = "processed_network_logs.json"

# Column names assigned to the CSV fields, in file order.
COLUMNS = [
    "Time",
    "Duration",
    "SrcDevice",
    "DstDevice",
    "Protocol",
    "SrcPort",
    "DstPort",
    "SrcPackets",
    "DstPackets",
    "SrcBytes",
    "DstBytes",
]

# GCS client, created with default arguments.
storage_client = storage.Client()

# Locate the single CSV object to process; other objects under the prefix
# are intentionally ignored.
MAX_ROWS = 100_000  # cap on the number of rows read from the CSV

bucket = storage_client.bucket(bucket_name)
csv_file = f"{folder_name}netflow_day-02.csv"  # Only process this file
blob = bucket.blob(csv_file)

print(f"📥 Downloading {csv_file} from GCS...")

# Stream the object instead of downloading it whole: only MAX_ROWS rows are
# needed, so pandas can stop reading early and the full file never has to be
# held in memory. The encoding is given explicitly because text-mode streams
# otherwise fall back to the platform default.
# `names=COLUMNS` means the file is read as having no header row.
with blob.open("r", encoding="utf-8") as csv_stream:
    df = pd.read_csv(csv_stream, names=COLUMNS, nrows=MAX_ROWS)
print(f"Loaded DataFrame with {len(df)} rows (limited to 100K)")

# Names for the protocol numbers the summary sentence distinguishes;
# every other value is reported as "other".
PROTOCOL_NAMES = {6: "TCP", 17: "UDP"}


def describe_flow(record):
    """Render one flow record as an English sentence.

    Args:
        record: mapping from the names in COLUMNS to that row's values.

    Returns:
        A one-sentence description of the flow (source, destination,
        ports, packet and byte counts in each direction).
    """
    protocol = PROTOCOL_NAMES.get(record["Protocol"], "other")
    return (
        f"At time {record['Time']}, device {record['SrcDevice']} initiated a "
        f"{protocol} connection "
        f"to {record['DstDevice']} on port {record['DstPort']}. "
        f"The source used port {record['SrcPort']} and sent {record['SrcPackets']} packets "
        f"({record['SrcBytes']} bytes), while the destination responded with {record['DstPackets']} packets "
        f"({record['DstBytes']} bytes)."
    )


# Convert each row into a structured JSON entry.
# `to_dict(orient="records")` yields one plain dict per row and keeps each
# column's own dtype, whereas `iterrows()` builds a Series per row (slow) and
# does not preserve dtypes across the row.
all_logs = [
    {"text": describe_flow(record), "metadata": record}
    for record in df.to_dict(orient="records")
]

# Save processed logs to JSON
with open(OUTPUT_FILE, "w") as f:
    json.dump(all_logs, f, indent=4)

print(f"Processed {len(all_logs)} log entries and saved to {OUTPUT_FILE}.")


In [None]:
import faiss
import numpy as np
import os
# Reload the log entries that the preprocessing cell wrote to disk.
with open(OUTPUT_FILE) as log_file:
    log_entries = json.load(log_file)

# An OpenAI API key must be configured before this cell runs
# (presumably via the OPENAI_API_KEY environment variable -- confirm for your setup).
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Normalize embeddings (important for cosine similarity search)
def normalize_embeddings(embeddings):
    """Scale every row of a 2-D array to unit L2 norm.

    Args:
        embeddings: 2-D array-like with one vector per row.

    Returns:
        numpy.ndarray of the same shape in which each non-zero row has been
        divided by its L2 norm. All-zero rows are returned as zeros rather
        than NaN.
    """
    vectors = np.asarray(embeddings)
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # A zero vector has norm 0; dividing by it would produce NaN rows (and a
    # RuntimeWarning). Dividing by 1 instead leaves such rows as zeros.
    norms[norms == 0] = 1
    return vectors / norms

# Convert text logs to embeddings
texts = [entry["text"] for entry in log_entries]
if not texts:
    # Fail early with a clear message instead of an obscure shape error below.
    raise ValueError(f"No log entries found in {OUTPUT_FILE}; nothing to index.")
raw_embeddings = embeddings.embed_documents(texts)

# FAISS operates on C-contiguous float32 matrices, so convert explicitly
# rather than handing it the float64 array built from Python floats.
normalized_embeddings = np.ascontiguousarray(
    normalize_embeddings(np.array(raw_embeddings)), dtype=np.float32
)

# Define optimized FAISS index (HNSW for large-scale search).
# IndexHNSWFlat is built here with its default metric; on unit-length vectors
# an L2 ranking matches a cosine-similarity ranking, which is why the vectors
# are normalized first.
d = normalized_embeddings.shape[1]  # Dimensionality
index = faiss.IndexHNSWFlat(d, 32)  # 32 = HNSW "M" (links per node)

# Tune HNSW parameters for speed vs. accuracy.
index.hnsw.efSearch = 64  # Higher = more accurate, lower = faster
# efConstruction affects graph building, so it is set before add().
index.hnsw.efConstruction = 200  # Balances recall vs. indexing time

# Add embeddings to FAISS index
index.add(normalized_embeddings)

# Save FAISS index
index_path = "faiss_index"
faiss.write_index(index, index_path)

print("FAISS index saved locally. Uploading to GCS...")

# Upload the index to gs://<bucket_name>/faiss_index with the storage client.
# Unlike `os.system("gsutil cp ...")`, whose exit status was ignored, this
# raises if the upload fails, so the success message below is only printed
# when the object was actually written.
storage_client.bucket(bucket_name).blob(index_path).upload_from_filename(index_path)

print("FAISS index uploaded successfully!")


In [None]:

# Read the persisted index back from local disk.
index = faiss.read_index("faiss_index")

# Report how many vectors the reloaded index holds.
stored_vectors = index.ntotal
print(f"FAISS Index Loaded - {stored_vectors} vectors stored")