In [4]:
# ===============================
# PHASE 3 — UNSUPERVISED MODEL (Isolation Forest)
# Detect unknown anomalies
# ===============================

import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
import joblib

# Directories this cell reads from and writes to (relative to the notebook's
# working directory). Keeping them in one place guarantees that the
# directories we create are the same ones we later save into.
PROCESSED_DIR = "../data/processed"
MODELS_DIR = "../models"

# Load processed datasets
full_data = pd.read_csv(os.path.join(PROCESSED_DIR, "full_graph_data.csv"))
normal_data = pd.read_csv(os.path.join(PROCESSED_DIR, "normal_data.csv"))

# Extract features only: drop the transaction identifier and the label
# columns so the model never sees them as inputs.
NON_FEATURE_COLUMNS = ["txId", "class", "binary_label"]
X_normal = normal_data.drop(NON_FEATURE_COLUMNS, axis=1)
X_full = full_data.drop(NON_FEATURE_COLUMNS, axis=1)

# Standardize features. The scaler is fit on the normal-only data and then
# reused (transform only) on the full dataset, so both are expressed with the
# same per-feature statistics the model was trained on.
scaler = StandardScaler()
X_normal_scaled = scaler.fit_transform(X_normal)
X_full_scaled = scaler.transform(X_full)

# Train Isolation Forest
iso = IsolationForest(
    n_estimators=350,
    contamination=0.05,
    random_state=42,  # fixed seed for reproducible trees
    n_jobs=-1,        # use all available cores
)

print("Training Isolation Forest on normal-only data...")
iso.fit(X_normal_scaled)

# Predict anomaly scores on full dataset.
# decision_function is negated so that higher = more anomalous.
anomaly_scores = -iso.decision_function(X_full_scaled)
full_data["anomaly_score"] = anomaly_scores

# Save predictions for fusion.
# Create the directories that are actually written to below; previously the
# directories were created relative to the current folder ("models",
# "data/processed") while the files were saved under "../", so the saves could
# fail on a fresh checkout and stray empty folders were left behind.
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(PROCESSED_DIR, exist_ok=True)

full_data.to_csv(os.path.join(PROCESSED_DIR, "if_predictions.csv"), index=False)
joblib.dump(iso, os.path.join(MODELS_DIR, "isolation_forest_model.pkl"))
joblib.dump(scaler, os.path.join(MODELS_DIR, "if_scaler.pkl"))

print("\nIsolation Forest Done!")
print("Scores saved in data/processed/if_predictions.csv")
print("Model saved in models/isolation_forest_model.pkl")


Training Isolation Forest on normal-only data...

Isolation Forest Done!
Scores saved in data/processed/if_predictions.csv
Model saved in models/isolation_forest_model.pkl
