# 🧭 Project Title

## Zero-Trust Anomaly Detection in Authentication Logs

### 💡 Objective

Detect suspicious login behaviors (e.g., off-hours logins, impossible travel, unusual resource access) using a combination of unsupervised and semi-supervised ML models such as:

Isolation Forest

Autoencoder (deep learning)

One-Class SVM

In [1]:
# Cell 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.decomposition import PCA

import tensorflow as tf
from tensorflow.keras import layers, models

import warnings
warnings.filterwarnings('ignore')


ImportError: cannot import name 'runtime_version' from 'google.protobuf' (/opt/anaconda3/envs/llms/lib/python3.11/site-packages/google/protobuf/__init__.py)

In [None]:
# Load the raw authentication-log export into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running this notebook on another machine.
df = pd.read_csv(
    "/Users/sheltonsimbi/projects/joan-testing/backend/data/auth_logs_raw.csv"
)

# Preview the first rows (rendered as the cell output).
df.head()

In [None]:
# List the distinct raw event labels present in the log data.
df['event_label'].unique()

In [None]:
# ------------------------------------------------------
# 🧠 Apply Zero-Trust Labeling
# ------------------------------------------------------
# Under Zero-Trust only events labelled "normal" are trusted; every other
# label is collapsed into the anomaly class (1 = anomaly, 0 = normal).
ZERO_TRUST_NORMAL_LABEL = "normal"

# Every distinct label except the trusted one, sorted, for reporting only.
zero_trust_anomaly_labels = sorted(
    set(df["event_label"].unique()) - {ZERO_TRUST_NORMAL_LABEL}
)

# Binary target column: True (-> 1) wherever the label is not the trusted one.
df["binary_label"] = df["event_label"].ne(ZERO_TRUST_NORMAL_LABEL).astype(int)

print("Zero-Trust anomaly labels:", zero_trust_anomaly_labels)
print(df["binary_label"].value_counts())
print("\n‚úÖ Zero-Trust labels applied: 0 = Normal, 1 = Anomaly")


In [None]:
# Cell 3: Quick overview
# Print column dtypes and non-null counts.
df.info()
# Summary statistics for every column, numeric and non-numeric alike
# (this last expression is what the cell renders).
df.describe(include='all')


In [None]:
# Show the distinct values present in the binary_label column.
df['binary_label'].unique()

In [None]:
# Show the DataFrame dimensions as (rows, columns).
df.shape

# EDA

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Cell 5: Event label distribution
# Bar chart of the row count per binary_label class.
# NOTE(review): the title says "Event Labels" but the plotted column is the
# binary 0/1 label, not the raw event_label.
plt.figure(figsize=(12, 6))
df['binary_label'].value_counts().plot.bar(color='steelblue')
plt.title('Distribution of Event Labels')
plt.xticks(rotation=75)
plt.show()


In [None]:
# Cell 6: Bytes transferred distribution
# Histogram (50 bins) with a KDE overlay of the bytes_transferred column.
bytes_ax = sns.histplot(df['bytes_transferred'], bins=50, kde=True)
bytes_ax.set_title('Distribution of Bytes Transferred')
plt.show()


In [None]:
# Cell 7: Access time distribution
# Parse access_time as HH:MM:SS and keep only the hour as a new `hour` column
# (it is used as a model feature later on).
parsed_access_time = pd.to_datetime(df['access_time'], format='%H:%M:%S')
df['hour'] = parsed_access_time.dt.hour

# One bin per hour of the day.
hour_ax = sns.histplot(df['hour'], bins=24, kde=False)
hour_ax.set_title('Login Hour Distribution')
hour_ax.set_xlabel('Hour of Day')
plt.show()


# ⚙️ 4. Feature Engineering

We’ll extract numeric and encoded features for modeling.

In [None]:
# Cell 8: Encode categorical columns
# Each column gets its OWN fitted LabelEncoder, stored in `encoders` keyed by
# column name, so every string -> integer mapping stays available for
# inverse_transform or for encoding new data. Re-fitting a single shared
# encoder per column would keep only the mapping of the last column.
categorical_cols = ['user_id', 'device_id', 'ip_address', 'location', 'resource_accessed']
encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    # Replace the raw values in place with their integer codes.
    df[col] = encoder.fit_transform(df[col])
    encoders[col] = encoder


In [None]:
# Cell 9: Expose the binary target under the name used for modelling.
# binary_label already holds normal=0 / anomaly=1 (set in the Zero-Trust
# labelling cell), so this is a plain column copy; no conversion happens here.
df['is_anomaly'] = df['binary_label']


In [None]:
# Cell 10: Final feature matrix
# Model inputs: the five label-encoded categorical columns plus login_success,
# the derived hour, and bytes_transferred. The order of this list is the
# column order of X.
features = ['user_id', 'device_id', 'ip_address', 'location', 
            'login_success', 'hour', 'resource_accessed', 'bytes_transferred']
X = df[features]
# Target vector: 1 = anomaly, 0 = normal.
y = df['is_anomaly']


In [None]:
# Cell 11: Scale features
# Standardise the feature columns with StandardScaler.
# NOTE(review): the scaler is fitted on all rows of X, before the train/test
# split in the next cell, so test rows contribute to the fitted statistics —
# confirm this is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
# Cell 12: Split data (stratified, because anomalies are rare)
# Split the UNSCALED features and fit the scaler on the training rows only, so
# no test-set statistics leak into preprocessing. test_size, random_state and
# stratification are unchanged, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Re-fit the shared `scaler` on training data only; this is the object that is
# persisted for inference later in the notebook.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# The one-class models are trained on trusted (label 0) rows only.
X_train_normal = X_train[(y_train == 0).to_numpy()]

# Observed anomaly rate in the training split, clamped to [1e-3, 0.49].
expected_contamination = min(max(y_train.mean(), 1e-3), 0.49)


# 🧠 6. Model 1 — Isolation Forest

In [None]:
# Cell 13: Isolation Forest (Zero-Trust friendly)
# Fit on trusted rows only; contamination is the anomaly rate estimated from
# the training labels.
iso = IsolationForest(
    contamination=expected_contamination,
    random_state=42,
).fit(X_train_normal)

# predict() marks anomalies with -1; remap so that 1 = anomaly, 0 = normal.
y_pred_iso = (iso.predict(X_test) == -1).astype(int)


In [None]:
# Cell 14: Evaluation
# Per-class precision/recall/F1 for the Isolation Forest predictions.
print("Isolation Forest Results:")
iso_report = classification_report(
    y_test, y_pred_iso, target_names=["normal", "anomaly"]
)
print(iso_report)

# Confusion matrix rendered as an annotated heatmap (integer cell counts).
iso_cm = confusion_matrix(y_test, y_pred_iso)
iso_ax = sns.heatmap(iso_cm, annot=True, fmt='d', cmap='Blues')
iso_ax.set_title("Isolation Forest Confusion Matrix")
plt.show()


# 🤖 7. Model 2 — One-Class SVM

In [None]:
# Cell 15: One-Class SVM
# nu is derived from the estimated anomaly rate, clamped to [0.01, 0.5].
svm_nu = max(expected_contamination, 0.01)
svm_nu = min(svm_nu, 0.5)

# Train on normal data only.
svm = OneClassSVM(kernel='rbf', gamma=0.001, nu=svm_nu).fit(X_train_normal)

# predict() marks anomalies with -1; remap so that 1 = anomaly, 0 = normal.
y_pred_svm = (svm.predict(X_test) == -1).astype(int)

print("One-Class SVM Results:")
svm_report = classification_report(
    y_test, y_pred_svm, target_names=["normal", "anomaly"]
)
print(svm_report)

svm_cm = confusion_matrix(y_test, y_pred_svm)
svm_ax = sns.heatmap(svm_cm, annot=True, fmt='d', cmap='Greens')
svm_ax.set_title("One-Class SVM Confusion Matrix")
plt.show()


# 🧬 8. Model 3 — Autoencoder (Deep Learning)

In [None]:
# Cell 16: Autoencoder
# Symmetric dense autoencoder: input -> 8 -> encoding_dim -> 8 -> input.
input_dim = X_train.shape[1]
encoding_dim = 4

autoencoder_layers = [
    layers.Input(shape=(input_dim,)),
    layers.Dense(8, activation='relu'),             # encoder hidden layer
    layers.Dense(encoding_dim, activation='relu'),  # bottleneck
    layers.Dense(8, activation='relu'),             # decoder hidden layer
    layers.Dense(input_dim, activation='linear'),   # reconstruction of the input
]
# `models` here is the tensorflow.keras.models module imported in Cell 1.
autoencoder = models.Sequential(autoencoder_layers)

# Mean squared reconstruction error is the training objective.
autoencoder.compile(optimizer='adam', loss='mse')

# Train to reproduce normal rows only (input and target are the same array);
# 10% of those rows are held out by Keras for validation.
history = autoencoder.fit(
    x=X_train_normal,
    y=X_train_normal,
    epochs=20,
    batch_size=32,
    validation_split=0.1,
    verbose=1,
)


In [None]:
# Cell 17: Reconstruction error and threshold
# Per-row mean squared reconstruction error on the test split; this is the
# autoencoder's anomaly score (larger = reconstructed worse).
reconstructions = autoencoder.predict(X_test)
mse = np.mean(np.power(X_test - reconstructions, 2), axis=1)

# Calibrate the cut-off on the trusted TRAINING rows rather than on the test
# split. A percentile of the test errors would flag a fixed 5% of test rows no
# matter what they contain, and would let the evaluation data choose its own
# threshold.
train_reconstructions = autoencoder.predict(X_train_normal)
train_mse = np.mean(np.power(X_train_normal - train_reconstructions, 2), axis=1)
threshold = np.percentile(train_mse, 95)

# 1 = anomaly (error above the calibrated cut-off), 0 = normal.
y_pred_auto = np.where(mse > threshold, 1, 0)

print("Autoencoder Results:")
print(classification_report(y_test, y_pred_auto, target_names=["normal", "anomaly"]))
sns.heatmap(confusion_matrix(y_test, y_pred_auto), annot=True, fmt='d', cmap='Oranges')
plt.title("Autoencoder Confusion Matrix")
plt.show()


In [None]:
# Cell 18: Compare models
# Named `model_names` rather than `models` so the tensorflow.keras `models`
# module imported in Cell 1 is not shadowed; otherwise re-running the
# autoencoder cell after this one fails on `models.Sequential`.
model_names = ['Isolation Forest', 'One-Class SVM', 'Autoencoder']

# ROC-AUC is computed from continuous scores where larger = more anomalous;
# hard 0/1 predictions would reduce each curve to a single operating point.
# decision_function() is lower for more abnormal samples, hence the negation.
# For the autoencoder the reconstruction error `mse` is already such a score.
aucs = [
    roc_auc_score(y_test, -iso.decision_function(X_test)),
    roc_auc_score(y_test, -svm.decision_function(X_test)),
    roc_auc_score(y_test, mse),
]

plt.bar(model_names, aucs, color=['steelblue','seagreen','orange'])
plt.title("Model AUC Comparison")
plt.ylabel("ROC-AUC Score")
plt.show()


# 🧩 19. Fine-Tuning Hyperparameters

We’ll perform grid-search-style tuning for contamination, nu, and the Autoencoder’s threshold percentile.

In [None]:
# Cell 19: Fine-tuning IsolationForest, OneClassSVM, and Autoencoder threshold

from sklearn.metrics import f1_score

def evaluate_models(contamination_vals, nu_vals, threshold_percents):
    """Score every candidate setting by F1 on the test split.

    Reads the notebook-level X_train_normal, X_test, y_test and the already
    trained `autoencoder`; nothing is passed in besides the candidate grids.

    Args:
        contamination_vals: IsolationForest `contamination` values to try.
        nu_vals: OneClassSVM `nu` values to try (rbf kernel, gamma='scale').
        threshold_percents: percentiles of the test reconstruction error to
            try as the autoencoder cut-off.

    Returns:
        DataFrame with columns "Model_Param" and "F1_Score", one row per
        candidate, sorted by F1_Score in descending order.

    NOTE(review): every candidate is scored on the test split itself, so the
    best F1 reported here is an optimistic estimate.
    """

    def _outlier_flags(estimator):
        # Fitted detectors predict -1 for outliers; map to 1 = anomaly, 0 = normal.
        return np.where(estimator.predict(X_test) == -1, 1, 0)

    scores = {}

    # Isolation Forest: one model per contamination value.
    for contamination in contamination_vals:
        forest = IsolationForest(contamination=contamination, random_state=42)
        forest.fit(X_train_normal)
        scores[f"IsolationForest_c={contamination}"] = f1_score(
            y_test, _outlier_flags(forest)
        )

    # One-Class SVM: one model per nu value.
    for nu in nu_vals:
        one_class = OneClassSVM(kernel='rbf', gamma='scale', nu=nu)
        one_class.fit(X_train_normal)
        scores[f"OneClassSVM_nu={nu}"] = f1_score(
            y_test, _outlier_flags(one_class)
        )

    # Autoencoder: reconstruct once, then sweep only the percentile cut-off.
    recon_error = np.mean(
        np.power(X_test - autoencoder.predict(X_test), 2), axis=1
    )
    for pct in threshold_percents:
        cutoff = np.percentile(recon_error, pct)
        flagged = np.where(recon_error > cutoff, 1, 0)
        scores[f"Autoencoder_thr={pct}"] = f1_score(y_test, flagged)

    table = pd.DataFrame(list(scores.items()), columns=["Model_Param", "F1_Score"])
    return table.sort_values("F1_Score", ascending=False)

# Run the sweep over three candidate values per detector.
tune_results = evaluate_models(
    contamination_vals=[0.02, 0.05, 0.1],
    nu_vals=[0.01, 0.05, 0.1],
    threshold_percents=[90, 95, 99]
)

# Display the table; evaluate_models returns it sorted by F1, best first.
tune_results


# 🔍 20. SHAP-based Feature Importance (Isolation Forest)

This helps visualize which features drive anomaly decisions.

In [None]:
# --- Recreate trained Isolation Forest model ---
# Re-import kept so this cell can be run on its own.
from sklearn.ensemble import IsolationForest

# Same settings as the Cell 13 model. `iso_final` is the instance explained
# with SHAP and saved to disk in the cells that follow.
iso_final = IsolationForest(contamination=expected_contamination, random_state=42)
# Fit on normal rows only; as the last expression, the fitted estimator is
# also rendered as the cell output.
iso_final.fit(X_train_normal)


In [None]:
# --- SHAP Feature Importance for Isolation Forest ---
import shap
import matplotlib.pyplot as plt

# Cap both the background set and the explained rows at 500 samples.
sample_size = min(500, len(X_train_normal))
background = X_train_normal[:sample_size]
explain_rows = X_test[:sample_size]

# Explain the continuous anomaly score (decision_function: lower = more
# abnormal) with the model-agnostic KernelExplainer.
# NOTE(review): KernelExplainer cost grows with the background size; consider
# summarising the background if this cell is too slow.
explainer = shap.KernelExplainer(iso_final.decision_function, background)
shap_values = explainer.shap_values(explain_rows)
# Some SHAP versions return a list of arrays; keep the first entry.
if isinstance(shap_values, list):
    shap_values = shap_values[0]

# Summarize feature importance.
# show=False keeps the figure open so the title below is applied to the SHAP
# plot itself; by default summary_plot shows the figure before returning and
# the title would end up on a new, empty figure.
shap.summary_plot(shap_values, explain_rows, feature_names=features, show=False)
plt.title("SHAP Feature Importance - Isolation Forest")
plt.show()


# ⚙️ 21. REST API Deployment with FastAPI + Kafka (Simulation)

Below is a lightweight FastAPI service that exposes /predict for real-time anomaly scoring.

In [None]:
# Cell 21: FastAPI REST endpoint (run separately with uvicorn)

from fastapi import FastAPI, HTTPException
import joblib
import numpy as np

app = FastAPI(title="Zero-Trust Anomaly Detector")

# save models
joblib.dump(iso_final, "isoforest_model.pkl")
joblib.dump(scaler, "scaler.pkl")

# Load the persisted artifacts ONCE at import time instead of on every
# request; deserialising them per call adds disk I/O to each prediction.
# NOTE: joblib files are pickles — only load artifacts produced by this project.
_api_model = joblib.load("isoforest_model.pkl")
_api_scaler = joblib.load("scaler.pkl")


@app.post("/predict")
def predict(data: dict):
    """Classify a single event as "anomaly" or "normal".

    Args:
        data: JSON body with a "features" list of numeric values, in the
            column order the scaler was fitted on.

    Returns:
        {"prediction": "anomaly"} or {"prediction": "normal"}.

    Raises:
        HTTPException: 400 when "features" is missing, not numeric, or has
            the wrong number of values.
    """
    try:
        x = np.asarray(data["features"], dtype=float).reshape(1, -1)
    except (KeyError, TypeError, ValueError) as exc:
        raise HTTPException(
            status_code=400,
            detail="Body must contain a numeric 'features' list.",
        ) from exc

    # Reject wrong-length vectors up front with a clear client error.
    expected = _api_scaler.n_features_in_
    if x.shape[1] != expected:
        raise HTTPException(
            status_code=400,
            detail=f"Expected {expected} features, got {x.shape[1]}.",
        )

    x_scaled = _api_scaler.transform(x)
    pred = _api_model.predict(x_scaled)
    # The model predicts -1 for anomalies.
    result = "anomaly" if pred[0] == -1 else "normal"
    return {"prediction": result}



# 📊 23. Zero-Trust Dashboard (Streamlit)

This adds an interactive monitoring dashboard.

Create a new file dashboard/app.py:

In [None]:
# Cell 23 (dashboard/app.py)
import streamlit as st
import pandas as pd
import joblib
import numpy as np

# NOTE(review): the emoji at the start of this title looks mis-encoded in the
# source; confirm the intended character.
st.title("üîí Zero-Trust Anomaly Detection Dashboard")

# This script runs as a standalone Streamlit app, so the notebook's `features`
# list is not in scope. Define the column order the scaler and model were
# fitted on (same list as the notebook's feature matrix).
features = ['user_id', 'device_id', 'ip_address', 'location',
            'login_success', 'hour', 'resource_accessed', 'bytes_transferred']

iso_model = joblib.load("isoforest_model.pkl")
scaler = joblib.load("scaler.pkl")

uploaded = st.file_uploader("Upload authentication logs (CSV)", type=["csv"])

if uploaded:
    df = pd.read_csv(uploaded)

    # Derive the `hour` feature the same way the notebook does when the upload
    # only carries the raw access_time column.
    if 'hour' not in df.columns and 'access_time' in df.columns:
        df['hour'] = pd.to_datetime(df['access_time'], format='%H:%M:%S').dt.hour

    missing = [col for col in features if col not in df.columns]
    non_numeric = [
        col for col in features
        if col in df.columns and not pd.api.types.is_numeric_dtype(df[col])
    ]

    if missing:
        st.error(f"Uploaded CSV is missing required columns: {missing}")
    elif non_numeric:
        # The model was trained on integer-encoded categoricals; raw strings
        # cannot be scaled. Report it instead of crashing inside transform().
        st.error(
            f"Columns must be numerically encoded before scoring: {non_numeric}"
        )
    else:
        X = scaler.transform(df[features])
        # The model predicts -1 for anomalies.
        preds = np.where(iso_model.predict(X) == -1, "Anomaly", "Normal")
        df["Prediction"] = preds
        st.dataframe(df)
        st.bar_chart(df["Prediction"].value_counts())


# 🕒 24. Temporal Sequence Behavior (User Baseline)

We’ll add a user-based average activity tracker to identify deviations.

In [None]:
# Cell 24: Temporal baseline modeling
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Per-(user, hour) average transfer volume, kept as a standalone lookup table.
user_hour_mean = (
    df.groupby(['user_id', 'hour'])['bytes_transferred']
    .mean()
    .reset_index()
    .rename(columns={'bytes_transferred': 'mean_bytes'})
)

# Attach each row's baseline with groupby().transform() instead of a merge.
# This overwrites mean_bytes in place, so the cell can be re-run safely; a
# second merge would create mean_bytes_x / mean_bytes_y and break the
# df['mean_bytes'] lookup below.
df['mean_bytes'] = (
    df.groupby(['user_id', 'hour'])['bytes_transferred'].transform('mean')
)

# Ratio of each event's volume to its user/hour baseline; the small epsilon
# avoids division by zero when the baseline is 0.
df['deviation_ratio'] = (df['bytes_transferred'] / (df['mean_bytes']+1e-5))

sns.histplot(df['deviation_ratio'], bins=50, kde=True)
plt.title("User Activity Deviation Ratio Distribution")
plt.show()


In [None]:
# To run the API:
# conda run -n llms python -m uvicorn anomaly_api:app --host 0.0.0.0 --port 8000

# M:246.U8g6x9-6K
