In [None]:
# Obtain the notebook's active Snowpark session.
# The helper is imported in this cell because the cell runs before the main
# imports cell and nothing else in the notebook brings the name into scope;
# without the import a fresh kernel fails with NameError.
from snowflake.snowpark.context import get_active_session

session = get_active_session()

In [None]:
# Core imports
import pandas as pd
import numpy as np

from snowflake.snowpark import Session
from snowflake.snowpark.functions import col

# ML imports
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt


In [None]:
# Step 1: reference the source table through the Snowpark session
sf_df = session.table("PUBLIC.PANELIST_ALL_FEATURES_ENC")

# Step 2: materialise it as a pandas DataFrame so scikit-learn can consume it
pdf = sf_df.to_pandas()

# Step 3: every column except the identifier is treated as a model feature
# NOTE(review): nothing here checks that these columns are actually numeric — confirm upstream
id_col = "PANELIST_ID"

all_cols = pdf.columns.tolist()
numeric_cols = [name for name in all_cols if name != id_col]

# Short overview of what was loaded
for label, value in (
    ("ID column:", id_col),
    ("Number of numeric feature columns:", len(numeric_cols)),
    ("Sample feature columns:", numeric_cols[:10]),
    ("Number of rows:", len(pdf)),
):
    print(label, value)

In [None]:
# Feature matrix only (the ID column is left out)
X_raw = pdf[numeric_cols].to_numpy()

# Stage 1: fill missing values with the per-column mean
imputer = SimpleImputer(strategy="mean")
imputer.fit(X_raw)
X_imputed = imputer.transform(X_raw)

# Stage 2: standardise each feature with a scaler fitted on the imputed data
scaler = StandardScaler()
scaler.fit(X_imputed)
X_scaled = scaler.transform(X_imputed)

print("Shape of X_scaled:", X_scaled.shape)

In [None]:
# Isolation Forest: 200 trees, ~2% of rows assumed anomalous (tune this!),
# fixed seed, all available cores. fit() returns the fitted estimator itself.
iso = IsolationForest(
    n_estimators=200,
    contamination=0.02,
    random_state=42,
    n_jobs=-1,
).fit(X_scaled)

# predict() yields -1 for anomalies and 1 for normal rows
labels = iso.predict(X_scaled)
# Negated decision function, so a higher score means more anomalous
scores = np.negative(iso.decision_function(X_scaled))

# Store the model outputs next to the original rows
pdf["ANOMALY_LABEL"] = labels
pdf["ANOMALY_SCORE"] = scores

# Convenience 0/1 flag: 1 = anomaly, 0 = normal
pdf["ANOMALY_FLAG"] = pdf["ANOMALY_LABEL"].eq(-1).astype(int)

# Quick summary of how many rows were flagged
anomaly_flags = pdf["ANOMALY_FLAG"]
print("Total rows:", len(pdf))
print("Anomaly count:", anomaly_flags.sum())
print("Anomaly %:", anomaly_flags.mean() * 100)


In [None]:
# One row of feature importances per fitted tree in the forest
tree_importances = np.stack(
    [estimator.feature_importances_ for estimator in iso.estimators_]
)

# Mean importance of each feature over all trees
feature_importance = np.mean(tree_importances, axis=0)

# Label the values with the feature names and rank from highest to lowest
feat_imp = pd.Series(data=feature_importance, index=numeric_cols)
feat_imp = feat_imp.sort_values(ascending=False)

print("Top 20 most important features:")
display(feat_imp.iloc[:20])


In [None]:
# Optional: down-sample so the scatter plot stays light
max_points = 5000
n = len(pdf)

# Seeded generator: the same subset is drawn on every run, in line with the
# fixed random_state=42 used for the model and the PCA.
rng = np.random.default_rng(42)
if n > max_points:
    sample_idx = rng.choice(n, size=max_points, replace=False)
else:
    sample_idx = np.arange(n)

X_sample = X_scaled[sample_idx]
anom_sample = pdf["ANOMALY_FLAG"].values[sample_idx]

# 2D PCA projection, used for visualisation only
pca_vis = PCA(n_components=2, random_state=42)
X_pca_2d = pca_vis.fit_transform(X_sample)

plt.figure(figsize=(8, 6))
plt.scatter(
    X_pca_2d[:, 0],
    X_pca_2d[:, 1],
    c=anom_sample,
    # Blue-to-red colormap with a pinned 0..1 range so the colours match the
    # title (0 = blue, 1 = red) even when the sample contains a single class.
    cmap="coolwarm",
    vmin=0,
    vmax=1,
    s=8,
    alpha=0.7
)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("Isolation Forest Anomalies (1 = red-ish, 0 = blue-ish)")
plt.tight_layout()
plt.show()


In [None]:
# Persist only the identifier and the model outputs
result_cols = [id_col, "ANOMALY_FLAG", "ANOMALY_SCORE"]

result_pdf = pdf[result_cols].copy()

# Write to the PUBLIC schema explicitly: the source table is read from PUBLIC
# and the follow-up SQL cell queries PUBLIC.PANELIST_ISOFOREST_SCORED, so the
# target must not depend on whatever the session's current schema happens to be.
# overwrite=True replaces the table contents on every run.
result = session.write_pandas(
    result_pdf,
    "PANELIST_ISOFOREST_SCORED",
    schema="PUBLIC",
    auto_create_table=True,
    overwrite=True
)

print("write_pandas result:", result)


In [None]:
-- Number of scored rows for each anomaly flag value
SELECT
    ANOMALY_FLAG,
    COUNT(*)
FROM
    PUBLIC.PANELIST_ISOFOREST_SCORED
GROUP BY
    ANOMALY_FLAG


In [None]:
# Partition the scored rows by flag value (1 = anomaly, 0 = normal);
# .copy() gives each subset its own data, independent of pdf
anom = pdf.loc[pdf["ANOMALY_FLAG"].eq(1)].copy()
norm = pdf.loc[pdf["ANOMALY_FLAG"].eq(0)].copy()

for label, subset in (("Anomalies:", anom), ("Normals:", norm)):
    print(label, len(subset))


In [None]:
# Fail with a clear message when the model flagged nothing
if anom.empty:
    raise ValueError("No anomalies were flagged; there is no row to explain.")

# Feature values of the first anomaly, kept 2-D (1 x n_features)
anomaly_row_raw = anom[numeric_cols].iloc[[0]].values

# Re-apply the full training pipeline: impute, then scale.
# The model was fitted on imputed + scaled data, and StandardScaler passes NaN
# through unchanged, so skipping the imputer would explain a different row
# than the one the model actually scored.
anomaly_row_scaled = scaler.transform(imputer.transform(anomaly_row_raw))


In [None]:
import shap

# Tree-based SHAP explainer built around the fitted Isolation Forest
explainer = shap.TreeExplainer(model=iso)

# SHAP values for the single scaled anomaly row
shap_values_single = explainer.shap_values(anomaly_row_scaled)


In [None]:
# Raw feature values of the first anomaly.
# Columns are selected before the row so the array keeps the features' numeric
# dtype and is already 2-D (1 x n_features).
anomaly_row = anom[numeric_cols].iloc[[0]].values

# Explain the row exactly as the model saw it: impute, then scale.
# StandardScaler alone would pass NaN through, so the imputer must not be skipped.
shap_values_single = explainer.shap_values(
    scaler.transform(imputer.transform(anomaly_row))
)

# Show the raw (unscaled) values next to the feature names for readability
shap.force_plot(
    explainer.expected_value,
    shap_values_single,
    anomaly_row,
    feature_names=numeric_cols
)


In [None]:
# Summary plot of the SHAP values for the single explained anomaly row,
# using the scaled feature values and the feature column names as labels
shap.summary_plot(
    shap_values=shap_values_single,
    features=anomaly_row_scaled,
    feature_names=numeric_cols,
)
