# Unsupervised Attack Detection Models (MovieLens 1M)

This notebook implements the unsupervised detection setup for:
- Isolation Forest
- One-Class SVM
- Autoencoder (Deep Learning)

Input: `movielens1m_user_features.csv` generated by the preprocessing/feature engineering notebook.
Output:
- Anomaly scores per user for each model
- A unified ranking of suspicious users
- Basic agreement analysis across models


In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

ROOT = os.getcwd()
FEATURES_PATH = os.path.join(ROOT, "movielens1m_user_features.csv")

pd.set_option("display.max_columns", 200)


Load features

In [2]:
features = pd.read_csv(FEATURES_PATH)

assert "user_id" in features.columns, "Expected 'user_id' column in features CSV."

print("Loaded features:", features.shape)
features.head()


Loaded features: (6040, 21)


Unnamed: 0,user_id,num_ratings,mean_rating,std_rating,min_rating,max_rating,entropy_rating,ratio_1,ratio_5,extreme_ratio,mean_abs_dev,delta_mean_s,delta_std_s,profile_span_s,ratings_per_day,burst_ratio_10min,index,mean_item_pop,std_item_pop,min_item_pop,max_item_pop
0,1,53,4.188679,0.674512,3.0,5.0,1.436588,0.0,0.339623,0.339623,0.551086,10083.307692,71662.866914,524332.0,8.733398,0.943396,0,1135.830189,750.967118,73,2991
1,2,129,3.713178,0.997624,1.0,5.0,1.953184,0.015504,0.263566,0.27907,0.851752,16.015625,20.706273,2050.0,5436.878049,0.992248,1,1012.790698,718.464056,47,3428
2,3,51,3.901961,0.975281,1.0,5.0,1.883402,0.019608,0.294118,0.313725,0.730488,29.72,52.37711,1486.0,2965.275908,0.980392,2,1376.784314,838.632625,92,3428
3,4,21,4.190476,1.051939,1.0,5.0,1.66759,0.047619,0.47619,0.52381,0.770975,17.9,42.399175,358.0,5068.156425,0.952381,3,1708.809524,829.642127,450,2991
4,5,196,3.147959,1.130986,1.0,5.0,2.16235,0.096939,0.107143,0.204082,0.918524,28.271795,70.647648,5513.0,3071.721386,0.989796,4,760.755102,671.144006,21,3428


Build X matrix + scaling

In [3]:
user_ids = features["user_id"].astype(int).values
X = features.drop(columns=["user_id"]).copy()

# Standardize for fair comparison (especially important for OC-SVM and Autoencoder)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("X_scaled:", X_scaled.shape)


X_scaled: (6040, 20)


Helper utilities (scoring + ranking)

In [4]:
def minmax_scale(a: np.ndarray) -> np.ndarray:
    a = np.asarray(a, dtype=float)
    mn, mx = np.min(a), np.max(a)
    if mx - mn < 1e-12:
        return np.zeros_like(a)
    return (a - mn) / (mx - mn)

def top_k_table(user_ids: np.ndarray, scores: np.ndarray, k: int = 30, score_name: str = "score") -> pd.DataFrame:
    df = pd.DataFrame({"user_id": user_ids, score_name: scores})
    df = df.sort_values(score_name, ascending=False).head(k).reset_index(drop=True)
    return df


Fit Isolation Forest + anomaly scores

In [5]:
CONTAMINATION = 0.02  # 2% suspicious as an initial working point

iso = IsolationForest(
    n_estimators=300,
    contamination=CONTAMINATION,
    random_state=42,
    n_jobs=-1
)

iso.fit(X_scaled)

# sklearn: score_samples -> higher means more normal
# We invert it so that higher = more anomalous
iso_score = -iso.score_samples(X_scaled)
iso_score_n = minmax_scale(iso_score)

print("IsolationForest score range:", iso_score.min(), iso_score.max())
top_k_table(user_ids, iso_score, k=20, score_name="iso_score")


IsolationForest score range: 0.35925518925986305 0.6736949552350892


Unnamed: 0,user_id,iso_score
0,4486,0.673695
1,164,0.668234
2,4582,0.645621
3,4638,0.64464
4,4634,0.640513
5,5411,0.633779
6,4273,0.63237
7,2111,0.631311
8,160,0.627259
9,609,0.623321


Fit OC-SVM + anomaly scores

In [6]:
NU = 0.02  # aligned with contamination for comparability

ocsvm = OneClassSVM(
    kernel="rbf",
    nu=NU,
    gamma="scale"
)

ocsvm.fit(X_scaled)

# decision_function: higher = more normal, lower = more anomalous
svm_dec = ocsvm.decision_function(X_scaled).ravel()
svm_score = -svm_dec  # higher = more anomalous
svm_score_n = minmax_scale(svm_score)

print("OC-SVM score range:", svm_score.min(), svm_score.max())
top_k_table(user_ids, svm_score, k=20, score_name="svm_score")


OC-SVM score range: -1.930440362642475 1.6486909449245934


Unnamed: 0,user_id,svm_score
0,3598,1.648691
1,4486,1.648691
2,2744,1.508327
3,195,1.339602
4,4751,1.280904
5,5635,1.187723
6,4463,1.179753
7,46,1.011191
8,5411,0.940429
9,4874,0.888045


Autoencoder (Deep Learning)

In [7]:
# Try PyTorch first
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
ds = TensorDataset(X_tensor)
dl = DataLoader(ds, batch_size=256, shuffle=True)

n_in = X_scaled.shape[1]

class AutoEncoder(nn.Module):
    def __init__(self, n_in: int):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(n_in, 64),
            nn.ReLU(),
            nn.Linear(64, 16),
            nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(16, 64),
            nn.ReLU(),
            nn.Linear(64, n_in),
        )

    def forward(self, x):
        z = self.encoder(x)
        return self.decoder(z)

ae = AutoEncoder(n_in).to(device)

criterion = nn.MSELoss(reduction="none")
optimizer = torch.optim.Adam(ae.parameters(), lr=1e-3)

EPOCHS = 25

ae.train()
for epoch in range(1, EPOCHS + 1):
    losses = []
    for (xb,) in dl:
        xb = xb.to(device)
        optimizer.zero_grad()
        xhat = ae(xb)
        loss = criterion(xhat, xb).mean()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    if epoch % 5 == 0 or epoch == 1:
        print(f"Epoch {epoch:02d}/{EPOCHS} - loss: {np.mean(losses):.6f}")


Device: cuda
Epoch 01/25 - loss: 0.962515
Epoch 05/25 - loss: 0.266018
Epoch 10/25 - loss: 0.074246
Epoch 15/25 - loss: 0.037370
Epoch 20/25 - loss: 0.027500
Epoch 25/25 - loss: 0.022136


In [8]:
ae.eval()
with torch.no_grad():
    Xb = X_tensor.to(device)
    Xhat = ae(Xb).cpu().numpy()

recon_err = np.mean((X_scaled - Xhat) ** 2, axis=1)  # MSE per user
ae_score = recon_err
ae_score_n = minmax_scale(ae_score)

print("AE score range:", ae_score.min(), ae_score.max())
top_k_table(user_ids, ae_score, k=20, score_name="ae_score")


AE score range: 0.001988441293809439 0.5943741578014935


Unnamed: 0,user_id,ae_score
0,4486,0.594374
1,3598,0.457052
2,4463,0.427355
3,46,0.399165
4,195,0.319919
5,4550,0.310708
6,164,0.307285
7,311,0.303125
8,2362,0.262272
9,5635,0.24868


Combined ranking + agreement analysis
Merge scores into one table

In [9]:
scores = pd.DataFrame({
    "user_id": user_ids,
    "iso_score": iso_score,
    "svm_score": svm_score,
    "ae_score":  ae_score,
    # normalized for comparability
    "iso_score_n": iso_score_n,
    "svm_score_n": svm_score_n,
    "ae_score_n":  ae_score_n,
})

# Simple ensemble: mean of normalized anomaly scores
scores["ensemble_score"] = scores[["iso_score_n", "svm_score_n", "ae_score_n"]].mean(axis=1)

scores = scores.sort_values("ensemble_score", ascending=False).reset_index(drop=True)
scores.head(20)


Unnamed: 0,user_id,iso_score,svm_score,ae_score,iso_score_n,svm_score_n,ae_score_n,ensemble_score
0,4486,0.673695,1.648691,0.594374,1.0,1.0,1.0,1.0
1,3598,0.616883,1.648691,0.457052,0.819323,1.0,0.768188,0.862504
2,4463,0.604229,1.179753,0.427355,0.77908,0.86898,0.718057,0.788706
3,46,0.598529,1.011191,0.399165,0.760954,0.821884,0.67047,0.751103
4,164,0.668234,0.657408,0.307285,0.982632,0.723038,0.515367,0.740346
5,5635,0.60513,1.187723,0.24868,0.781946,0.871207,0.416437,0.689863
6,5411,0.633779,0.940429,0.20631,0.873057,0.802114,0.344913,0.673361
7,89,0.622205,0.808184,0.247933,0.83625,0.765165,0.415177,0.672197
8,2744,0.57209,1.508327,0.224115,0.676869,0.960783,0.374969,0.670874
9,195,0.53171,1.339602,0.319919,0.54845,0.913641,0.536694,0.666262


Top-K suspicious users (per model + ensemble)

In [10]:
K = 30

print("Top-K (Ensemble):")
display(scores[["user_id","ensemble_score","iso_score_n","svm_score_n","ae_score_n"]].head(K))

print("\nTop-K (Isolation Forest):")
display(top_k_table(user_ids, iso_score, k=K, score_name="iso_score"))

print("\nTop-K (OC-SVM):")
display(top_k_table(user_ids, svm_score, k=K, score_name="svm_score"))

print("\nTop-K (Autoencoder):")
display(top_k_table(user_ids, ae_score, k=K, score_name="ae_score"))


Top-K (Ensemble):


Unnamed: 0,user_id,ensemble_score,iso_score_n,svm_score_n,ae_score_n
0,4486,1.0,1.0,1.0,1.0
1,3598,0.862504,0.819323,1.0,0.768188
2,4463,0.788706,0.77908,0.86898,0.718057
3,46,0.751103,0.760954,0.821884,0.67047
4,164,0.740346,0.982632,0.723038,0.515367
5,5635,0.689863,0.781946,0.871207,0.416437
6,5411,0.673361,0.873057,0.802114,0.344913
7,89,0.672197,0.83625,0.765165,0.415177
8,2744,0.670874,0.676869,0.960783,0.374969
9,195,0.666262,0.54845,0.913641,0.536694



Top-K (Isolation Forest):


Unnamed: 0,user_id,iso_score
0,4486,0.673695
1,164,0.668234
2,4582,0.645621
3,4638,0.64464
4,4634,0.640513
5,5411,0.633779
6,4273,0.63237
7,2111,0.631311
8,160,0.627259
9,609,0.623321



Top-K (OC-SVM):


Unnamed: 0,user_id,svm_score
0,3598,1.648691
1,4486,1.648691
2,2744,1.508327
3,195,1.339602
4,4751,1.280904
5,5635,1.187723
6,4463,1.179753
7,46,1.011191
8,5411,0.940429
9,4874,0.888045



Top-K (Autoencoder):


Unnamed: 0,user_id,ae_score
0,4486,0.594374
1,3598,0.457052
2,4463,0.427355
3,46,0.399165
4,195,0.319919
5,4550,0.310708
6,164,0.307285
7,311,0.303125
8,2362,0.262272
9,5635,0.24868


Agreement between models (overlap of top-K)

In [11]:
def topk_set(df: pd.DataFrame, col: str, k: int) -> set:
    return set(df.sort_values(col, ascending=False).head(k)["user_id"].astype(int).tolist())

top_iso = topk_set(scores, "iso_score_n", K)
top_svm = topk_set(scores, "svm_score_n", K)
top_ae  = topk_set(scores, "ae_score_n",  K)
top_ens = topk_set(scores, "ensemble_score", K)

def jaccard(a: set, b: set) -> float:
    return len(a & b) / len(a | b) if len(a | b) else 0.0

agreement = pd.DataFrame({
    "pair": ["ISO vs SVM", "ISO vs AE", "SVM vs AE", "ENS vs ISO", "ENS vs SVM", "ENS vs AE"],
    "jaccard_topK": [
        jaccard(top_iso, top_svm),
        jaccard(top_iso, top_ae),
        jaccard(top_svm, top_ae),
        jaccard(top_ens, top_iso),
        jaccard(top_ens, top_svm),
        jaccard(top_ens, top_ae),
    ]
})
agreement


Unnamed: 0,pair,jaccard_topK
0,ISO vs SVM,0.25
1,ISO vs AE,0.2
2,SVM vs AE,0.333333
3,ENS vs ISO,0.428571
4,ENS vs SVM,0.538462
5,ENS vs AE,0.5


Save outputs (for paper tables / later inspection)

In [12]:
OUT_PATH = os.path.join(ROOT, "movielens1m_unsupervised_scores.csv")
scores.to_csv(OUT_PATH, index=False)
print("Saved:", OUT_PATH)


Saved: C:\Users\USUARIO\Desktop\app\movielens1m_unsupervised_scores.csv
