#### Train Our mouse telemetry detection model

In [2]:
import os
import re
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau
print(torch.__version__)


2.6.0


#### Extract and preporcess mouse movement

The dataset is not copied here, Please chose your own dataset and Use this file just as a reference.

This file will not work as we don't have dataset for it to train in this git reprository as Git has some space limitation for free tier

In [None]:
def parse_coordinates(coord_str):
    return re.findall(r'\[(\d+),(\d+)\]', coord_str)

def parse_timestamps(timestamps_str):
    return timestamps_str.strip(',').split(',')

def process_file(filepath):
    with open(filepath, 'r') as f:
        data = json.load(f)
    
    session_id = data.get("session_id")
    coords_raw = data.get("mousemove_total_behaviour", "")
    times_raw = data.get("mousemove_times", "")
    
    coords = parse_coordinates(coords_raw)
    times = parse_timestamps(times_raw)
    
    # Ensure equal length
    length = min(len(coords), len(times))
    coords = coords[:length]
    times = times[:length]
    
    records = []
    for i in range(length):
        x, y = coords[i]
        timestamp = times[i]
        records.append({
            "session_id": session_id,
            "timestamp": int(timestamp),
            "x": int(x),
            "y": int(y)
        })
    
    return records

def read_all_jsons(root_dir):
    all_records = []
    for subdir, _, files in os.walk(root_dir):
        for file in files:
            if file.endswith('.json'):
                filepath = os.path.join(subdir, file)
                try:
                    records = process_file(filepath)
                    all_records.extend(records)
                except Exception as e:
                    print(f"Failed to process {filepath}: {e}")
    return pd.DataFrame(all_records)

# Example usage
root_directory = './phase1/data/mouse_movements'
df = read_all_jsons(root_directory)


In [None]:
label_df = pd.read_csv("./phase1/final", delim_whitespace=True, header=None, names=["session_id", "label"])
label_map = dict(zip(label_df["session_id"], label_df["label"]))

  label_df = pd.read_csv("./phase1/final", delim_whitespace=True, header=None, names=["session_id", "label"])


In [63]:
df["label"] = df["session_id"].map(label_map)
print(df["label"].value_counts())  # Check how many humans vs bots
df = df.drop_duplicates()


label
human           1071630
advanced_bot     368340
moderate_bot     199026
Name: count, dtype: int64


In [64]:
# Convert 'human' to 0, everything else to 1
df['label'] = df['label'].apply(lambda x: 0 if str(x).strip().lower() == 'human' else 1)
print(df)

                         session_id  timestamp   x    y  label
0        v3hbnujku5f0bm2mmbjqrr9rg8  621539491   7    6      1
1        v3hbnujku5f0bm2mmbjqrr9rg8  621539502  15   14      1
2        v3hbnujku5f0bm2mmbjqrr9rg8  621539518  23   22      1
3        v3hbnujku5f0bm2mmbjqrr9rg8  621539535  31   30      1
4        v3hbnujku5f0bm2mmbjqrr9rg8  621539552  39   38      1
...                             ...        ...  ..  ...    ...
1595686  vllcsfvm4m4b6559eckqpg3mvq  445334161  45  183      1
1595687  vllcsfvm4m4b6559eckqpg3mvq  445334177  44  183      1
1595688  vllcsfvm4m4b6559eckqpg3mvq  445334194  43  183      1
1595689  vllcsfvm4m4b6559eckqpg3mvq  445334211  42  182      1
1595690  vllcsfvm4m4b6559eckqpg3mvq  445334227  42  182      1

[1103181 rows x 5 columns]


In [65]:
print(df["label"].value_counts())  # Check how many humans vs bots

label
1    567366
0    535815
Name: count, dtype: int64


In [None]:
def create_sequences_with_labels(df, seq_length=20):
    sequences = []
    labels = []
    session_count = 0
    total_skipped = 0

    # Clean session_id
    df["session_id"] = df["session_id"].astype(str).str.strip()
    for session_id, group in df.groupby("session_id"):
        session_count += 1
        group = group.sort_values("timestamp").reset_index(drop=True)
        if len(group) < seq_length:
            total_skipped += 1
            continue

        group = group.copy()
        group["dt"] = group["timestamp"].diff().fillna(0).astype(float)
        data = group[["x", "y", "dt"]].values
        label = group["label"].iloc[0]  # same for entire session

        for i in range(len(data) - seq_length + 1):
            sequences.append(data[i:i+seq_length].copy())
            labels.append(label)

    print(f"Sessions processed: {session_count}")
    print(f"Sessions skipped (too short): {total_skipped}")
    print(f"Total sequences created: {len(sequences)}")

    return np.array(sequences), np.array(labels)

# Example usage
sequences, y = create_sequences_with_labels(df, seq_length=20)



Sessions processed: 150
Sessions skipped (too short): 0
Total sequences created: 1100331


In [67]:
print("X shape:", sequences.shape)  # (num_sequences, seq_length, 3)
print("y shape:", y.shape)          # (num_sequences,)
print("y sample:", set(y))

X shape: (1100331, 20, 3)
y shape: (1100331,)
y sample: {np.int64(0), np.int64(1)}


In [68]:
def normalize_sequence_minmax(seq):
    # seq: (20, 3) — one sequence
    seq = seq.astype(np.float32)
    normalized = np.zeros_like(seq)
    eps = 1e-8  # avoid divide by zero
    for i in range(seq.shape[1]):  # for each column (x, y, dt)
        col = seq[:, i]
        col_min = np.min(col)
        col_max = np.max(col)
        normalized[:, i] = (col - col_min) / (col_max - col_min + eps)

    return normalized
sequences = np.array([normalize_sequence_minmax(seq) for seq in sequences])

#### Our new sequence structure

In [69]:
print(sequences)

[[[0.         0.         0.        ]
  [0.05517241 0.05263158 0.47058824]
  [0.10344828 0.10526316 0.9411765 ]
  ...
  [0.8965517  0.8947368  0.9411765 ]
  [0.9517241  0.94736844 1.        ]
  [1.         1.         1.        ]]

 [[0.         0.         0.        ]
  [0.04827586 0.05263158 0.8888889 ]
  [0.10344828 0.10526316 1.        ]
  ...
  [0.8965517  0.8947368  1.        ]
  [0.94482756 0.94736844 1.        ]
  [1.         1.         0.8888889 ]]

 [[0.         0.         0.        ]
  [0.05517241 0.05263158 1.        ]
  [0.11034483 0.10526316 1.        ]
  ...
  [0.8965517  0.8947368  1.        ]
  [0.9517241  0.94736844 0.        ]
  [1.         1.         1.        ]]

 ...

 [[1.         0.         1.        ]
  [0.9354839  0.06796116 1.        ]
  [0.87096775 0.13592233 0.        ]
  ...
  [0.01612903 0.98058254 0.        ]
  [0.00806452 0.99029124 1.        ]
  [0.         1.         1.        ]]

 [[1.         0.         1.        ]
  [0.93162394 0.07216495 0.        ]


In [None]:
# Convert to torch tensors
X_tensor = torch.tensor(sequences, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

# Create full dataset
dataset = TensorDataset(X_tensor, y_tensor)

# Split lengths
total_len = len(dataset)
train_len = int(0.9 * total_len)
test_len = int(0.05 * total_len)
val_len = total_len - train_len - test_len

# Split the dataset
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_len, val_len, test_len])

# Dataloaders
batch_size = 512
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size)
test_loader  = DataLoader(test_dataset, batch_size=batch_size)

# Optional sanity check
for xb, yb in train_loader:
    print("Batch X shape:", xb.shape)  # (batch_size, seq_length, 3)
    print("Batch y shape:", yb.shape)  # (batch_size,)
    break


[1 1 1 ... 1 1 1]
Batch X shape: torch.Size([512, 20, 3])
Batch y shape: torch.Size([512])


You can use your own method for preprocessing the data; the code above is meant for that purpose. You can then modify the code below to train the model.

#### Device selection for better training

In [None]:
# 3. Device Setup
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: mps


#### Our LSTM based model that is to be deployed for anti scrapper/bot detection tool

In [None]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim=3, hidden_dim=16, lstm_layers=1, fc_dims=[3,1], seq_len=20):
        super(LSTMClassifier, self).__init__()

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=True
        )

        self.fc1 = nn.Linear(hidden_dim * 2, fc_dims[0])
        self.fc2 = nn.Linear(fc_dims[0], fc_dims[1])

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()  # for binary classification output

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        h_forward = h_n[-2]
        h_backward = h_n[-1]
        h_combined = torch.cat((h_forward, h_backward), dim=1)  # (batch, hidden_dim * 2)

        out = self.relu(self.fc1(h_combined))
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(
    input_dim=3, 
    hidden_dim=32, 
    lstm_layers=1,
    fc_dims=[2, 1],
    seq_len=20
).to(device)

criterion = nn.BCELoss()  # Binary Cross Entropy
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [75]:
def train_epoch(model, loader):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for xb, yb in loader:
        xb, yb = xb.to(device), yb.float().to(device)
        output = model(xb).squeeze()
        loss = criterion(output, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Accuracy calculation
        preds = (output >= 0.5).float()
        correct += (preds == yb).sum().item()
        total += yb.size(0)

    avg_loss = total_loss / len(loader)
    accuracy = correct / total
    return avg_loss, accuracy


def validate_epoch(model, loader):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.float().to(device)
            output = model(xb).squeeze()
            loss = criterion(output, yb)
            total_loss += loss.item()

            # Accuracy calculation
            preds = (output >= 0.5).float()
            correct += (preds == yb).sum().item()
            total += yb.size(0)

    avg_loss = total_loss / len(loader)
    accuracy = correct / total
    return avg_loss, accuracy


In [None]:
# 1. Define Scheduler
scheduler = StepLR(optimizer, step_size=20, gamma=0.5)  # Halve LR every 20 epochs

# -------------------------
# 2. Training Loop with Scheduler
epochs = 40

scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

for epoch in range(1, epochs + 1):
    train_loss, train_acc = train_epoch(model, train_loader)
    val_loss, val_acc = validate_epoch(model, val_loader)

    print(f"[Epoch {epoch:02d}] "
          f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc*100:.2f}% | "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc*100:.2f}%")


In [78]:
torch.save(model.state_dict(), "model_weightsless.pth")

In [28]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def test_with_report_and_confusion(model, loader):
    model.eval()
    all_preds = []
    all_targets = []
    all_inputs = []

    with torch.no_grad():
        for xb, yb in loader:
            xb_device = xb.to(device)
            output = model(xb_device).squeeze().cpu()
            preds = (output >= 0.5).int()

            all_inputs.extend(xb.cpu())      # Save for later plotting
            all_preds.extend(preds.tolist())
            all_targets.extend(yb.tolist())

    # Classification Report
    print("\n📊 Classification Report:")
    print(classification_report(all_targets, all_preds, target_names=["Human (0)", "Bot (1)"]))

    # Confusion Matrix
    cm = confusion_matrix(all_targets, all_preds)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=["Human (0)", "Bot (1)"],
                yticklabels=["Human (0)", "Bot (1)"])
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("🧩 Confusion Matrix")
    plt.show()

    return all_inputs, all_targets, all_preds

def visualize_confusion_examples(all_inputs, all_targets, all_preds, max_samples=5):
    categories = {
        "True Positive (Bot Correct)": [],
        "True Negative (Human Correct)": [],
        "False Positive (Human → Bot)": [],
        "False Negative (Bot → Human)": [],
    }

    for x, true, pred in zip(all_inputs, all_targets, all_preds):
        if true == 1 and pred == 1:
            categories["True Positive (Bot Correct)"].append(x)
        elif true == 0 and pred == 0:
            categories["True Negative (Human Correct)"].append(x)
        elif true == 0 and pred == 1:
            categories["False Positive (Human → Bot)"].append(x)
        elif true == 1 and pred == 0:
            categories["False Negative (Bot → Human)"].append(x)

    # Plot sequences
    for title, samples in categories.items():
        print(f"\n📌 {title} — Showing {min(max_samples, len(samples))} samples:")
        selected = random.sample(samples, min(max_samples, len(samples)))
        for i, seq in enumerate(selected, 1):
            x_vals = seq[:, 0]
            y_vals = seq[:, 1]
            plt.plot(x_vals, y_vals, marker='o')
            plt.title(f"{title} #{i}")
            plt.xlabel("x")
            plt.ylabel("y")
            plt.grid(True)
            plt.show()



### Out client side model that will work on Javascript

In [None]:
class JSModel(nn.Module):
    def __init__(self, original_model):
        super(JSModel, self).__init__()
        self.lstm = original_model.lstm
        self.fc1 = original_model.fc1
        self.relu = nn.ReLU()

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        h_forward = h_n[-2]
        h_backward = h_n[-1]
        h_combined = torch.cat((h_forward, h_backward), dim=1)
        out = self.relu(self.fc1(h_combined))
        return out

dummy_input = torch.randn(1, 20, 3)  # (batch, seq_len, features)
js_model = JSModel(model)

The js_model.onnx will be used by JS to capture latent vectors

In [None]:
torch.onnx.export(js_model, dummy_input, "js_model.onnx", input_names=["input"], output_names=["output"])

## Get server model

In [82]:
class FinalClassifier(nn.Module):
    def __init__(self):
        super(FinalClassifier, self).__init__()
        self.fc2 = nn.Linear(2, 1)   # because JS sends 5 latent values
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        return self.sigmoid(self.fc2(x))


state_dict = torch.load('model_weightsless.pth', map_location=device)

# Extract only fc2 weights
fc2_state_dict = {k.replace('fc2.', ''): v for k, v in state_dict.items() if k.startswith('fc2.')}

model = FinalClassifier().to(device)
model.fc2.load_state_dict(fc2_state_dict)
model.eval()


FinalClassifier(
  (fc2): Linear(in_features=2, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:
# Only save last layer
torch.save(model.fc2.state_dict(), 'server_model.pth')