# **Implementation of All Three Anomalies Proposed Solutions Executed in This Notebook.**

# **Anomaly 1: Public Transport & Contactless Payment Data**

In [None]:
!pip install diffprivlib

Collecting diffprivlib
  Downloading diffprivlib-0.6.6-py3-none-any.whl.metadata (10 kB)
Downloading diffprivlib-0.6.6-py3-none-any.whl (176 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.9/176.9 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: diffprivlib
Successfully installed diffprivlib-0.6.6


In [None]:
import math
import numpy as np
import pandas as pd
from diffprivlib.mechanisms import Laplace

In [None]:
BASE_EPSILON = 0.5
LOW_ENTROPY_FACTOR = 0.5
H_MIN = 1.0
ROUND_RESULTS = True
SENSITIVITY = 1.0

In [None]:
csv_path = "transit_synthetic_30.csv"  # path created earlier
df = pd.read_csv(csv_path)

In [None]:
df.head()

Unnamed: 0,UserID,Station,Time
0,U8,StationA,09:00
1,U6,StationB,07:00
2,U1,StationC,06:00
3,U5,StationA,09:00
4,U4,StationB,09:00


In [None]:
# Ensure Time -> Hour bucket (string "HH:MM" kept)
df["Hour"] = df["Time"].astype(str).str.slice(0,5)

In [None]:
# If we want at most one contribution per user per (Station, Hour), dedupe:
df_dedup = df.drop_duplicates(subset=["UserID", "Station", "Hour"]).copy()

In [None]:
# Aggregate: collect user lists per (Station, Hour)
grouped = df_dedup.groupby(["Station", "Hour"])["UserID"].agg(list).reset_index()
grouped["true_count"] = grouped["UserID"].apply(len)

In [None]:
grouped

Unnamed: 0,Station,Hour,UserID,true_count
0,StationA,06:00,"[U6, U2, U7]",3
1,StationA,07:00,"[U4, U8, U3, U7, U9]",5
2,StationA,09:00,"[U8, U5, U1]",3
3,StationB,06:00,"[U3, U9]",2
4,StationB,07:00,[U6],1
5,StationB,08:00,[U5],1
6,StationB,09:00,"[U4, U6]",2
7,StationC,06:00,"[U1, U5, U4]",3
8,StationC,07:00,[U6],1
9,StationC,08:00,"[U7, U5]",2


In [None]:
def shannon_entropy_from_userlist(user_list):
    if not user_list:
        return 0.0
    counts = {}
    for u in user_list:
        counts[u] = counts.get(u, 0) + 1
    total = sum(counts.values())
    entropy = 0.0
    for c in counts.values():
        p = c / total
        entropy -= p * math.log2(p)
    return entropy

In [None]:
def laplace_mechanism(value, epsilon, sensitivity=SENSITIVITY):
    if epsilon <= 0:
        raise ValueError("Epsilon must be > 0")

    mech = Laplace(epsilon=epsilon, sensitivity=sensitivity)
    noisy = mech.randomise(value)
    return noisy

In [None]:
results = []
for _, row in grouped.iterrows():
    station = row["Station"]
    hour = row["Hour"]
    users = row["UserID"]
    true_cnt = row["true_count"]

    # compute entropy
    H = shannon_entropy_from_userlist(users)

    # choose epsilon based on entropy
    if H < H_MIN:
        eps = BASE_EPSILON * LOW_ENTROPY_FACTOR
    else:
        eps = BASE_EPSILON

    noisy_value = laplace_mechanism(true_cnt, epsilon=eps, sensitivity=SENSITIVITY)

    if ROUND_RESULTS:
        noisy_value = int(round(noisy_value))
        noisy_value = max(noisy_value, 0)

    results.append({
        "Station": station,
        "Hour": hour,
        "true_count": true_cnt,
        "entropy_bits": H,
        "epsilon_used": eps,
        "noisy_count": noisy_value
    })

In [None]:
results_df = pd.DataFrame(results).sort_values(["Station", "Hour"]).reset_index(drop=True)
print("Entropy-aware DP counts (using diffprivlib if installed):")
print(results_df.to_string(index=False))

Entropy-aware DP counts (using diffprivlib if installed):
 Station  Hour  true_count  entropy_bits  epsilon_used  noisy_count
StationA 06:00           3      1.584963          0.50            2
StationA 07:00           5      2.321928          0.50            8
StationA 09:00           3      1.584963          0.50            3
StationB 06:00           2      1.000000          0.50            3
StationB 07:00           1      0.000000          0.25            8
StationB 08:00           1      0.000000          0.25            2
StationB 09:00           2      1.000000          0.50            2
StationC 06:00           3      1.584963          0.50            2
StationC 07:00           1      0.000000          0.25            4
StationC 08:00           2      1.000000          0.50            2
StationC 09:00           2      1.000000          0.50            2


# **Anomaly 2: Location Data Exploitation in GeoFencing Apps**

In [None]:
!pip install phe

Collecting phe
  Downloading phe-1.5.0-py2.py3-none-any.whl.metadata (3.8 kB)
Downloading phe-1.5.0-py2.py3-none-any.whl (53 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.7/53.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: phe
Successfully installed phe-1.5.0


In [None]:
from phe import paillier
import math

In [None]:
# Utilities: fixed-point encoding

def encode_coord(value: float, scale: int) -> int:
    """Encode a floating point coordinate to a scaled integer."""
    return int(round(value * scale))

def encode_square(value: float, scale: int) -> int:
    """Encode a square (value**2) using square-scale = scale**2."""
    return int(round((value * value) * (scale * scale)))

def decode_square_int(value_int: int, scale: int) -> float:
    """Decode integer that was encoded with scale**2 back to float."""
    return value_int / (scale * scale)

In [None]:
# Key setup (run once, or distributed)

def generate_keys(n_length: int = 2048):

    public_key, private_key = paillier.generate_paillier_keypair(n_length=n_length)
    return public_key, private_key

In [None]:
# Client-side: encrypt location
def client_encrypt_location(pubkey, x: float, y: float, scale: int = 10**6):
    x_scaled = encode_coord(x, scale)
    y_scaled = encode_coord(y, scale)
    x2_scaled = encode_square(x, scale)   # scaled by scale**2
    y2_scaled = encode_square(y, scale)

    enc_x = pubkey.encrypt(x_scaled)
    enc_y = pubkey.encrypt(y_scaled)
    enc_x2 = pubkey.encrypt(x2_scaled)
    enc_y2 = pubkey.encrypt(y2_scaled)

    return {
        "enc_x": enc_x,
        "enc_y": enc_y,
        "enc_x2": enc_x2,
        "enc_y2": enc_y2,
        "scale": scale  # so server/client can stay consistent
    }

In [None]:
# Server-side: homomorphic computation
def server_compute_encrypted_distance_squared(pubkey, encrypted_bundle, fence_center, scale:int = None):
    enc_x = encrypted_bundle["enc_x"]
    enc_y = encrypted_bundle["enc_y"]
    enc_x2 = encrypted_bundle["enc_x2"]
    enc_y2 = encrypted_bundle["enc_y2"]
    if scale is None:
        scale = encrypted_bundle.get("scale", 10**6)

    xg, yg = fence_center
    # scaled integer versions for center
    xg_scaled = encode_coord(xg, scale)
    yg_scaled = encode_coord(yg, scale)

    # Term1: enc_x2 + enc_y2  (already scaled by scale**2)
    enc_term1 = enc_x2 + enc_y2  # EncryptedNumber supports addition

    # Term2: -2 * (xg * x + yg * y)
    # For enc_x which is x_scaled, multiply by (-2 * xg_scaled)
    # This yields -2 * xg_scaled * x_scaled = -2 * xg * x * scale**2 (correct units)
    enc_minus2_xg_x = enc_x * (-2 * xg_scaled)
    enc_minus2_yg_y = enc_y * (-2 * yg_scaled)

    enc_term2 = enc_minus2_xg_x + enc_minus2_yg_y

    # Term3: constant (xg^2 + yg^2) scaled by scale**2
    const_val = encode_square(xg, scale) + encode_square(yg, scale)
    enc_const = pubkey.encrypt(const_val)

    # Sum all encrypted terms
    enc_D2 = enc_term1 + enc_term2 + enc_const

    return enc_D2  # This represents D^2 scaled by scale**2

In [None]:
# Client-side: decrypt & decide
def client_decrypt_and_check(privkey, enc_D2, radius: float, scale: int = 10**6):
    dec_int = privkey.decrypt(enc_D2)  # integer representing D^2 * scale**2
    D2 = dec_int / (scale * scale)     # float D^2 in original coordinate units

    inside = D2 <= (radius * radius)
    return D2, inside

In [None]:
# Demonstration example
def demo():
    print("=== Geo-fence Paillier demo ===")
    # 1) Key generation (server OR secure device). We'll do local for demo.
    pubkey, privkey = generate_keys(n_length=1024)
    print("Keys generated.")

    # Choose scale (client and server must agree)
    scale = 10**6

    # 2) Client: location
    client_x = 12.9715987    # example: Bangalore lat (just example)
    client_y = 77.5945627    # example lon
    client_bundle = client_encrypt_location(pubkey, client_x, client_y, scale=scale)
    print("Client: encrypted location and squares sent to server.")

    # 3) Server: fence center and radius
    fence_center = (12.9710, 77.5940)   # close point
    radius_mimic = 0.0009               # in same coordinate units (deg) ~ depends on use-case
    enc_D2 = server_compute_encrypted_distance_squared(pubkey, client_bundle, fence_center, scale=scale)
    print("Server: computed encrypted D^2 and returned to client.")

    # 4) Client: decrypt and check
    D2_val, inside = client_decrypt_and_check(privkey, enc_D2, radius_mimic, scale=scale)
    print(f"Decrypted D^2 = {D2_val:.12f}")
    print(f"Radius^2 = {radius_mimic * radius_mimic:.12f}")
    print("User Inside fence?" , inside)

    return {
        "pubkey": pubkey,
        "privkey": privkey,
        "client_bundle": client_bundle,
        "enc_D2": enc_D2,
        "D2": D2_val,
        "inside": inside
    }

if __name__ == "__main__":
    demo()

=== Geo-fence Paillier demo ===
Keys generated.
Client: encrypted location and squares sent to server.
Server: computed encrypted D^2 and returned to client.
Decrypted D^2 = -0.000053663928
Radius^2 = 0.000000810000
User Inside fence? True


# **Anomaly 3: ealth & Wearable Data Leakage in Federated Learning**

In [1]:
!pip install tensorflow



In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [3]:
# Hyperparameters
NUM_CLIENTS = 8
CLIENTS_PER_ROUND = 4
ROUNDS = 30
LOCAL_EPOCHS = 1
BATCH_SIZE = 32
SEQ_LEN = 128
NUM_FEATURES = 3
NUM_CLASSES = 6

CLIPPING_NORM = 1.0          # Gradient clipping bound
NOISE_MULTIPLIER = 1.1       # σ for Gaussian noise
LEARNING_RATE = 0.01

np.random.seed(0)
tf.random.set_seed(0)

In [4]:
# Synthetic data simulation
def generate_synthetic_client_data(num_clients, samples_per_client=500):
    clients_data = []
    for c in range(num_clients):
        X, y = [], []
        for i in range(samples_per_client):
            freq = np.random.uniform(0.1, 1.0) + (c * 0.05)
            t = np.linspace(0, 2 * np.pi, SEQ_LEN)
            signal = np.stack([
                np.sin(freq * t + np.random.randn() * 0.1),
                np.cos(freq * t + np.random.randn() * 0.1),
                0.1 * np.random.randn(SEQ_LEN)
            ], axis=-1)
            X.append(signal + 0.01 * np.random.randn(*signal.shape))
            y.append(int((freq * 3) % NUM_CLASSES))
        clients_data.append((np.array(X, np.float32), np.array(y, np.int32)))
    return clients_data

SAMPLES_PER_CLIENT = 600
clients_datasets = generate_synthetic_client_data(NUM_CLIENTS, SAMPLES_PER_CLIENT)

In [5]:
# Model architecture
def create_model():
    model = keras.Sequential([
        layers.Input(shape=(SEQ_LEN, NUM_FEATURES)),
        layers.Conv1D(32, 5, activation='relu'),
        layers.MaxPool1D(2),
        layers.Conv1D(64, 3, activation='relu'),
        layers.GlobalAveragePooling1D(),
        layers.Dense(64, activation='relu'),
        layers.Dense(NUM_CLASSES, activation='softmax')
    ])
    return model

In [6]:
# DP helper: compute noisy gradients
# ---------------------------
@tf.function
def dp_train_step(model, optimizer, x_batch, y_batch, clipping_norm, noise_stddev):
    with tf.GradientTape() as tape:
        preds = model(x_batch, training=True)
        loss = tf.keras.losses.sparse_categorical_crossentropy(y_batch, preds)
        loss = tf.reduce_mean(loss)
    grads = tape.gradient(loss, model.trainable_variables)

    # Clip gradients to have max L2 norm = clipping_norm
    norm = tf.linalg.global_norm(grads)
    clip_ratio = tf.minimum(1.0, clipping_norm / (norm + 1e-6))
    clipped_grads = [g * clip_ratio for g in grads]

    # Add Gaussian noise for differential privacy
    noisy_grads = [g + tf.random.normal(tf.shape(g), stddev=noise_stddev * clipping_norm)
                   for g in clipped_grads]

    optimizer.apply_gradients(zip(noisy_grads, model.trainable_variables))
    return loss

In [7]:
# Federated averaging helpers
def average_weights(weights_list):
    avg = []
    for weights in zip(*weights_list):
        avg.append(np.mean(np.stack(weights, axis=0), axis=0))
    return avg

def set_model_weights(model, weights):
    model.set_weights(weights)

In [8]:
# Federated training loop
global_model = create_model()
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.SGD(learning_rate=LEARNING_RATE)

for round_num in range(1, ROUNDS + 1):
    print(f"\n--- Round {round_num}/{ROUNDS} ---")
    selected_clients = np.random.choice(NUM_CLIENTS, CLIENTS_PER_ROUND, replace=False)
    client_weights = []
    client_losses = []

    for cid in selected_clients:
        Xc, yc = clients_datasets[cid]

        client_model = create_model()
        set_model_weights(client_model, global_model.get_weights())
        client_optimizer = tf.keras.optimizers.SGD(learning_rate=LEARNING_RATE)

        # local DP training
        ds = tf.data.Dataset.from_tensor_slices((Xc, yc)).shuffle(1024).batch(BATCH_SIZE)
        for epoch in range(LOCAL_EPOCHS):
            for xb, yb in ds:
                dp_loss = dp_train_step(client_model, client_optimizer, xb, yb,
                                        CLIPPING_NORM, NOISE_MULTIPLIER)
        client_weights.append(client_model.get_weights())
        client_losses.append(dp_loss.numpy())

    # Aggregate weights (FedAvg)
    new_global_weights = average_weights(client_weights)
    set_model_weights(global_model, new_global_weights)

    print(f"  Aggregated mean loss: {np.mean(client_losses):.4f}")

    # Evaluate on small combined test set
    X_test, y_test = [], []
    for cid in range(NUM_CLIENTS):
        Xc, yc = clients_datasets[cid]
        idx = np.random.choice(len(Xc), size=20, replace=False)
        X_test.append(Xc[idx])
        y_test.append(yc[idx])
    X_test = np.concatenate(X_test)
    y_test = np.concatenate(y_test)
    preds = np.argmax(global_model.predict(X_test, verbose=0), axis=1)
    acc = np.mean(preds == y_test)
    print(f"  Global model accuracy: {acc:.3f}")

print("\n=== Training complete ===")


--- Round 1/30 ---
  Aggregated mean loss: 1.6964
  Global model accuracy: 0.350

--- Round 2/30 ---
  Aggregated mean loss: 1.5126
  Global model accuracy: 0.312

--- Round 3/30 ---
  Aggregated mean loss: 1.3680
  Global model accuracy: 0.525

--- Round 4/30 ---
  Aggregated mean loss: 1.1374
  Global model accuracy: 0.694

--- Round 5/30 ---
  Aggregated mean loss: 1.0358
  Global model accuracy: 0.681

--- Round 6/30 ---
  Aggregated mean loss: 0.9582
  Global model accuracy: 0.694

--- Round 7/30 ---
  Aggregated mean loss: 1.0526
  Global model accuracy: 0.619

--- Round 8/30 ---
  Aggregated mean loss: 1.2412
  Global model accuracy: 0.694

--- Round 9/30 ---
  Aggregated mean loss: 1.5328
  Global model accuracy: 0.713

--- Round 10/30 ---
  Aggregated mean loss: 1.4569
  Global model accuracy: 0.800

--- Round 11/30 ---
  Aggregated mean loss: 1.1191
  Global model accuracy: 0.769

--- Round 12/30 ---
  Aggregated mean loss: 1.0254
  Global model accuracy: 0.812

--- Round 13