In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense
from geopy.distance import geodesic
import joblib
import warnings
from tqdm.auto import tqdm
from tqdm.keras import TqdmCallback

# Initialize tqdm for Pandas
tqdm.pandas()
warnings.filterwarnings('ignore')

import os
path=os.path.abspath('user_logins.csv')
# ==========================================
# PART 1: FEATURE ENGINEERING
# ==========================================
print("1. Loading and processing data...")
df = pd.read_csv(path) 

# Convert timestamp
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values(by=['user_id', 'timestamp']).reset_index(drop=True)

# A. Time Diff
df['prev_time'] = df.groupby('user_id')['timestamp'].shift(1)
df['time_diff_hours'] = (df['timestamp'] - df['prev_time']).dt.total_seconds() / 3600
df['time_diff_hours'] = df['time_diff_hours'].fillna(0)

# B. Distance & Velocity
print("   Calculating distances...")
df['prev_lat'] = df.groupby('user_id')['lat'].shift(1)
df['prev_lon'] = df.groupby('user_id')['lon'].shift(1)

def get_geo_dist(row):
    if pd.isna(row['prev_lat']): return 0.0
    try:
        return geodesic((row['prev_lat'], row['prev_lon']), (row['lat'], row['lon'])).km
    except:
        return 0.0

df['dist_km'] = df.progress_apply(get_geo_dist, axis=1)
df['velocity_kmh'] = df['dist_km'] / (df['time_diff_hours'] + 0.1)

# C. Device Frequency
device_counts = df.groupby(['user_id', 'device']).size().reset_index(name='count')
total_counts = df.groupby('user_id').size().reset_index(name='total')
device_stats = pd.merge(device_counts, total_counts, on='user_id')
device_stats['device_trust_score'] = device_stats['count'] / device_stats['total']

df = pd.merge(df, device_stats[['user_id', 'device', 'device_trust_score']], 
              on=['user_id', 'device'], how='left')

df['hour_of_day'] = df['timestamp'].dt.hour

# ==========================================
# PART 2: MODEL TRAINING
# ==========================================
features = ['velocity_kmh', 'time_diff_hours', 'device_trust_score', 'hour_of_day']
X = df[features]
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test = train_test_split(X_scaled, test_size=0.2, random_state=42)

# --- ISOLATION FOREST ---
print("\n2. Training Isolation Forest...")
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(X_train)
joblib.dump(iso_forest, 'model_isolation_forest.pkl')
joblib.dump(scaler, 'scaler.pkl')

# --- AUTOENCODER ---
print("3. Training Autoencoder...")
input_dim = X_train.shape[1]
autoencoder = Sequential([
    Input(shape=(input_dim,)),
    Dense(8, activation='relu'),
    Dense(4, activation='relu'),
    Dense(2, activation='relu'),
    Dense(4, activation='relu'),
    Dense(8, activation='relu'),
    Dense(input_dim, activation='sigmoid')
])
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train, X_train, epochs=10, batch_size=32, verbose=0, callbacks=[TqdmCallback(verbose=0)])
autoencoder.save('model_autoencoder.h5')

# Calc Threshold
reconstructions = autoencoder.predict(X_test)
mse = np.mean(np.power(X_test - reconstructions, 2), axis=1)
threshold = np.percentile(mse, 95)
print(f"   Autoencoder Threshold: {threshold:.5f}")

# ==========================================
# PART 3: VALIDATION (FIXED LABELS)
# ==========================================
print("\n4. Validating System...")

df['iso_pred'] = iso_forest.predict(X_scaled) 
df['ae_loss'] = np.mean(np.power(X_scaled - autoencoder.predict(X_scaled), 2), axis=1)

# --- FIX: Updated strings to match your new data generator ---
travel_attacks = df[df['attack_type'] == 'Impossible Travel']
caught_iso = travel_attacks[travel_attacks['iso_pred'] == -1]
caught_ae = travel_attacks[travel_attacks['ae_loss'] > threshold]

print(f"--- IMPOSSIBLE TRAVEL ATTACKS ---")
print(f"Total Injected: {len(travel_attacks)}")
print(f"Caught by iForest: {len(caught_iso)}")
print(f"Caught by Autoencoder: {len(caught_ae)}")

# --- FIX: Updated strings to match your new data generator ---
device_attacks = df[df['attack_type'] == 'Device Spoofing']
caught_iso_dev = device_attacks[device_attacks['iso_pred'] == -1]
caught_ae_dev = device_attacks[device_attacks['ae_loss'] > threshold]

print(f"\n--- DEVICE SPOOFING ATTACKS ---")
print(f"Total Injected: {len(device_attacks)}")
print(f"Caught by iForest: {len(caught_iso_dev)}")
print(f"Caught by Autoencoder: {len(caught_ae_dev)}")

print("\n✅ Phase 1 Complete. Models saved.")

  from .autonotebook import tqdm as notebook_tqdm


1. Loading and processing data...
   Calculating distances...


100%|██████████| 20000/20000 [00:04<00:00, 4163.23it/s]



2. Training Isolation Forest...
3. Training Autoencoder...


100%|██████████| 10/10 [00:16<00:00,  1.63s/epoch, loss=0.00628]


   Autoencoder Threshold: 0.01902

4. Validating System...
--- IMPOSSIBLE TRAVEL ATTACKS ---
Total Injected: 524
Caught by iForest: 340
Caught by Autoencoder: 244

--- DEVICE SPOOFING ATTACKS ---
Total Injected: 725
Caught by iForest: 373
Caught by Autoencoder: 438

✅ Phase 1 Complete. Models saved.
