In [None]:
! pip install tensorflow

In [None]:
# important libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest # Another popular anomaly detection algo
from sklearn.svm import OneClassSVM
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, RepeatVector, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping
import random

In [None]:
# synthetic data creation
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import random
import math # For haversine distance
from datetime import datetime, timedelta

# --- Constants for GPS simulation ---
# Reference speeds used by the synthetic GPS track; every value is in km/h.
AVG_WALKING_SPEED = 5 # km/h; base movement speed of the normal profile
AVG_CAR_SPEED_LOCAL = 40 # km/h; used for the occasional "travel" step of a normal user
AVG_CAR_SPEED_HIGHWAY = 90 # km/h; the fraud profile uses twice this value as its base speed
# The two thresholds below are not referenced anywhere in the visible part of this notebook.
MAX_PLAUSIBLE_SPEED = 200 # km/h; ceiling for an instantaneous speed check on a user who is not in a vehicle
LOCATION_JUMP_THRESHOLD_KM = 500 # km; distance treated as an impossible jump (e.g., Jalandhar to London in a second)

# --- Utility Function: Haversine distance for calculating distance between two lat/lon points ---
def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    The haversine formula is evaluated on a sphere of radius 6371 km.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.

    Returns:
        The distance between the two points in kilometres.
    """
    R = 6371 # Earth radius in kilometers

    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad

    a = math.sin(dlat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2)**2
    # Floating-point rounding can leave `a` a hair outside [0, 1] for
    # (near-)antipodal points; sqrt(1 - a) would then raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return R * c

# --- Behavioral Data Collection & Feature Extraction (Comprehensive Simulation) ---

def simulate_user_behavior_data(num_samples=100, user_id="user_A", is_fraudulent=False,
                                 start_lat=31.3260, start_lon=75.5762, # Jalandhar, Punjab, India
                                 seed=None):
    """
    Simulates comprehensive behavioral data points for a user.
    Includes typing rhythm, detailed swipe/tap, device movement, app usage, and enhanced GPS/context.

    Every sample is drawn from one of two hard-coded profiles (normal or
    fraudulent). Timestamps start at a fixed instant (2025-06-18 11:00:00) and
    advance by a random 1-5 seconds per sample; the simulated position moves
    for exactly that elapsed time, so the recorded `instant_speed_kmh` agrees
    with the simulated speed.

    Args:
        num_samples: Number of records to generate.
        user_id: Identifier written into the `user_id` column of every record.
        is_fraudulent: If True, sample from the fraud profile and randomly
            inject impossible GPS jumps and anomalous IP/network values.
        start_lat: Starting latitude in degrees.
        start_lon: Starting longitude in degrees.
        seed: Optional seed. When given, a private generator is used so the
            output is reproducible; when None, the shared `random` module
            state is used.

    Returns:
        A pandas DataFrame with one row per sample. The `is_fraud` column
        holds the value of `is_fraudulent`.
    """
    # A private generator keeps seeded runs reproducible without disturbing the
    # module-level random state; unseeded calls keep using the shared state.
    rng = random.Random(seed) if seed is not None else random

    data = []
    current_lat, current_lon = start_lat, start_lon
    prev_lat, prev_lon = start_lat, start_lon
    current_time = datetime(2025, 6, 18, 11, 0, 0) # Fixed start time for consistent time_of_day_anomaly simulation

    # Base profiles for normal vs. fraudulent behavior
    normal_profile = {
        'typing_speed_wpm': (60, 5), # mean, std
        'key_press_duration_ms': (80, 15),
        'time_between_keys_ms': (150, 30),
        'swipe_pressure_norm': (0.7, 0.1),
        'tap_duration_ms': (100, 20),
        'tap_count_per_min': (30, 8),
        'swipe_direction_angle_degrees': (180, 90), # Averages around 180 (downward swipe) with variance
        'device_angle_x_deg': (5, 2), # held mostly upright
        'accel_std_xyz': (0.05, 0.02), # low movement
        'gyro_z_avg_dps': (0.1, 0.05), # low rotation
        'nav_flow_speed_spm': (2.5, 0.5), # screens per minute
        'screen_time_seconds': (45, 15), # seconds on a screen
        'transaction_amount': (500, 100),
        'gps_speed_kmh_base': (AVG_WALKING_SPEED, 1), # Base for typical movement
        'ip_segment_base': '192.168.', # Local/stable IP segment
        'network_type_dist': ['WiFi'] * 8 + ['MobileData'] * 2 # Mostly WiFi
    }

    fraud_profile = {
        'typing_speed_wpm': (90, 10), # faster, less natural
        'key_press_duration_ms': (60, 10), # shorter, more uniform
        'time_between_keys_ms': (100, 20), # faster
        'swipe_pressure_norm': (0.9, 0.15), # harder or more erratic
        'tap_duration_ms': (70, 15), # quicker taps
        'tap_count_per_min': (60, 15), # faster tapping
        'swipe_direction_angle_degrees': (90, 120), # More random directions or specific to bot
        'device_angle_x_deg': (45, 15), # device held differently (e.g., on a desk, or remote)
        'accel_std_xyz_static': (0.005, 0.001), # very static (remote access)
        'accel_std_xyz_erratic': (0.5, 0.2), # very erratic (attacker fidgeting, or automated)
        'gyro_z_avg_dps': (1.0, 0.5), # more rotation (attacker adjusting)
        'nav_flow_speed_spm': (6.0, 1.0), # rapid navigation
        'screen_time_seconds': (15, 5), # very short screen times
        'transaction_amount': (5000, 2000), # larger, unusual transactions
        'gps_speed_kmh_base': (AVG_CAR_SPEED_HIGHWAY * 2, 50), # Implausible speed for non-travel
        'ip_segment_base': '104.23.', # VPN/proxy IP segment
        'network_type_dist': ['MobileData'] * 7 + ['VPN'] * 2 + ['Proxy'] * 1 # More mobile data/unstable connection
    }

    profile = fraud_profile if is_fraudulent else normal_profile

    def _sample_accel_std():
        """Draw one per-axis accelerometer standard deviation for the active profile."""
        if not is_fraudulent:
            value = rng.gauss(*profile['accel_std_xyz'])
        elif rng.random() < 0.5:
            # Fraud sessions are either almost motionless ...
            value = rng.gauss(*profile['accel_std_xyz_static'])
        else:
            # ... or unusually erratic.
            value = rng.gauss(*profile['accel_std_xyz_erratic'])
        # A standard deviation cannot be negative; clamp the Gaussian tail.
        return max(0.0, value)

    for i in range(num_samples):
        # --- Timestamps ---
        # Simulate time passing, usually 1-5 seconds between data points
        prev_time = current_time
        current_time = current_time + timedelta(seconds=rng.uniform(1, 5))
        timestamp = current_time
        # Derived from the stored timestamps so speed matches the recorded data.
        elapsed_seconds = (timestamp - prev_time).total_seconds()

        # --- Typing Features ---
        typing_speed = max(1, rng.gauss(*profile['typing_speed_wpm']))
        key_press_duration = max(1, rng.gauss(*profile['key_press_duration_ms']))
        time_between_keys = max(1, rng.gauss(*profile['time_between_keys_ms']))

        # --- Swipe/Tap Features ---
        swipe_pressure = max(0.01, min(1.0, rng.gauss(*profile['swipe_pressure_norm']))) # Normalize 0-1
        tap_duration = max(1, rng.gauss(*profile['tap_duration_ms']))
        tap_count_per_min = max(0, rng.gauss(*profile['tap_count_per_min']))
        # Sample around the profile's typical direction and wrap into [0, 360).
        swipe_direction_angle = rng.gauss(*profile['swipe_direction_angle_degrees']) % 360

        # --- Device Orientation and Movement (Accelerometer/Gyroscope) ---
        device_angle_x = max(0, min(90, rng.gauss(*profile['device_angle_x_deg']))) # Angle between 0 and 90
        accel_x_std = _sample_accel_std()
        accel_y_std = _sample_accel_std()
        accel_z_std = _sample_accel_std()
        gyro_z_avg = rng.gauss(*profile['gyro_z_avg_dps'])

        # --- App Usage Patterns ---
        nav_flow_speed = max(0.1, rng.gauss(*profile['nav_flow_speed_spm']))
        screen_time_seconds = max(1, rng.gauss(*profile['screen_time_seconds']))

        # --- Transaction Data ---
        transaction_amount = max(1, rng.gauss(*profile['transaction_amount']))

        # --- GPS and Location ---
        simulated_speed_mps = rng.gauss(*profile['gps_speed_kmh_base']) / 3.6 # Convert to m/s for movement calc
        simulated_bearing = rng.uniform(0, 360) # degrees

        # Simulate normal movement patterns (e.g., travel)
        if not is_fraudulent and i % 50 == 0 and rng.random() < 0.6: # Simulate occasional "travel"
            simulated_speed_mps = rng.gauss(AVG_CAR_SPEED_LOCAL / 3.6, 2) # m/s
            simulated_bearing = rng.uniform(0, 360)

        # Distance covered during this step: speed multiplied by the time that
        # actually elapsed, not a fixed one-second hop.
        step_m = simulated_speed_mps * elapsed_seconds
        bearing_rad = math.radians(simulated_bearing)
        delta_lat = (step_m / 111139) * math.cos(bearing_rad) # rough conversion: ~111139 m per degree
        delta_lon = (step_m / (111139 * math.cos(math.radians(current_lat)))) * math.sin(bearing_rad)

        current_lat += delta_lat
        current_lon += delta_lon

        dist_moved_km = haversine_distance(prev_lat, prev_lon, current_lat, current_lon)
        instant_speed_kmh = (dist_moved_km / elapsed_seconds) * 3600 if elapsed_seconds > 0 else 0

        # --- Contextual Data ---
        # Time of Day Anomaly: e.g., activity outside typical hours (simulated based on timestamp)
        hour = timestamp.hour
        time_of_day_anomaly = 1 if (hour < 6 or hour >= 22) else 0 # 10 PM to 6 AM considered anomalous

        # IP Address (conceptual, not real IPs)
        ip_address = profile['ip_segment_base'] + str(rng.randint(1, 254)) + '.' + str(rng.randint(1, 254))
        # Network Type
        network_type = rng.choice(profile['network_type_dist'])

        # --- Introduce specific fraud anomalies if requested ---
        if is_fraudulent:
            if rng.random() < 0.3: # Chance of impossible GPS jump
                current_lat = rng.uniform(-90, 90)
                current_lon = rng.uniform(-180, 180)
                instant_speed_kmh = rng.gauss(5000, 1000) # Impossible speed
            if rng.random() < 0.2: # Chance of IP mismatch or VPN/Proxy use
                ip_address = rng.choice(['203.0.113.', '198.51.100.']) + str(rng.randint(1, 254)) + '.' + str(rng.randint(1, 254)) # Different segment
                network_type = rng.choice(['VPN', 'Proxy']) # Explicitly mark anomalous network

        record = {
            'user_id': user_id,
            'timestamp': timestamp,
            # Typing
            'typing_speed_wpm': typing_speed,
            'key_press_duration_ms': key_press_duration,
            'time_between_keys_ms': time_between_keys,
            # Swipe/Tap
            'swipe_pressure_norm': swipe_pressure,
            'tap_duration_ms': tap_duration,
            'tap_count_per_min': tap_count_per_min,
            'swipe_direction_angle_degrees': swipe_direction_angle,
            # Device Orientation/Movement
            'device_angle_x_deg': device_angle_x,
            'accel_x_std': accel_x_std,
            'accel_y_std': accel_y_std,
            'accel_z_std': accel_z_std,
            'gyro_z_avg_dps': gyro_z_avg,
            # App Usage
            'nav_flow_speed_spm': nav_flow_speed,
            'screen_time_seconds': screen_time_seconds,
            # Transaction (could be part of app usage but often distinct)
            'transaction_amount': transaction_amount,
            # GPS/Location
            'current_lat': current_lat,
            'current_lon': current_lon,
            'instant_speed_kmh': instant_speed_kmh,
            'simulated_bearing': simulated_bearing,
            # Contextual (Derived/Network)
            'time_of_day_anomaly': time_of_day_anomaly,
            'ip_address_segment': '.'.join(ip_address.split('.')[:2]), # Use segment (first two octets) for analysis
            'network_type': network_type,
            'is_fraud': is_fraudulent # For simulation and evaluation
        }
        data.append(record)
        prev_lat, prev_lon = current_lat, current_lon

    return pd.DataFrame(data)

# --- Build the simulation dataset ---
# Two legitimate users with different starting points:
# user_A begins in Jalandhar, user_B begins in Delhi.
normal_user_data_A = simulate_user_behavior_data(
    num_samples=500, user_id="user_A", is_fraudulent=False,
    start_lat=31.3260, start_lon=75.5762,
)
normal_user_data_B = simulate_user_behavior_data(
    num_samples=500, user_id="user_B", is_fraudulent=False,
    start_lat=28.7041, start_lon=77.1025,
)

# A short fraudulent session recorded under user_A's identity and start location.
fraud_user_data_A = simulate_user_behavior_data(
    num_samples=50, user_id="user_A", is_fraudulent=True,
    start_lat=31.3260, start_lon=75.5762,
)

# Stack the three frames into a single dataset with a fresh 0..n-1 index.
simulated_data = pd.concat(
    [normal_user_data_A, normal_user_data_B, fraud_user_data_A],
    ignore_index=True,
)
print("Simulated Data Sample:")
print(simulated_data.head().T) # One column per sample reads better with this many features
print(f"\nTotal samples: {simulated_data.shape[0]}")
print(f"Fraudulent samples: {simulated_data['is_fraud'].sum()}")

# --- Feature groups for the ML models ---
# Numeric behavioral signals that are fed to the models.
behavioral_features = [
    # Typing
    'typing_speed_wpm', 'key_press_duration_ms', 'time_between_keys_ms',
    # Swipe / tap
    'swipe_pressure_norm', 'tap_duration_ms', 'tap_count_per_min',
    'swipe_direction_angle_degrees',
    # Device orientation and movement
    'device_angle_x_deg', 'accel_x_std', 'accel_y_std', 'accel_z_std',
    'gyro_z_avg_dps',
    # App usage and transaction
    'nav_flow_speed_spm', 'screen_time_seconds', 'transaction_amount',
    # GPS
    'instant_speed_kmh', 'simulated_bearing',
]
# Context signals; these can be used separately or encoded before modelling.
contextual_features = ['time_of_day_anomaly', 'ip_address_segment', 'network_type']

# Standardise the behavioral columns only. The scaler is fitted on user_A's
# normal sessions here; in a real deployment it would be fitted on a large,
# representative sample of normal behavior before any transform is applied.
scaler = StandardScaler()
scaled_normal_data_A = scaler.fit_transform(normal_user_data_A[behavioral_features])

print("\nShape of scaled behavioral features for User A (normal):", scaled_normal_data_A.shape)
print("First 5 rows of scaled behavioral features (User A):")

# Kept as the cell's final expression so the notebook renders the table.
pd.DataFrame(data=scaled_normal_data_A, columns=behavioral_features).head(5)

In [None]:
# Show five randomly selected rows of the combined dataset as a quick sanity check.
simulated_data.sample(5)

In [None]:
# Print the combined dataset's summary (columns, dtypes, non-null counts).
simulated_data.info()

In [None]:
# Next steps:
# - Apply hard-coded rules while the dataset is still small.
# - Use GPT to check every aspect of each input.
# - Write suitable, dynamic code for updating the threshold.
