In [2]:
import numpy as np
import pandas as pd
import uuid
import json
from datetime import datetime, timedelta
import random

User_Name = [ 'Aarav Sharma', 'Vivaan Singh', 'Reyansh Mehta', 'Aditya Verma', 'Arjun Yadav',
             'Ishaan Patel', 'Kabir Malhotra', 'Krishna Gupta', 'Atharv Reddy', 'Shaurya Bhat',
              'Aanya Kapoor', 'Anika Jain', 'Kiara Bose', 'Saanvi Iyer', 'Avni Mishra']

def generate_login_attempts(n, contamination):
    data = []
    for _ in range(n):
        login_time = datetime.now() - timedelta(minutes=random.randint(0, 10000))
        session_duration = random.randint(5, 120)  # minutes
        logout_time = login_time + timedelta(minutes=session_duration)

        entry = {
            "attempt_id": str(uuid.uuid4()),
            "User_Name": random.choice(User_Name),
            "login_time": login_time.isoformat(),
            "logout_time": logout_time.isoformat(),
            "session_duration_min": session_duration,
            "success": True,
            "ip_address": f"192.168.{random.randint(0,255)}.{random.randint(0,255)}"
        }
        data.append(entry)

    # Introduce contamination: flip success flag and corrupt session times for contamination% of data
    contam_count = int(n * contamination)
    indices = np.random.choice(range(n), contam_count, replace=False)
    for i in indices:
        data[i]["success"] = not data[i]["success"]
        # Corrupt session: make logout before login or very long session
        if data[i]["success"] == False:
            data[i]["logout_time"] = data[i]["login_time"][:16] + ":00"  # Force same minute
            data[i]["session_duration_min"] = 0
            data[i]["ip_address"] = "203.0.113." + str(random.randint(1,255))

    return data

def generate_file_downloads(n, contamination):
    # 80% chance of selecting .exe file
    file_types = ['exe', 'pdf', 'jpg', 'docx', 'mp4']
    # weights = [0.8, 0.05, 0.05, 0.05, 0.05]  # 80% for exe, others share 20%
    weights = [80, 5, 5, 5, 5]

    data = []

    for _ in range(n):
        file_ext = random.choices(file_types, weights=weights, k=1)[0]  # weighted choice

        # Mark exe files as high priority
        priority = "High" if file_ext == "exe" else "Normal"

        entry = {
            "download_id": str(uuid.uuid4()),
            "user_id": str(uuid.uuid4()),
            "file_name": f"file_{random.randint(1, 1000)}.{file_ext}",
            "timestamp": (datetime.now() - timedelta(minutes=random.randint(0, 10000))).isoformat(),
            "file_size_kb": random.randint(10, 5000),
            "success": True,
            "priority": priority
        }
        data.append(entry)

    # Contamination: mark some downloads as failed
    contam_count = int(n * contamination)
    for i in np.random.choice(range(n), contam_count, replace=False):
        data[i]["success"] = False
        if data[i]["success"] == False:
          data[i]["file_size_kb"] = random.randint(6000,12000)

    return data

def generate_device_endpoints(n, contamination):
    import random
    import uuid
    import numpy as np
    from datetime import datetime, timedelta

    # Allowed Device Types per Brand
    brand_device_map = {
        "HP": ["pc"],
        "Dell": ["pc"],
        "Apple": ["pc", "tablet", "smartphone"],
        "Samsung": ["smartphone", "tablet"]
    }

    # Allowed OSs per Brand
    brand_os_map = {
        "HP": ["Windows", "Linux"],
        "Dell": ["Windows", "Linux"],
        "Apple": ["macOS", "iOS"],
        "Samsung": ["Android"]
    }

    # MAC prefixes per Brand
    mac_prefix = {
        "HP": "00:16:3E",
        "Dell": "00:0F:4B",
        "Apple": "B8:27:EB",
        "Samsung": "F0:1F:AF"
    }

    data = []

    def generate_mac(oui):
        nic = ':'.join(f"{random.randint(0,255):02x}" for _ in range(3))
        return f"{oui}:{nic}"

    for _ in range(n):

        # Select Brand first to avoid wrong OS
        brand = random.choice(list(brand_device_map.keys()))

        # Select device based on allowed devices for the brand
        device_type = random.choice(brand_device_map[brand])

        # Select OS based on the selected brand
        os_choice = random.choice(brand_os_map[brand])

        entry = {
            "Brand": brand,
            "User_Name": np.random.choice(User_Name),
            "device_type": device_type,
            "os": os_choice,
            "mac_address": generate_mac(mac_prefix[brand]),
            "ip_address": f"10.0.{random.randint(0,255)}.{random.randint(0,255)}",
            "last_seen": (datetime.now() - timedelta(days=random.randint(0, 365))).isoformat(),
            "active": True
        }
        data.append(entry)

    # Contamination: set some devices as inactive and spoof MAC addresses
    contam_count = int(n * contamination)
    indices = np.random.choice(range(n), contam_count, replace=False)
    for i in indices:
        data[i]["active"] = False
        if data[i]["active"] == False:
          data[i]["ip_address"] =  "203.0.113." + str(random.randint(1,255))
          data[i]["mac_address"] = "00:00:00:00:00:00"  # Spoofed MAC

    return data

def main():
    print("=== Data Generation Engine ===")
    print("Select the type of data to generate:")
    print("1. Login Attempts (with login/logout times)")
    print("2. File Downloads")
    print("3. Device Endpoints (with MAC addresses)")

    choice = input("\nEnter choice number (1-3): ")
    if choice not in ['1', '2', '3']:
        print("Invalid choice!")
        return

    try:
        count = int(input("Enter number of data entries to generate: "))
        if count <= 0:
            raise ValueError("Count must be positive")

        anomaly = int(input("Enter anomaly percentage (0 to 100): "))
        contamination = anomaly/100
        if contamination < 0 or contamination > 1:
            print("Contamination must be between 0 and 1")
            return
    except ValueError as e:
        print(f"Invalid input: {e}")
        return

    print(f"\nGenerating {count} entries with {contamination:.1%} contamination...")

    if choice == '1':
        data = generate_login_attempts(count, contamination)
        print("\nSample Login Attempts:")
    elif choice == '2':
        data = generate_file_downloads(count, contamination)
        print("\nSample File Downloads:")
    elif choice == '3':
        data = generate_device_endpoints(count, contamination)
        print("\nSample De2vice Endpoints:")

    # Display as DataFrame
    df = pd.DataFrame(data)
    #print(df.head(10).to_string(index=False))
    print(df)
    #print(df.loc[df['active']==False])

    filename = f"{['login_attempts', 'file_downloads', 'device_endpoints'][int(choice)-1]}.json"      # Save to JSON
    with open(filename, 'a') as f:
        json.dump(data, f, indent=2)

    print(f"\nâœ… Data saved to {filename}")
    print(f"ðŸ“Š Total entries: {len(data)}")
    print(f"ðŸ”¥ Contaminated entries: {int(len(data) * contamination)}")

if __name__ == "__main__":
  main()


=== Data Generation Engine ===
Select the type of data to generate:
1. Login Attempts (with login/logout times)
2. File Downloads
3. Device Endpoints (with MAC addresses)

Enter choice number (1-3): 3
Enter number of data entries to generate: 50
Enter anomaly percentage (0 to 100): 10

Generating 50 entries with 10.0% contamination...

Sample De2vice Endpoints:
      Brand       User_Name device_type       os        mac_address  \
0        HP   Reyansh Mehta          pc  Windows  00:16:3E:d6:2d:3b   
1      Dell    Vivaan Singh          pc    Linux  00:0F:4B:27:50:e9   
2     Apple    Aanya Kapoor      tablet    macOS  B8:27:EB:b7:1d:08   
3   Samsung    Ishaan Patel  smartphone  Android  F0:1F:AF:c3:da:c3   
4      Dell      Kiara Bose          pc    Linux  00:0F:4B:93:c6:d0   
5   Samsung   Krishna Gupta  smartphone  Android  F0:1F:AF:2c:06:2b   
6     Apple    Aditya Verma          pc      iOS  B8:27:EB:1c:4f:65   
7        HP   Reyansh Mehta          pc    Linux  00:16:3E:0d:01:f0 

In [None]:
#Module 1st

import numpy as np
import pandas as pd
import uuid
import json
from datetime import datetime, timedelta

# Configuration
NUM_RECORDS = 10
ANOMALY_RATIO = 0.05

np.random.seed(42)

# Time window
end_time = datetime.utcnow()
start_time = end_time - timedelta(days=30)

# Sample pools
users = ['sci_01', 'sci_02', 'admin_01', 'admin_02']
ttys_normal = ['pts/0', 'pts/1', 'tty1']
ttys_anomaly = ['pts/99', 'pts/256']
auth_methods_sop = ['ssh-rsa', 'Kerberos_V5']
auth_methods_anomaly = ['password', 'NTLMv1']
internal_ips = [f"10.0.0.{i}" for i in range(1, 50)]
external_ips = ['203.0.113.5', '198.51.100.7']

# Helper functions
def random_time():
    delta = end_time - start_time
    rand_seconds = np.random.randint(0, int(delta.total_seconds()))
    return (start_time + timedelta(seconds=rand_seconds)).replace(microsecond=0).isoformat() + 'Z'

def generate_record(is_anomaly=False):
    user = np.random.choice(users)
    session_id = str(uuid.uuid4())
    timestamp = random_time()
    hour = int(timestamp[11:13])

    if not is_anomaly:
        tty = np.random.choice(ttys_normal)
        auth = np.random.choice(auth_methods_sop)
        ip = np.random.choice(internal_ips)
        hour = np.random.randint(9, 19)
        timestamp = timestamp[:11] + f"{hour:02}:00:00Z"
        ground_truth = "NORMAL"
    else:
        anomaly_type = np.random.choice(['legacy', 'ghost', 'offhours'])
        if anomaly_type == 'legacy':
            auth = np.random.choice(auth_methods_anomaly)
            tty = np.random.choice(ttys_normal)
            hour = np.random.randint(9, 19)
        elif anomaly_type == 'ghost':
            tty = np.random.choice(ttys_anomaly)
            auth = np.random.choice(auth_methods_sop)
            hour = np.random.randint(9, 19)
        else:  # offhours
            tty = np.random.choice(ttys_normal)
            auth = np.random.choice(auth_methods_sop)
            hour = np.random.choice([2, 3])
        ip = np.random.choice(external_ips)
        timestamp = timestamp[:11] + f"{hour:02}:00:00Z"
        ground_truth = "ANOMALY"

    return {
        "timestamp_iso": timestamp,
        "user_id": user,
        "session_id": session_id,
        "tty_line": tty,
        "auth_protocol": auth,
        "session_pid": np.random.randint(1000, 9999),
        "source_ip": ip,
        "ground_truth": ground_truth
    }

# Generate data
records = []
for i in range(NUM_RECORDS):
    is_anomaly = (i % int(1 / ANOMALY_RATIO) == 0)
    records.append(generate_record(is_anomaly))

# Convert to DataFrame
df = pd.DataFrame(records)

# print(df.tail(20))
# Export to NDJSON
with open("auth_telemetry.json", "w") as f:
    for record in df.to_dict(orient="records"):
        f.write(json.dumps(record) + "\n")

with open("auth_telemetry.json", "r") as f:
  print(f.read())

{"timestamp_iso": "2025-11-25T02:00:00Z", "user_id": "admin_01", "session_id": "c30dc35f-2f50-4aa6-a18b-00ca4224bdaa", "tty_line": "tty1", "auth_protocol": "Kerberos_V5", "session_pid": 6734, "source_ip": "203.0.113.5", "ground_truth": "ANOMALY"}
{"timestamp_iso": "2025-11-17T13:00:00Z", "user_id": "sci_02", "session_id": "1e9d6eef-5c5e-4977-b759-6d38a6bcd329", "tty_line": "tty1", "auth_protocol": "ssh-rsa", "session_pid": 9322, "source_ip": "10.0.0.24", "ground_truth": "NORMAL"}
{"timestamp_iso": "2025-11-08T13:00:00Z", "user_id": "sci_02", "session_id": "a4c2083f-8b5a-4bc6-b13b-5fb796a7edc7", "tty_line": "pts/1", "auth_protocol": "Kerberos_V5", "session_pid": 2184, "source_ip": "10.0.0.2", "ground_truth": "NORMAL"}
{"timestamp_iso": "2025-11-15T17:00:00Z", "user_id": "admin_02", "session_id": "227d6ed5-aeed-4a00-9cca-97529825b1f6", "tty_line": "pts/1", "auth_protocol": "ssh-rsa", "session_pid": 9666, "source_ip": "10.0.0.44", "ground_truth": "NORMAL"}
{"timestamp_iso": "2025-11-21T11

  end_time = datetime.utcnow()


In [None]:
#Module 2 VPN perimeter

import numpy as np
import pandas as pd
import uuid
import json
from datetime import datetime, timedelta

NUM_RECORDS = 10
ANOMALY_RATIO = 0.05
np.random.seed(42)

end_time = datetime.utcnow()
start_time = end_time - timedelta(days=30)

users = ['sci_01', 'sci_02', 'admin_01', 'admin_02']
cities = {
    'Mumbai': (19.0760, 72.8777),
    'Delhi': (28.6139, 77.2090),
    'Bangalore': (12.9716, 77.5946),
    'Hyderabad': (17.3850, 78.4867)
}
anomaly_locations = {
    'Beijing': (39.9042, 116.4074),
    'Moscow': (55.7558, 37.6173),
    'New York': (40.7128, -74.0060),
    'Islamabad': (33.6844, 73.0479)
}
vpn_versions = ['5.2.1', '5.3.0', '5.4.2']

def random_time():
    delta = end_time - start_time
    rand_seconds = np.random.randint(0, int(delta.total_seconds()))
    return (start_time + timedelta(seconds=rand_seconds)).replace(microsecond=0).isoformat() + 'Z'

def generate_record(is_anomaly=False):
    user = np.random.choice(users)
    session_id = str(uuid.uuid4())
    timestamp = random_time()
    client_version = np.random.choice(vpn_versions)

    if not is_anomaly:
        city = np.random.choice(list(cities.keys()))
        lat, lon = cities[city]
        tunnel_mode = 'Full Tunnel'
        device_status = 'device_compliant'
        bytes_sent = np.random.ra-ndint(100000, 500000)
        split_tunnel = False
        ground_truth = "NORMAL"
    else:
        anomaly_type = np.random.choice(['geo', 'split', 'exfil'])
        if anomaly_type == 'geo':
            city = np.random.choice(list(anomaly_locations.keys()))
            lat, lon = anomaly_locations[city]
            tunnel_mode = 'Full Tunnel'
            device_status = 'device_compliant'
            split_tunnel = False
            bytes_sent = np.random.randint(100000, 400000)
        elif anomaly_type == 'split':
            city = np.random.choice(list(cities.keys()))
            lat, lon = cities[city]
            tunnel_mode = 'Split Tunnel'
            split_tunnel = True
            device_status = 'device_compliant'
            bytes_sent = np.random.randint(100000, 400000)
        else:  # exfiltration
            city = np.random.choice(list(cities.keys()))
            lat, lon = cities[city]
            tunnel_mode = 'Full Tunnel'
            split_tunnel = False
            device_status = 'device_compliant'
            bytes_sent = np.random.randint(600000000, 1000000000)
        ground_truth = "ANOMALY"

    return {
        "timestamp_iso": timestamp,
        "user_id": user,
        "session_id": session_id,
        "client_version": client_version,
        "geo_coordinates": f"{lat},{lon}",
        "tunnel_mode": tunnel_mode,
        "device_status": device_status,
        "split_tunnel_detected": split_tunnel,
        "bytes_sent": bytes_sent,
        "ground_truth": ground_truth
    }

records = [generate_record(i % int(1 / ANOMALY_RATIO) == 0) for i in range(NUM_RECORDS)]
df = pd.DataFrame(records)
# print(df)
with open("vpn_perimeter.json", "w") as vpn:
    for record in df.to_dict(orient="records"):
        vpn.write(json.dumps(record) + "\n")

with open("vpn_perimeter.json", "r") as vpn:
  print(vpn.read())


{"timestamp_iso": "2025-11-24T06:43:15Z", "user_id": "admin_01", "session_id": "11e001f7-5407-401c-8abc-a6c3ee11afed", "client_version": "5.4.2", "geo_coordinates": "17.385,78.4867", "tunnel_mode": "Full Tunnel", "device_status": "device_compliant", "split_tunnel_detected": false, "bytes_sent": 733224038, "ground_truth": "ANOMALY"}
{"timestamp_iso": "2025-11-16T15:38:37Z", "user_id": "sci_02", "session_id": "6523eee8-bd07-4cdf-88d2-d147d184d268", "client_version": "5.4.2", "geo_coordinates": "12.9716,77.5946", "tunnel_mode": "Full Tunnel", "device_status": "device_compliant", "split_tunnel_detected": false, "bytes_sent": 474871, "ground_truth": "NORMAL"}
{"timestamp_iso": "2025-11-24T18:44:26Z", "user_id": "sci_01", "session_id": "87bba05a-bcf1-4012-ab03-8c4a8833db4c", "client_version": "5.4.2", "geo_coordinates": "28.6139,77.209", "tunnel_mode": "Full Tunnel", "device_status": "device_compliant", "split_tunnel_detected": false, "bytes_sent": 164820, "ground_truth": "NORMAL"}
{"timesta

  end_time = datetime.utcnow()


In [None]:
#Module 3

import numpy as np
import pandas as pd
import uuid
import json
from datetime import datetime, timedelta

NUM_RECORDS = 10
ANOMALY_RATIO = 0.05
np.random.seed(42)

end_time = datetime.utcnow()
start_time = end_time - timedelta(days=30)

users = ['sci_01', 'sci_02', 'admin_01', 'admin_02']
devices = {
    'sci_01': 'HWID_001',
    'sci_02': 'HWID_002',
    'admin_01': 'HWID_003',
    'admin_02': 'HWID_004'
}
guest_devices = ['BYOD_101', 'GUEST_TAB_202']
resources = ['/home/sci_01/report.docx', '/shared/projectX/data.csv', '/usr/bin/python3']
forbidden = ['/etc/shadow', '/vault/passwords.kdbx', '/confidential/espionage/plan.txt']
processes_normal = ['explorer.exe', 'nautilus', 'code']
processes_anomaly = ['cmd.exe', 'nc']

def random_time():
    delta = end_time - start_time
    rand_seconds = np.random.randint(0, int(delta.total_seconds()))
    return (start_time + timedelta(seconds=rand_seconds)).replace(microsecond=0).isoformat() + 'Z'

def generate_record(is_anomaly=False):
    user = np.random.choice(list(devices.keys()))
    session_id = str(uuid.uuid4())
    timestamp = random_time()

    if not is_anomaly:
        device = devices[user]
        inode = np.random.choice(resources)
        process = np.random.choice(processes_normal)
        ground_truth = "NORMAL"
    else:
        anomaly_type = np.random.choice(['device_mismatch', 'forbidden_inode', 'process_anomaly'])
        if anomaly_type == 'device_mismatch':
            device = np.random.choice(guest_devices)
            inode = np.random.choice(resources)
            process = np.random.choice(processes_normal)
        elif anomaly_type == 'forbidden_inode':
            device = devices[user]
            inode = np.random.choice(forbidden)
            process = np.random.choice(processes_normal)
        else:  # process anomaly
            device = devices[user]
            inode = np.random.choice(resources)
            process = np.random.choice(processes_anomaly)
        ground_truth = "ANOMALY"

    return {
        "timestamp_iso": timestamp,
        "user_id": user,
        "session_id": session_id,
        "device_hwid": device,
        "inode_path": inode,
        "process_parent": process,
        "ground_truth": ground_truth
    }

records = [generate_record(i % int(1 / ANOMALY_RATIO) == 0) for i in range(NUM_RECORDS)]
df = pd.DataFrame(records)

with open("resource_access.json", "w") as Resource:
    for record in df.to_dict(orient="records"):
        Resource.write(json.dumps(record) + "\n")
with open("resource_access.json", "r") as Resource:
  print(Resource.read())

{"timestamp_iso": "2025-11-24T06:42:41Z", "user_id": "admin_01", "session_id": "5a3fa3e2-d2cf-4883-9f37-afa0c95ee57c", "device_hwid": "HWID_003", "inode_path": "/usr/bin/python3", "process_parent": "nc", "ground_truth": "ANOMALY"}
{"timestamp_iso": "2025-11-06T22:54:17Z", "user_id": "sci_01", "session_id": "cc9f18a6-f8d3-421c-a6c3-a362cb061b4b", "device_hwid": "HWID_001", "inode_path": "/usr/bin/python3", "process_parent": "nautilus", "ground_truth": "NORMAL"}
{"timestamp_iso": "2025-11-16T15:38:03Z", "user_id": "admin_01", "session_id": "3c4bd959-c44a-4ffc-9a14-cd13b652a422", "device_hwid": "HWID_003", "inode_path": "/usr/bin/python3", "process_parent": "code", "ground_truth": "NORMAL"}
{"timestamp_iso": "2025-11-24T18:43:52Z", "user_id": "admin_02", "session_id": "f907af27-495d-46a2-9a57-ccce39db2987", "device_hwid": "HWID_004", "inode_path": "/usr/bin/python3", "process_parent": "nautilus", "ground_truth": "NORMAL"}
{"timestamp_iso": "2025-11-07T14:11:18Z", "user_id": "sci_01", "ses

  end_time = datetime.utcnow()


In [None]:
import numpy as np
import pandas as pd
import uuid
import json
from datetime import datetime, timedelta
import random

def generate_login_attempts(n, contamination):
    data = []
    for _ in range(n):
        entry = {
            "attempt_id": str(uuid.uuid4()),
            "user_id": str(uuid.uuid4()),
            "timestamp": (datetime.now() - timedelta(minutes=random.randint(0, 10000))).isoformat(),
            "success": True,
            "ip_address": f"192.168.{random.randint(0,255)}.{random.randint(0,255)}"
        }
        data.append(entry)
    # Introduce contamination: flip success flag for contamination% of data
    contam_count = int(n * contamination)
    for i in np.random.choice(range(n), contam_count, replace=False):
        data[i]["success"] = not data[i]["success"]
    return data

# def generate_file_downloads(n, contamination):
#     file_types = ['pdf', 'jpg', 'exe', 'docx', 'mp4']
#     data = []
#     for _ in range(n):

#         entry = {
#             "download_id": str(uuid.uuid4()),
#             "user_id": str(uuid.uuid4()),
#             "file_name": f"file_{random.randint(1, 1000)}.{random.choice(file_types)}",
#             "timestamp": (datetime.now() - timedelta(minutes=random.randint(0, 10000))).isoformat(),
#             "file_size_kb": random.randint(10, 5000),
#             "success": True
#         }
#         data.append(entry)
#     # Contamination: mark some downloads as failed
#     contam_count = int(n * contamination)
#     for i in np.random.choice(range(n), contam_count, replace=False):
#         data[i]["success"] = False
#     return data

def generate_file_downloads(n, contamination):
    file_types = ['pdf', 'jpg', 'exe', 'docx', 'mp4']
    data = []
    for _ in range(n):

        file_ext = random.choice(file_types)
        priority = "High" if file_ext == "exe" else "Normal"

        entry = {
            "download_id": str(uuid.uuid4()),
            "user_id": str(uuid.uuid4()),
            "file_name": f"file_{random.randint(1, 1000)}.{file_ext}",
            "timestamp": (datetime.now() - timedelta(minutes=random.randint(0, 10000))).isoformat(),
            "file_size_kb": random.randint(10, 5000),
            "success": True,
            "priority": priority
        }
        data.append(entry)

    # Contamination: mark some downloads as failed
    contam_count = int(n * contamination)
    for i in np.random.choice(range(n), contam_count, replace=False):
        data[i]["success"] = False
        if data[i]["success"] == False:
          data[i]["file_size_kb"] = random.randint(6000,12000)

    return data

def generate_device_endpoints(n, contamination):
    device_types = ['pc', 'tablet', 'smartphone']
    os_types = ['Windows', 'iOS', 'Android', 'Linux', 'macOS']
    data = []
    for _ in range(n):
        entry = {
            "device_id": str(uuid.uuid4()),
            "user_id": str(uuid.uuid4()),
            "device_type": random.choice(device_types),
            "os": random.choice(os_types),
            "ip_address": f"10.0.{random.randint(0,255)}.{random.randint(0,255)}",
            "last_seen": (datetime.now() - timedelta(days=random.randint(0, 365))).isoformat(),
            "active": True
        }
        data.append(entry)
    # Contamination: set some devices as inactive incorrectly
    contam_count = int(n * contamination)
    for i in np.random.choice(range(n), contam_count, replace=False):
        data[i]["active"] = False
    return data

def main():
    print("Select the type of data to generate:")
    print("1. Login Attempts")
    print("2. File Downloads")
    print("3. Device Endpoints")

    choice = input("Enter choice number: ")
    if choice not in ['1', '2', '3']:
        print("Invalid choice")
        return

    try:
        count = int(input("Enter the number of data entries to generate: "))
        contamination = float(input("Enter contamination fraction (0 to 1): "))
        if contamination < 0 or contamination > 1:
            print("Contamination must be between 0 and 1")
            return
    except ValueError:
        print("Invalid numeric input")
        return

    if choice == '1':
        data = generate_login_attempts(count, contamination)
    elif choice == '2':
        data = generate_file_downloads(count, contamination)
    elif choice == '3':
        data = generate_device_endpoints(count, contamination)

    df = pd.DataFrame(data)
    print(df)
    # Save to JSON file
    with open('generated_data.json', 'w') as f:
        json.dump(data, f, indent=4)
    print("Data saved to generated_data.json")

if __name__ == "__main__":
    main()


Select the type of data to generate:
1. Login Attempts
2. File Downloads
3. Device Endpoints
Enter choice number: 2
Enter the number of data entries to generate: 10
Enter contamination fraction (0 to 1): .2
                            download_id                               user_id  \
0  d22ac906-f210-4cf3-a9f2-94f77e3ada7c  7d3c14b6-6ce3-4f82-8f74-ac9ab54c3f85   
1  d44553ac-807a-4b25-b45f-7de95095dce2  8bac5008-4656-42e4-a99a-9ef14c7fe541   
2  e286bc7c-d0a0-4d02-ad7d-48e842fdf51e  d73736cc-24ca-485e-94b8-754b18d5bd43   
3  b0dd5445-bc22-41c5-91f6-37b0e034a830  241e1d40-383d-4f72-b6b2-a41343a4fa7b   
4  5b0a2400-6dc9-46cf-8b3a-b0f2ae145925  43547d51-287a-4f1c-82ea-3cb7c9f8cf7e   
5  3d150187-438e-4322-af48-a4f25f9b3182  c80c850e-2025-412c-8288-cecda178dad0   
6  5a1a9cbe-35ca-437e-bbed-e7772b360c9d  8b8d46f2-eae7-46ba-80f7-7571a1c58262   
7  1c5c083f-3c3f-4cb2-bec6-a3dd49e07a52  8387ebb9-cbd1-491c-9c33-afc7a97453be   
8  ae198c88-f1a9-4e2e-b5c0-79246c73ae8c  e51f4e2a-95ec-45a2-8012

In [None]:
import numpy as np
import pandas as pd
import uuid
import json
from datetime import datetime, timedelta
import random

User_Name = [ 'Aarav Sharma', 'Vivaan Singh', 'Reyansh Mehta', 'Aditya Verma', 'Arjun Yadav',
             'Ishaan Patel', 'Kabir Malhotra', 'Krishna Gupta', 'Atharv Reddy', 'Shaurya Bhat',
              'Aanya Kapoor', 'Anika Jain', 'Kiara Bose', 'Saanvi Iyer', 'Avni Mishra']

def generate_login_attempts(n, contamination):
    data = []
    for _ in range(n):
        login_time = datetime.now() - timedelta(minutes=random.randint(0, 10000))
        session_duration = random.randint(5, 120)  # minutes
        logout_time = login_time + timedelta(minutes=session_duration)

        entry = {
            "attempt_id": str(uuid.uuid4()),
            "User_Name": random.choice(User_Name),
            "login_time": login_time.isoformat(),
            "logout_time": logout_time.isoformat(),
            "session_duration_min": session_duration,
            "success": True,
            "ip_address": f"192.168.{random.randint(0,255)}.{random.randint(0,255)}"
        }
        data.append(entry)

    # Introduce contamination: flip success flag and corrupt session times for contamination% of data
    contam_count = int(n * contamination)
    indices = np.random.choice(range(n), contam_count, replace=False)
    for i in indices:
        data[i]["success"] = not data[i]["success"]
        # Corrupt session: make logout before login or very long session
        if data[i]["success"] == False:
            data[i]["logout_time"] = data[i]["login_time"][:16] + ":00"  # Force same minute
            data[i]["session_duration_min"] = 0
            data[i]["ip_address"] = "203.0.113." + str(random.randint(1,255))

    return data

def generate_file_downloads(n, contamination):
    file_types = ['pdf', 'jpg', 'exe', 'docx', 'mp4']
    data = []
    for _ in range(n):

        file_ext = random.choice(file_types)
        priority = "High" if file_ext == "exe" else "Normal"

        entry = {
            "download_id": str(uuid.uuid4()),
            "User_Name": random.choice(User_Name),
            "file_name": f"file_{random.randint(1, 1000)}.{file_ext}",
            "timestamp": (datetime.now() - timedelta(minutes=random.randint(0, 10000))).isoformat(),
            "file_size_kb": random.randint(10, 5000),
            "success": True,
            "priority": priority
        }
        data.append(entry)

    # Contamination: mark some downloads as failed
    contam_count = int(n * contamination)
    for i in np.random.choice(range(n), contam_count, replace=False):
        data[i]["success"] = False
        if data[i]["success"] == False:
          data[i]["file_size_kb"] = random.randint(6000,12000)

    return data

def generate_device_endpoints(n, contamination):
     mac ={
         "HP": '00:16:3E',
         "Dell" :'00:0F:4B',
         "Apple" : 'B8:27:EB',
         "Samsung" : 'F0:1F:AF'
     }
     device_types = ['pc', 'tablet', 'smartphone']
     os_types = ['Windows', 'iOS', 'Android', 'Linux', 'macOS']
     data = []
     #ignore the last three hexadecimal positions in mac address
     def generate_real_mac(oui):
            nic = ':'.join(['%02x' % random.randint(0, 255) for _ in range(3)])
            return oui + ':' + nic

     for _ in range(n):
       brand, oui = random.choice(list(mac.items()))
       entry = {
            "Brand" : brand,
            "User_Name": np.random.choice(User_Name),
            "device_type": random.choice(device_types),
            #"os": random.choice(os_types)
            "mac_address": generate_real_mac(oui),
            "ip_address": f"10.0.{random.randint(0,255)}.{random.randint(0,255)}",
            "last_seen": (datetime.now() - timedelta(days=random.randint(0, 365))).isoformat(),
            "active": True
       }
       data.append(entry)

     # Contamination: set some devices as inactive and spoof MAC addresses
     contam_count = int(n * contamination)
     indices = np.random.choice(range(n), contam_count, replace=False)
     for i in indices:
        data[i]["active"] = False
        data[i]["ip_address"] =  "203.0.113." + str(random.randint(1,255))
        data[i]["mac_address"] = "00:00:00:00:00:00"  # Spoofed MAC

     return data

def main():
    print("=== Data Generation Engine ===")
    print("Select the type of data to generate:")
    print("1. Login Attempts (with login/logout times)")
    print("2. File Downloads")
    print("3. Device Endpoints (with MAC addresses)")

    choice = input("\nEnter choice number (1-3): ")
    if choice not in ['1', '2', '3']:
        print("Invalid choice!")
        return

    try:
        count = int(input("Enter number of data entries to generate: "))
        if count <= 0:
            raise ValueError("Count must be positive")

        contamination = float(input("Enter contamination fraction (0.0 to 1.0): "))
        if contamination < 0 or contamination > 1:
            print("Contamination must be between 0 and 1")
            return
    except ValueError as e:
        print(f"Invalid input: {e}")
        return

    print(f"\nGenerating {count} entries with {contamination:.1%} contamination...")

    if choice == '1':
        data = generate_login_attempts(count, contamination)
        print("\nSample Login Attempts:")
    elif choice == '2':
        data = generate_file_downloads(count, contamination)
        print("\nSample File Downloads:")
    elif choice == '3':
        data = generate_device_endpoints(count, contamination)
        print("\nSample Device Endpoints:")

    # Display as DataFrame
    df = pd.DataFrame(data)
    #print(df.head(10).to_string(index=False))
    print(df)
    #print(df.loc[df['active']==False])
    # Save to JSON
    filename = f"{['login_attempts', 'file_downloads', 'device_endpoints'][int(choice)-1]}.json"
    with open(filename, 'a') as f:
        json.dump(data, f, indent=2)

    print(f"\nâœ… Data saved to {filename}")
    print(f"ðŸ“Š Total entries: {len(data)}")
    print(f"ðŸ”¥ Contaminated entries: {int(len(data) * contamination)}")

if __name__ == "__main__":
  main()


=== Data Generation Engine ===
Select the type of data to generate:
1. Login Attempts (with login/logout times)
2. File Downloads
3. Device Endpoints (with MAC addresses)

Enter choice number (1-3): 3
Enter number of data entries to generate: 20
Enter contamination fraction (0.0 to 1.0): .2

Generating 20 entries with 20.0% contamination...

Sample Device Endpoints:
      Brand       User_Name device_type        mac_address     ip_address  \
0        HP     Arjun Yadav      tablet  00:16:3E:ff:af:eb     10.0.79.44   
1      Dell    Atharv Reddy      tablet  00:0F:4B:21:8b:13   10.0.233.137   
2     Apple      Kiara Bose          pc  B8:27:EB:b8:ff:44    10.0.129.11   
3   Samsung  Kabir Malhotra          pc  F0:1F:AF:90:df:b9   10.0.185.176   
4   Samsung    Ishaan Patel  smartphone  00:00:00:00:00:00   203.0.113.91   
5   Samsung    Aarav Sharma      tablet  F0:1F:AF:e3:ed:0c   10.0.248.161   
6        HP    Ishaan Patel      tablet  00:16:3E:59:17:3d     10.0.210.6   
7   Samsung    

In [None]:

def generate_device_endpoints(n, contamination):
     mac ={
         "HP": '00:16:3E',
         "Dell" :'00:0F:4B',
         "Apple" : 'B8:27:EB',
         "Samsung" : 'F0:1F:AF'
     }
     device_types = ['pc', 'tablet', 'smartphone']
     os_types = ['Windows', 'iOS', 'Android', 'Linux', 'macOS']
     data = []
     #ignore the last three hexadecimal positions in mac address
     def generate_real_mac(oui):
            nic = ':'.join(['%02x' % random.randint(0, 255) for _ in range(3)])
            return oui + ':' + nic

     for _ in range(n):
       brand, oui = random.choice(list(mac.items()))
       entry = {
            "Brand" : brand,
            "User_Name": np.random.choice(User_Name),
            "device_type": random.choice(device_types),
            #"os": random.choice(os_types)
            "mac_address": generate_real_mac(oui),
            "ip_address": f"10.0.{random.randint(0,255)}.{random.randint(0,255)}",
            "last_seen": (datetime.now() - timedelta(days=random.randint(0, 365))).isoformat(),
            "active": True
       }
       data.append(entry)
     return data

     df=pd.DataFrame(data)
     print(df)


print(generate_device_endpoints(10,.2))



[{'Brand': 'HP', 'User_Name': np.str_('Saanvi Iyer'), 'device_type': 'smartphone', 'mac_address': '00:16:3E:88:40:77', 'ip_address': '10.0.49.244', 'last_seen': '2024-12-11T17:18:25.616567', 'active': True}, {'Brand': 'Samsung', 'User_Name': np.str_('Atharv Reddy'), 'device_type': 'pc', 'mac_address': 'F0:1F:AF:cd:d4:fd', 'ip_address': '10.0.52.42', 'last_seen': '2025-08-20T17:18:25.616666', 'active': True}, {'Brand': 'Samsung', 'User_Name': np.str_('Kiara Bose'), 'device_type': 'smartphone', 'mac_address': 'F0:1F:AF:78:48:97', 'ip_address': '10.0.228.162', 'last_seen': '2025-01-03T17:18:25.616730', 'active': True}, {'Brand': 'HP', 'User_Name': np.str_('Kiara Bose'), 'device_type': 'tablet', 'mac_address': '00:16:3E:43:48:31', 'ip_address': '10.0.68.166', 'last_seen': '2025-02-25T17:18:25.616789', 'active': True}, {'Brand': 'Dell', 'User_Name': np.str_('Shaurya Bhat'), 'device_type': 'pc', 'mac_address': '00:0F:4B:d5:21:1a', 'ip_address': '10.0.188.152', 'last_seen': '2025-09-24T17:18: