In [2]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random

# Seed BOTH random sources for reproducibility: NumPy drives the numeric
# features, while the stdlib `random` module drives the generated IP addresses.
# Seeding only NumPy would leave the IP column different on every run.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

def generate_simple_ip(is_legitimate=True):
    """Return a random dotted-quad IPv4 string for a synthetic login record.

    Five candidate addresses are built (one per prefix) and one of them is
    returned via ``random.choice``.

    Args:
        is_legitimate: Truthy selects the "legitimate" prefixes
            (192.168.x.x, 10.x.x.x, 172.16-31.x.x, 73.x.x.x, 98.x.x.x);
            falsy selects the "suspicious" prefixes (185, 5, 45, 51, 138).

    Returns:
        str: An address whose last octet is in 1-254 and whose other random
        octets are in 0-255 (16-31 for the second octet of the 172 prefix).

    Note:
        All draws come from the stdlib ``random`` module, so seeding NumPy
        alone does not make the returned addresses repeatable.
    """
    def dotted(prefix, free_octets):
        # `free_octets` unrestricted octets (0-255) followed by one host octet (1-254).
        tail = [str(random.randint(0, 255)) for _ in range(free_octets)]
        tail.append(str(random.randint(1, 254)))
        return ".".join([prefix, *tail])

    # Every candidate is built eagerly and in a fixed order, so the sequence of
    # random draws is the same regardless of which candidate ends up chosen.
    if not is_legitimate:
        # Suspicious IP patterns
        candidates = [dotted(first, 2) for first in ("185", "5", "45", "51", "138")]
    else:
        # Legitimate IP patterns
        candidates = [
            dotted("192.168", 1),
            dotted("10", 2),
            dotted(f"172.{random.randint(16, 31)}", 1),
            dotted("73", 2),
            dotted("98", 2),
        ]

    return random.choice(candidates)

def generate_credential_stuffing_data(n, reference_time=None):
    """Generate ``n`` synthetic login-activity records as a list of dicts.

    Each record is independently labelled as credential stuffing with
    probability 0.2. The two classes are drawn from different ranges:

    * Stuffing: 10-99 attempts, of which at most 5 succeed; timestamp within
      the 24 hours before ``reference_time``; request size 1500-4999 bytes;
      requests per minute ~ Poisson(20).
    * Legitimate: 1-5 attempts with at most 2 failures (so at least one
      success); timestamp within the 90 days before ``reference_time``;
      request size 200-1499 bytes; requests per minute ~ Poisson(2).

    Args:
        n: Number of records to generate. ``0`` yields an empty list.
        reference_time: ``datetime`` that every timestamp is measured back
            from. Defaults to ``datetime.now()``, read once per call so all
            records share the same anchor.

    Returns:
        list[dict]: One dict per record with keys ``timestamp``,
        ``ip_address``, ``login_success``, ``attempts_from_ip``,
        ``failed_attempts``, ``login_duration_seconds``,
        ``request_size_bytes``, ``requests_per_minute`` and
        ``is_credential_stuffing``. Numeric values are built-in ``int`` /
        ``float``. ``login_success`` is the COUNT of successful attempts
        (``attempts_from_ip - failed_attempts``), not a boolean flag.

    Note:
        Numeric features use NumPy's global RNG; IP addresses come from
        ``generate_simple_ip``.
    """
    # Anchor all timestamps to a single instant; calling datetime.now() per
    # record makes the data drift with generation time and impossible to pin.
    if reference_time is None:
        reference_time = datetime.now()

    data = []

    for _ in range(n):
        # Determine if this is a credential stuffing attempt (20% of records)
        is_stuffing = int(np.random.choice([0, 1], p=[0.8, 0.2]))

        if is_stuffing:
            # Credential stuffing pattern: many attempts, almost all failing
            ip_address = generate_simple_ip(is_legitimate=False)
            attempts_from_ip = int(np.random.randint(10, 100))
            # Upper bound is exclusive, so failures never exceed attempts.
            failed_attempts = int(
                np.random.randint(attempts_from_ip - 5, attempts_from_ip + 1)
            )
            timestamp = reference_time - timedelta(
                minutes=int(np.random.randint(0, 1440))
            )
        else:
            # Legitimate login pattern: few attempts, at least one success
            ip_address = generate_simple_ip(is_legitimate=True)
            attempts_from_ip = int(np.random.randint(1, 6))
            failed_attempts = int(np.random.randint(0, min(3, attempts_from_ip)))
            timestamp = reference_time - timedelta(
                days=int(np.random.randint(0, 90)),
                hours=int(np.random.randint(0, 24)),
                minutes=int(np.random.randint(0, 60)),
            )

        success = attempts_from_ip - failed_attempts

        # Successful logins take longer. Both candidates are always drawn (in
        # this order) so the NumPy random stream is independent of the outcome;
        # a plain conditional then yields a float rather than the 0-d ndarray
        # that np.where returns for scalar inputs.
        slow_duration = np.random.uniform(1.0, 15.0)
        fast_duration = np.random.uniform(0.1, 2.0)
        login_duration = float(slow_duration if success > 0 else fast_duration)

        # Request size: larger requests for attacks
        if is_stuffing:
            request_size = int(np.random.randint(1500, 5000))
        else:
            request_size = int(np.random.randint(200, 1500))

        # Additional feature for better detection
        requests_per_minute = int(np.random.poisson(20 if is_stuffing else 2))

        data.append({
            'timestamp': timestamp,
            'ip_address': ip_address,
            'login_success': success,
            'attempts_from_ip': attempts_from_ip,
            'failed_attempts': failed_attempts,
            'login_duration_seconds': login_duration,
            'request_size_bytes': request_size,
            'requests_per_minute': requests_per_minute,
            'is_credential_stuffing': is_stuffing
        })

    return data


In [3]:
n_records = 20_000
dataset = generate_credential_stuffing_data(n_records)

# Build the frame and render timestamps as 'YYYY-MM-DD HH:MM:SS' strings in a
# single chained expression.
df = pd.DataFrame(dataset).assign(
    timestamp=lambda frame: frame['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')
)

# Summary: overall shape and the number of rows labelled as credential stuffing
print(f"Dataset shape: {df.shape}")
print(f"Credential stuffing cases: {df['is_credential_stuffing'].sum()}")
df

Dataset shape: (20000, 9)
Credential stuffing cases: 3975


Unnamed: 0,timestamp,ip_address,login_success,attempts_from_ip,failed_attempts,login_duration_seconds,request_size_bytes,requests_per_minute,is_credential_stuffing
0,2025-08-18 15:35:44,10.143.16.134,3,5,2,7.241658539950276,287,1,0
1,2025-08-03 00:44:44,73.85.218.128,4,5,1,3.972747549495866,476,2,0
2,2025-09-17 20:59:44,172.26.176.90,1,3,2,7.384979779038503,762,4,0
3,2025-08-01 22:24:44,10.84.223.50,1,2,1,6.3958310355588255,1097,2,0
4,2025-08-10 08:24:44,10.155.102.165,2,3,1,10.275311980955747,765,1,0
...,...,...,...,...,...,...,...,...,...
19995,2025-09-22 21:26:46,10.134.249.95,4,4,0,1.1141424168442473,614,1,0
19996,2025-10-28 14:47:46,45.186.84.60,0,35,35,0.5599865310934679,2481,19,1
19997,2025-10-11 00:50:46,73.230.245.83,4,5,1,6.5902826616243875,485,7,0
19998,2025-10-28 21:44:46,185.80.165.132,0,51,51,0.521734759336722,4762,17,1


In [5]:
# Persist the generated dataset to the working directory (index column
# omitted), then confirm on stdout.
df.to_csv(path_or_buf="credential_stuffing_detection.csv", index=False)
print("Dataset saved to 'credential_stuffing_detection.csv'")



Dataset saved to 'credential_stuffing_detection.csv'
