In [1]:
print("Environment ready for data generation")


Environment ready for data generation


In [2]:
# STEP 2: Login log data generation

import random
import pandas as pd
from datetime import datetime, timedelta

print("Libraries imported successfully")


Libraries imported successfully


In [1]:
# STEP 2.1: Configuration for data generation

NUM_LOGINS = 500
NUM_USERS = 25
SUSPICIOUS_RATIO = 0.2  # 20%

print("Configuration loaded")


Configuration loaded


In [2]:
# STEP 2.2: Define country and device profiles

PRIMARY_COUNTRY = "India"
FOREIGN_COUNTRIES = ["Russia", "China", "Brazil", "Germany"]

KNOWN_DEVICES = ["Chrome_Windows", "Chrome_Android"]
UNKNOWN_DEVICES = ["Firefox_Linux", "Unknown_Device"]

print("Country and device profiles defined")


Country and device profiles defined


In [4]:
# STEP 2.3: Generate users with baseline behavior

import random
from datetime import time

users = {}

for i in range(1, NUM_USERS + 1):
    user_id = f"U{str(i).zfill(3)}"
    
    # Each user has a preferred login hour window (normal behavior)
    start_hour = random.randint(8, 12)
    end_hour = start_hour + random.randint(6, 9)
    
    users[user_id] = {
        "primary_country": PRIMARY_COUNTRY,
        "preferred_login_hours": (start_hour, end_hour),
        "preferred_device": random.choice(KNOWN_DEVICES)
    }

# Display a few user profiles to verify
list(users.items())[:3]


[('U001',
  {'primary_country': 'India',
   'preferred_login_hours': (8, 14),
   'preferred_device': 'Chrome_Windows'}),
 ('U002',
  {'primary_country': 'India',
   'preferred_login_hours': (8, 14),
   'preferred_device': 'Chrome_Windows'}),
 ('U003',
  {'primary_country': 'India',
   'preferred_login_hours': (11, 19),
   'preferred_device': 'Chrome_Windows'})]

In [5]:
login_records = []
from datetime import datetime, timedelta

NUM_LOGINS = 500

for _ in range(NUM_LOGINS):
    user_id = random.choice(list(users.keys()))
    user = users[user_id]

    # Decide if this login is normal or abnormal
    is_suspicious = random.random() < 0.15  # 15% anomalies

    if not is_suspicious:
        login_hour = random.randint(
            user["preferred_login_hours"][0],
            user["preferred_login_hours"][1]
        )
        country = user["primary_country"]
        device = user["preferred_device"]
    else:
        login_hour = random.randint(0, 23)
        country = random.choice(["USA", "Germany", "Russia", "India"])
        device = random.choice(["Firefox_Linux", "Chrome_Windows", "Mobile_Android"])

    login_time = datetime.now() - timedelta(
        days=random.randint(0, 30),
        hours=login_hour
    )

    login_records.append({
        "user_id": user_id,
        "timestamp": login_time,
        "country": country,
        "device": device
    })


In [6]:
login_records[:5]


[{'user_id': 'U006',
  'timestamp': datetime.datetime(2025, 12, 11, 21, 11, 21, 783789),
  'country': 'India',
  'device': 'Chrome_Android'},
 {'user_id': 'U015',
  'timestamp': datetime.datetime(2025, 11, 25, 2, 11, 21, 783823),
  'country': 'India',
  'device': 'Chrome_Windows'},
 {'user_id': 'U004',
  'timestamp': datetime.datetime(2025, 11, 30, 22, 11, 21, 783836),
  'country': 'India',
  'device': 'Chrome_Windows'},
 {'user_id': 'U023',
  'timestamp': datetime.datetime(2025, 12, 15, 19, 11, 21, 783846),
  'country': 'India',
  'device': 'Chrome_Windows'},
 {'user_id': 'U016',
  'timestamp': datetime.datetime(2025, 12, 3, 20, 11, 21, 783856),
  'country': 'India',
  'device': 'Chrome_Android'}]

In [7]:
import pandas as pd

df_logins = pd.DataFrame(login_records)
df_logins.head()


Unnamed: 0,user_id,timestamp,country,device
0,U006,2025-12-11 21:11:21.783789,India,Chrome_Android
1,U015,2025-11-25 02:11:21.783823,India,Chrome_Windows
2,U004,2025-11-30 22:11:21.783836,India,Chrome_Windows
3,U023,2025-12-15 19:11:21.783846,India,Chrome_Windows
4,U016,2025-12-03 20:11:21.783856,India,Chrome_Android


In [8]:
df_logins["login_hour"] = df_logins["timestamp"].dt.hour
df_logins.head()


Unnamed: 0,user_id,timestamp,country,device,login_hour
0,U006,2025-12-11 21:11:21.783789,India,Chrome_Android,21
1,U015,2025-11-25 02:11:21.783823,India,Chrome_Windows,2
2,U004,2025-11-30 22:11:21.783836,India,Chrome_Windows,22
3,U023,2025-12-15 19:11:21.783846,India,Chrome_Windows,19
4,U016,2025-12-03 20:11:21.783856,India,Chrome_Android,20


In [11]:
def is_suspicious_login(row):
    user = users[row["user_id"]]
    start, end = user["preferred_login_hours"]

    score = 0

    # Rule 1: Login outside preferred hours
    if not (start <= row["login_hour"] <= end):
        score += 1

    # Rule 2: Login from new country
    if row["country"] != user["primary_country"]:
        score += 1

    # Rule 3: Login from new device
    if row["device"] != user["preferred_device"]:
        score += 1

    # Mark suspicious only if 2 or more rules fail
    return score >= 2


In [12]:
df_logins["is_suspicious"] = df_logins.apply(is_suspicious_login, axis=1)
df_logins["is_suspicious"].value_counts()


is_suspicious
False    428
True      72
Name: count, dtype: int64

In [None]:
df_logins.to_csv("../data/login_data.csv", index=False)
