In [2]:
from tqdm import tqdm
from tqdm import notebook

import random
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from faker import Faker

In [3]:
fake = Faker()

# Параметры генерации данных
num_clients = 10000
max_transactions_per_client = 150
max_logins_per_client = 100
max_payments_per_client = 50
max_activities_per_client = 100

def random_date(start, end):
    return start + timedelta(seconds=random.randint(0, int((end - start).total_seconds())))

start_date = datetime.now() - timedelta(days=365)
end_date = datetime.now()

In [5]:
# Генерация данных для клиентских логинов и активности
login_data = []
activity_data = []
for client_id in notebook.tqdm(range(1, num_clients + 1), desc='login+activity'):
    num_logins = random.randint(1, max_logins_per_client)
    num_activities = random.randint(1, max_activities_per_client)
    for _ in range(num_logins):
        login_date = random_date(start_date, end_date)
        login_data.append({
            'client_id': client_id,
            'login_date': login_date,
            'ip_address': fake.ipv4(),
            'location': f"{random.uniform(-90, 90)}, {random.uniform(-180, 180)}",
            'device': fake.user_agent()
        })
    for _ in range(num_activities):
        activity_date = random_date(start_date, end_date)
        activity_data.append({
            'client_id': client_id,
            'activity_date': activity_date,
            'activity_type': np.random.choice(['view_account', 'transfer_funds', 'pay_bill', 'login', 'logout']),
            'activity_location': fake.uri_path(),
            'ip_address': fake.ipv4(),
            'device': fake.user_agent()
        })

login_df = pd.DataFrame(login_data)
login_df.to_csv('client_logins.csv', index=False)

activity_df = pd.DataFrame(activity_data)
activity_df.to_csv('client_activities.csv', index=False)

clients:   0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:42<?, ?it/s]


In [10]:
# Генерация данных для банковских транзакций
transaction_data = []
for client_id in notebook.tqdm(range(1, num_clients + 1), desc='transactions'):
    num_transactions = random.randint(1, max_transactions_per_client)
    for _ in range(num_transactions):
        transaction_date = random_date(start_date, end_date)
        currency = np.random.choice(['USD', 'RUB'])
        amount = round(np.random.uniform(100, 10000), 2) if currency == 'USD' else round(np.random.uniform(9000, 900000), 2)
        
        # Добавление аномально больших сумм
        if random.random() < 0.01:
            amount *= 10

        transaction_data.append({
            'client_id': client_id,
            'transaction_id': np.random.randint(1000, 10000),
            'transaction_date': transaction_date,
            'transaction_type': np.random.choice(['deposit', 'withdrawal', 'transfer']),
            'account_number': fake.iban(),
            'currency': currency,
            'amount': amount
        })

transactions:   0%|          | 0/10000 [00:00<?, ?it/s]

In [14]:
# Генерация данных для платежей
payment_data = []
for client_id in notebook.tqdm(range(1, num_clients + 1), desc='payments'):
    num_payments = random.randint(1, max_payments_per_client)
    for _ in range(num_payments):
        payment_date = random_date(start_date, end_date)
        currency = np.random.choice(['USD', 'RUB'])
        amount = round(np.random.uniform(100, 10000), 2) if currency == 'USD' else round(np.random.uniform(9000, 900000), 2)
        payment_data.append({
            'client_id': client_id,
            'payment_id': np.random.randint(1000, 10000),
            'payment_date': payment_date,
            'currency': currency,
            'amount': amount,
            'payment_method': np.random.choice(['credit_card', 'debit_card', 'bank_transfer', 'e_wallet'])
        })

payments:   0%|          | 0/10000 [00:00<?, ?it/s]

In [15]:
# Генерация данных о клиентах
client_data = []
for client_id in notebook.tqdm(range(1, num_clients + 1), desc='clients'):
    client_data.append({
        'client_id': client_id,
        'client_name': fake.name(),
        'client_email': fake.email(),
        'client_phone': fake.phone_number(),
        'client_address': fake.address()
    })

client_df = pd.DataFrame(client_data)
client_df.to_csv('clients.csv', index=False)


clients:   0%|          | 0/10000 [00:00<?, ?it/s]

In [18]:
# Внедрение аномалий и корреляций
anomalous_clients_many_transactions = random.sample(range(1, num_clients + 1), 4)
anomalous_clients_large_transactions = random.sample(range(1, num_clients + 1), 4)

# Добавление аномалий для 60% клиентов из anomalous_clients_many_transactions
for client_id in notebook.tqdm(anomalous_clients_many_transactions, desc='anomalous_clients_many_transactions 60%'):
    if random.random() < 0.6:
        for _ in range(10):
            login_data.append({
                'client_id': client_id,
                'login_date': random_date(start_date, end_date),
                'ip_address': fake.ipv4(),
                'location': f"{random.uniform(-90, 90)}, {random.uniform(-180, 180)}",
                'device': fake.user_agent()
            })
            activity_data.append({
                'client_id': client_id,
                'activity_date': random_date(start_date, end_date),
                'activity_type': np.random.choice(['view_account', 'transfer_funds', 'pay_bill', 'login', 'logout']),
                'activity_location': fake.uri_path(),
                'ip_address': fake.ipv4(),
                'device': fake.user_agent()
            })
            payment_data.append({
                'client_id': client_id,
                'payment_id': np.random.randint(1000, 10000),
                'payment_date': random_date(start_date, end_date),
                'currency': np.random.choice(['USD', 'RUB']),
                'amount': round(np.random.uniform(100, 10000), 2),
                'payment_method': np.random.choice(['credit_card', 'debit_card', 'bank_transfer', 'e_wallet'])
            })

# Добавление аномалий для 30% клиентов из anomalous_clients_many_transactions (все виды аномалий)
for client_id in notebook.tqdm(anomalous_clients_many_transactions, desc='anomalous_clients_many_transactions 30%'):
    if random.random() < 0.3:
        for _ in range(10):
            transaction_date = random_date(start_date, end_date)
            currency = np.random.choice(['USD', 'RUB'])
            amount = round(np.random.uniform(100, 10000), 2) if currency == 'USD' else round(np.random.uniform(9000, 900000), 2)
            transaction_data.append({
                'client_id': client_id,
                'transaction_id': np.random.randint(1000, 10000),
                'transaction_date': transaction_date,
                'transaction_type': np.random.choice(['deposit', 'withdrawal', 'transfer']),
                'account_number': fake.iban(),
                'currency': currency,
                'amount': amount
            })
            login_data.append({
                'client_id': client_id,
                'login_date': random_date(start_date, end_date),
                'ip_address': fake.ipv4(),
                'location': f"{random.uniform(-90, 90)}, {random.uniform(-180, 180)}",
                'device': fake.user_agent()
            })
            activity_data.append({
                'client_id': client_id,
                'activity_date': random_date(start_date, end_date),
                'activity_type': np.random.choice(['view_account', 'transfer_funds', 'pay_bill', 'login', 'logout']),
                'activity_location': fake.uri_path(),
                'ip_address': fake.ipv4(),
                'device': fake.user_agent()
            })
            payment_data.append({
                'client_id': client_id,
                'payment_id': np.random.randint(1000, 10000),
                'payment_date': random_date(start_date, end_date),
                'currency': np.random.choice(['USD', 'RUB']),
                'amount': round(np.random.uniform(100, 10000), 2),
                'payment_method': np.random.choice(['credit_card', 'debit_card', 'bank_transfer', 'e_wallet'])
            })

# Добавление аномалий для 60% клиентов из anomalous_clients_large_transactions
for client_id in notebook.tqdm(anomalous_clients_large_transactions, desc='anomalous_clients_large_transactions 60%'):
    if random.random() < 0.6:
        for _ in range(10):
            payment_data.append({
                'client_id': client_id,
                'payment_id': np.random.randint(1000, 10000),
                'payment_date': random_date(start_date, end_date),
                'currency': np.random.choice(['USD', 'RUB']),
                'amount': round(np.random.uniform(100, 10000), 2),
                'payment_method': np.random.choice(['credit_card', 'debit_card', 'bank_transfer', 'e_wallet'])
            })

# Добавление аномалий для 30% клиентов из anomalous_clients_large_transactions (все виды аномалий)
for client_id in notebook.tqdm(anomalous_clients_large_transactions, desc='anomalous_clients_large_transactions 30%'):
    if random.random() < 0.3:
        for _ in range(10):
            transaction_date = random_date(start_date, end_date)
            currency = np.random.choice(['USD', 'RUB'])
            amount = round(np.random.uniform(100, 10000), 2) if currency == 'USD' else round(np.random.uniform(9000, 900000), 2)
            transaction_data.append({
                'client_id': client_id,
                'transaction_id': np.random.randint(1000, 10000),
                'transaction_date': transaction_date,
                'transaction_type': np.random.choice(['deposit', 'withdrawal', 'transfer']),
                'account_number': fake.iban(),
                'currency': currency,
                'amount': amount
            })
            login_data.append({
                'client_id': client_id,
                'login_date': random_date(start_date, end_date),
                'ip_address': fake.ipv4(),
                'location': f"{random.uniform(-90, 90)}, {random.uniform(-180, 180)}",
                'device': fake.user_agent()
            })
            activity_data.append({
                'client_id': client_id,
                'activity_date': random_date(start_date, end_date),
                'activity_type': np.random.choice(['view_account', 'transfer_funds', 'pay_bill', 'login', 'logout']),
                'activity_location': fake.uri_path(),
                'ip_address': fake.ipv4(),
                'device': fake.user_agent()
            })
            payment_data.append({
                'client_id': client_id,
                'payment_id': np.random.randint(1000, 10000),
                'payment_date': random_date(start_date, end_date),
                'currency': np.random.choice(['USD', 'RUB']),
                'amount': round(np.random.uniform(100, 10000), 2),
                'payment_method': np.random.choice(['credit_card', 'debit_card', 'bank_transfer', 'e_wallet'])
                
            })



anomalous_clients_many_transactions 60%:   0%|          | 0/4 [00:00<?, ?it/s]

anomalous_clients_many_transactions 30%:   0%|          | 0/4 [00:00<?, ?it/s]

anomalous_clients_large_transactions 60%:   0%|          | 0/4 [00:00<?, ?it/s]

anomalous_clients_large_transactions 30%:   0%|          | 0/4 [00:00<?, ?it/s]

In [19]:
# Запись данных в CSV файлы
transaction_df = pd.DataFrame(transaction_data)
transaction_df.to_csv('bank_transactions.csv', index=False)

login_df = pd.DataFrame(login_data)
login_df.to_csv('client_logins.csv', index=False)

activity_df = pd.DataFrame(activity_data)
activity_df.to_csv('client_activities.csv', index=False)

payment_df = pd.DataFrame(payment_data)
payment_df.to_csv('payments.csv', index=False)

client_df = pd.DataFrame(client_data)
client_df.to_csv('clients.csv', index=False)

In [21]:
print(login_df.shape)
login_df.head()

(502180, 5)


Unnamed: 0,client_id,login_date,ip_address,location,device
0,1,2024-08-09 21:43:47.135499,202.77.22.202,"22.736897930412255, 32.737204348966515",Opera/9.93.(X11; Linux i686; eu-ES) Presto/2.9...
1,2,2024-03-27 21:29:24.135499,32.36.55.122,"-84.11772188863924, -155.01961905217263",Mozilla/5.0 (Windows CE) AppleWebKit/531.1 (KH...
2,2,2024-02-06 08:26:57.135499,192.53.23.212,"-67.2426172103398, -62.82277053489126",Opera/8.67.(Windows NT 5.01; tl-PH) Presto/2.9...
3,2,2024-01-15 04:40:16.135499,60.132.2.104,"26.938505424013158, 88.87684783275586",Mozilla/5.0 (X11; Linux i686) AppleWebKit/533....
4,2,2023-11-29 16:48:19.135499,79.146.57.212,"-1.4098761134741409, -176.7064655429866",Mozilla/5.0 (Macintosh; PPC Mac OS X 10_6_5; r...


In [22]:
print(activity_df.shape)
activity_df.head()

(506571, 6)


Unnamed: 0,client_id,activity_date,activity_type,activity_location,ip_address,device
0,1,2024-07-30 04:36:57.135499,pay_bill,category,95.14.190.243,Mozilla/5.0 (compatible; MSIE 5.0; Windows NT ...
1,1,2024-04-10 00:49:05.135499,logout,posts/list,30.239.6.15,Mozilla/5.0 (X11; Linux i686; rv:1.9.5.20) Gec...
2,1,2024-05-25 11:12:45.135499,login,category,17.82.192.111,Mozilla/5.0 (iPod; U; CPU iPhone OS 4_2 like M...
3,1,2024-04-15 19:02:01.135499,logout,main/tag,214.36.72.146,Mozilla/5.0 (Windows 95) AppleWebKit/535.0 (KH...
4,1,2024-08-24 06:51:04.135499,login,category,156.203.33.41,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...


In [23]:
print(payment_df.shape)
payment_df.head()

(254821, 6)


Unnamed: 0,client_id,payment_id,payment_date,currency,amount,payment_method
0,1,2483,2024-04-25 18:45:19.135499,USD,8955.39,debit_card
1,1,9567,2024-05-28 13:16:11.135499,USD,9771.89,credit_card
2,1,9056,2024-08-31 01:27:16.135499,USD,432.3,credit_card
3,1,5187,2023-12-30 23:14:46.135499,RUB,392672.49,credit_card
4,1,7514,2024-10-12 15:51:25.135499,USD,6144.25,debit_card


In [24]:
print(client_df.shape)
client_df.head()

(10000, 5)


Unnamed: 0,client_id,client_name,client_email,client_phone,client_address
0,1,Sarah Davis,scasey@example.org,+1-670-744-6515x1485,"035 Amy Valleys Apt. 300\nJohntown, DC 07282"
1,2,Alyssa Howard,hinestiffany@example.net,(856)958-6226x56298,"245 Sean Course\nBeardtown, NC 63239"
2,3,Dylan Suarez,williambuchanan@example.net,(585)371-7943x8922,"83254 Ryan Station Suite 722\nNew Stephanie, A..."
3,4,Tracy Mays,butlerbobby@example.com,001-202-909-9529,"96938 Wells Valleys Apt. 639\nAlexischester, A..."
4,5,Miss Meghan Reeves,victoriarobinson@example.com,001-833-496-5246x9496,"58404 Jennifer Lock\nKristinaborough, AL 94390"


In [25]:
print(transaction_df.shape)
transaction_df.head()

(755424, 7)


Unnamed: 0,client_id,transaction_id,transaction_date,transaction_type,account_number,currency,amount
0,1,6143,2024-07-28 03:16:24.135499,transfer,GB75NSZZ74355879443453,RUB,605902.51
1,1,4352,2024-02-10 12:11:06.135499,transfer,GB59NBQQ73345913845886,RUB,349197.65
2,1,4575,2024-07-29 10:26:38.135499,transfer,GB37JYJV82917017130705,RUB,814547.04
3,1,3123,2024-04-14 17:45:23.135499,transfer,GB10RKCM55522987869903,RUB,354757.28
4,1,7978,2023-11-30 08:52:08.135499,withdrawal,GB74AXNF03605576664447,RUB,677125.32


In [16]:
# Генерация данных в потоке
import random
import time
from datetime import datetime, timedelta

import pandas as pd
from faker import Faker

fake = Faker()

# Параметры генерации данных
max_logins_per_client = 10
max_activities_per_client = 10
max_records_per_file = 1000

# Время между генерацией новых данных (в минутах)
sleep_time = 60  # 60 секунд (1 минута)

def random_date(start, end):
    return start + timedelta(seconds=random.randint(0, int((end - start).total_seconds())))


start_date = datetime.now() - timedelta(days=365)
end_date = datetime.now()

def generate_login_data(client_id):
    login_date = random_date(start_date, end_date)
    return {
        'client_id': client_id,
        'login_date': login_date,
        'ip_address': fake.ipv4(),
        'location': f"{random.uniform(-90, 90)}, {random.uniform(-180, 180)}",
        'device': fake.user_agent()
    }

def generate_activity_data(client_id):
    activity_date = random_date(start_date, end_date)
    return {
        'client_id': client_id,
        'activity_date': activity_date,
        'activity_type': random.choice(['view_account', 'transfer_funds', 'pay_bill', 'login', 'logout']),
        'activity_location': fake.uri_path(),
        'ip_address': fake.ipv4(),
        'device': fake.user_agent()
    }

def generate_transaction_data(client_id, transaction_type):
    transaction_date = random_date(start_date, end_date)
    currency = random.choice(['USD', 'RUB'])
    amount = round(random.uniform(100, 10000), 2) if currency == 'USD' else round(random.uniform(9000, 900000), 2)
    return {
        'client_id': client_id,
        'transaction_id': random.randint(1000, 10000),
        'transaction_date': transaction_date,
        'transaction_type': transaction_type,
        'account_number': fake.iban(),
        'currency': currency,
        'amount': amount
    }

def generate_payment_data(client_id, transaction_id):
    payment_date = random_date(start_date, end_date)
    currency = random.choice(['USD', 'RUB'])
    amount = round(random.uniform(100, 10000), 2) if currency == 'USD' else round(random.uniform(9000, 900000), 2)
    return {
        'client_id': client_id,
        'payment_id': random.randint(1000, 10000),
        'payment_date': payment_date,
        'currency': currency,
        'amount': amount,
        'payment_method': random.choice(['credit_card', 'debit_card', 'bank_transfer', 'e_wallet']),
        'transaction_id': transaction_id  # Связь с транзакцией
    }

def generate_anomalous_data(client_id):
    login_data = generate_login_data(client_id)
    login_data['ip_address'] = fake.ipv4()
    login_data['location'] = f"{random.uniform(-90, 90)}, {random.uniform(-180, 180)}"
    login_records.append(login_data)

    activity_data = generate_activity_data(client_id)
    activity_data['ip_address'] = fake.ipv4()
    activity_data['location'] = f"{random.uniform(-90, 90)}, {random.uniform(-180, 180)}"
    activity_records.append(activity_data)

    transaction_data = generate_transaction_data(client_id, 'payment')
    transaction_data['amount'] *= 10
    transaction_records.append(transaction_data)

    payment_data = generate_payment_data(client_id, transaction_data['transaction_id'])
    payment_data['amount'] *= 10
    payment_records.append(payment_data)

while True:
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    login_records = []
    activity_records = []
    payment_records = []
    transaction_records = []

    for _ in range(max_records_per_file):
        client_id = random.randint(1, 10000)

        if random.random() < 0.3:
            login_data = generate_login_data(client_id)
            login_records.append(login_data)

        if random.random() < 0.3:
            activity_data = generate_activity_data(client_id)
            activity_records.append(activity_data)

        # Генерация транзакций
        transaction_type = random.choice(['deposit', 'withdrawal', 'transfer', 'payment'])
        transaction_data = generate_transaction_data(client_id, transaction_type)
        transaction_records.append(transaction_data)

        # Генерация оплаты, если тип транзакции 'payment'
        if transaction_type == 'payment':
            payment_data = generate_payment_data(client_id, transaction_data['transaction_id'])
            payment_records.append(payment_data)

        if random.random() < 0.05:
            generate_anomalous_data(client_id)

    # Запись данных в новые файлы с временной меткой
    login_df = pd.DataFrame(login_records)
    login_df.to_csv(f'client_logins_{timestamp}.csv', index=False)

    activity_df = pd.DataFrame(activity_records)
    activity_df.to_csv(f'client_activities_{timestamp}.csv', index=False)

    payment_df = pd.DataFrame(payment_records)
    payment_df.to_csv(f'payments_{timestamp}.csv', index=False)

    transaction_df = pd.DataFrame(transaction_records)
    transaction_df.to_csv(f'bank_transactions_{timestamp}.csv', index=False)

    time.sleep(sleep_time)


KeyboardInterrupt: 

In [None]:
transaction_df = pd.DataFrame(transaction_data)
transaction_df.to_csv('bank_transactions.csv', index=False)

login_df = pd.DataFrame(login_data)
login_df.to_csv('client_logins.csv', index=False)

activity_df = pd.DataFrame(activity_data)
activity_df.to_csv('client_activities.csv', index=False)

payment_df = pd.DataFrame(payment_data)
payment_df.to_csv('payments.csv', index=False)

client_df = pd.DataFrame(client_data)
client_df.to_csv('clients.csv', index=False)