In [1]:
import random
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd
from faker import Faker

In [None]:
# Seed all three random sources used in this notebook (NumPy's global
# generator, the stdlib `random` module and Faker's shared generator) with the
# same value so the random draws are repeatable.
for seed_fn in (np.random.seed, random.seed, Faker.seed):
    seed_fn(42)

# Faker instance that produces the fake addresses, e-mails, cities, companies,
# dates and times.
fake = Faker()

# Output column order of the DataFrame; each generated row lists its values in
# exactly this order.
columns = [
    # Customer profile
    "user_id",
    "name",
    "addresses",
    "email_address",
    "age",
    "kyc_status",
    "days_since_kyc_incomplete",
    # Transaction details
    "transaction_id",
    "transaction_amount",
    "transaction_date",
    "home_branch",
    "transaction_location",
    "transaction_method",
    "transaction_category",
    "transaction_merchant",
    "transaction_time",
    # Spending behaviour
    "average_expenditure",
    "comparison_with_avg_expenditure",
    "transaction_count_7_days",
    # Labels
    "fraud_indicator",
    "suspicion_indicator",
]

# Number of synthetic users; each user contributes exactly one transaction row.
NUM_USERS = 20000

# Settings that are identical for every row, so they are defined once here.
# A transaction takes place at the user's home branch with probability 0.8, at
# one of NUM_RANDOM_CITIES other cities with a combined probability of 0.1, and
# at an "unknown" location with probability 0.1.
NUM_RANDOM_CITIES = 2
LOCATION_PROBABILITIES = [0.8] + [0.1 / NUM_RANDOM_CITIES] * NUM_RANDOM_CITIES + [0.1]
TRANSACTION_METHODS = ["cash", "card", "online", "upi"]
TRANSACTION_CATEGORIES = ["travel", "food", "loan", "recreation"]

# Draw the five-digit transaction numbers WITHOUT replacement so that every
# transaction_id is unique. Independent draws from 90,000 possible values
# would collide many times across 20,000 rows.
transaction_numbers = random.sample(range(10000, 100000), NUM_USERS)

data = []

# Generate the dataset: one row per user, values appended in `columns` order.
for user_id, transaction_number in zip(range(1, NUM_USERS + 1), transaction_numbers):
    # --- Customer profile -------------------------------------------------
    name = f"name {user_id}"
    addresses = fake.address()
    email_address = fake.email()
    age = random.randint(18, 75)
    # 10% of users have an incomplete KYC; only those get a non-zero day count.
    kyc_status = "incomplete" if random.random() < 0.1 else "complete"
    days_since_kyc_incomplete = np.random.randint(1, 365) if kyc_status == "incomplete" else 0

    # --- Transaction details ----------------------------------------------
    transaction_id = f"transaction_id{transaction_number}"
    transaction_amount = np.random.uniform(1000, 150000)
    # NOTE: '-1y' and 'today' are resolved relative to the day this cell runs,
    # so the dates shift between run days even though the seeds are fixed.
    transaction_date = fake.date_between(start_date='-1y', end_date='today').strftime('%Y-%m-%d')
    home_branch = fake.city()

    # Candidate locations: home branch, a few random cities, then "unknown".
    # The chosen value is always one of these candidates.
    transaction_location_choices = (
        [home_branch] + [fake.city() for _ in range(NUM_RANDOM_CITIES)] + ["unknown"]
    )
    transaction_location = np.random.choice(transaction_location_choices, p=LOCATION_PROBABILITIES)

    transaction_method = np.random.choice(TRANSACTION_METHODS)
    transaction_category = np.random.choice(TRANSACTION_CATEGORIES)
    transaction_merchant = fake.company()

    # Keep the time object so the hour can be read directly for the fraud rule.
    transaction_time_obj = fake.time_object()
    transaction_time = transaction_time_obj.strftime('%H:%M:%S')

    # --- Spending behaviour -----------------------------------------------
    average_expenditure = np.random.uniform(1000, 80000)
    comparison_with_avg_expenditure = transaction_amount - average_expenditure

    # Normal draw (mean 10, sd 5) truncated toward zero by int(), then clamped
    # to the range 1..30.
    transaction_count_7_days = int(np.random.normal(loc=10, scale=5))
    transaction_count_7_days = max(1, min(30, transaction_count_7_days))

    # --- Labels -----------------------------------------------------------
    # Individual red flags; the thresholds are the ones used for labelling.
    fraud_conditions = [
        comparison_with_avg_expenditure > 30000,                  # far above usual spend
        transaction_amount > 110000,                              # very large amount
        0 <= transaction_time_obj.hour < 5,                       # between 00:00 and 04:59
        transaction_location not in [home_branch, "unknown"],     # a named city other than home
        transaction_count_7_days > 18,                            # unusually many transactions
    ]

    # Fraud: at least two red flags. Suspicion: at least one red flag.
    fraud_indicator = 1 if sum(fraud_conditions) >= 2 else 0
    suspicion_indicator = 1 if any(fraud_conditions) else 0

    data.append([user_id, name, addresses, email_address, age, kyc_status, days_since_kyc_incomplete,
                 transaction_id, transaction_amount, transaction_date, home_branch, transaction_location,
                 transaction_method, transaction_category, transaction_merchant, transaction_time,
                 average_expenditure, comparison_with_avg_expenditure, transaction_count_7_days, fraud_indicator,
                 suspicion_indicator])

# Create DataFrame
df = pd.DataFrame(data, columns=columns)

# Sanity checks on the generated frame.
assert len(df) == NUM_USERS, "expected exactly one row per user"
assert df["transaction_id"].is_unique, "transaction IDs must be unique"

# Pairs of user IDs that must share one address and one e-mail: two hand-picked
# pairs plus the 100 consecutive-ID pairs (8000, 8001), (8002, 8003), ...,
# (8198, 8199), i.e. 102 pairs in total.
address_email_pairs = [(3, 15), (37, 3839)]
address_email_pairs.extend((first_id, first_id + 1) for first_id in range(8000, 8200, 2))

# For every pair, generate one fresh e-mail and one fresh address and write
# them to both users' rows.
for paired_ids in address_email_pairs:
    shared_email = fake.email()
    shared_address = fake.address()
    pair_rows = df['user_id'].isin(list(paired_ids))
    df.loc[pair_rows, 'email_address'] = shared_email
    df.loc[pair_rows, 'addresses'] = shared_address

# Pairs of user IDs that must share one transaction merchant: two hand-picked
# pairs plus the 100 consecutive-ID pairs (4000, 4001), (4002, 4003), ...,
# (4198, 4199), i.e. 102 pairs in total.
merchant_pairs = [(5, 17), (39, 3941)]
merchant_pairs.extend((first_id, first_id + 1) for first_id in range(4000, 4200, 2))

# For every pair, generate one fresh company name and write it to both users'
# rows.
for paired_ids in merchant_pairs:
    shared_merchant = fake.company()
    pair_rows = df['user_id'].isin(list(paired_ids))
    df.loc[pair_rows, 'transaction_merchant'] = shared_merchant

# Print the ID and name of the first five users as a quick sanity check.
# Only the two columns that are printed are iterated.
print("Sample of generated data:")
sample_rows = df.head()
for sample_user_id, sample_name in zip(sample_rows['user_id'], sample_rows['name']):
    print(f"User ID {sample_user_id} is having {sample_name}")

# Save the DataFrame as CSV (without the index column) in the local data
# folder. The folder is created first if needed, because to_csv raises when
# the target directory does not exist.
output_path = Path('../data/TransactionDataset1.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

print("\nSuccessfully saved 'TransactionDataset1.csv' to the '../data/' folder.")

Sample of generated data:
User ID 1 is having name 1
User ID 2 is having name 2
User ID 3 is having name 3
User ID 4 is having name 4
User ID 5 is having name 5

Successfully saved 'TransactionDataset1.csv' to the '../data/' folder.
