In [1]:
import random
from pathlib import Path

import pandas as pd
from faker import Faker

In [2]:


# Seed both random sources used by this notebook -- Faker's generator and the
# standard-library `random` module -- so that re-running produces the same data.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)

# Faker instance used for every generated field
fake = Faker()

# --- Corrected Duplicate Phone Number Generation Logic ---
#
# Builds `final_phone_list`: one phone number per record, where a fixed share
# of the entries deliberately repeat a number that already appears elsewhere
# in the list.

# Total number of records and the fraction of them that should be duplicates
num_records = 20000
duplicates_ratio = 0.3

# Split the total into distinct base numbers and repeated numbers. Deriving
# num_duplicates by subtraction guarantees the two counts sum to num_records
# even though int() truncates.
num_unique = int(num_records * (1 - duplicates_ratio))
num_duplicates = num_records - num_unique

# Generate the base numbers through Faker's `unique` proxy. Plain
# fake.phone_number() gives no uniqueness guarantee, so an accidental repeat
# in the base list would silently push the real duplicate share above
# duplicates_ratio.
unique_phones = [fake.unique.phone_number() for _ in range(num_unique)]

# Draw the duplicates from the base list. random.choices samples with
# replacement, so the same base number may be picked more than once.
duplicate_phones = random.choices(unique_phones, k=num_duplicates)

# Combine base numbers and duplicates into the final column
final_phone_list = unique_phones + duplicate_phones

# Shuffle in place so the duplicates are not all clustered at the end
random.shuffle(final_phone_list)

# Sanity check: exactly one phone number per record
assert len(final_phone_list) == num_records

# --- Generate Other User Data ---
#
# Each column is generated in full before the next one starts (names, then
# e-mails, then card numbers), one value per record.
record_indices = range(num_records)

names = [fake.name() for _ in record_indices]
emails = [fake.email() for _ in record_indices]
# NOTE(review): these values are Mastercard-type credit card numbers, although
# the column they feed is labelled 'Bank Account Number' -- confirm intended.
bank_account_numbers = [
    fake.credit_card_number(card_type='mastercard') for _ in record_indices
]

# Assemble the final Pandas DataFrame; the order of the pairs fixes the
# column order.
column_pairs = [
    ('Name', names),
    ('Phone Number', final_phone_list),
    ('Email', emails),
    ('Bank Account Number', bank_account_numbers),
]
df = pd.DataFrame(dict(column_pairs))

# Print the length to confirm it's correct
print(f"Total records generated: {len(df)}")

# Save the dataset to the local data folder. DataFrame.to_csv does not create
# missing directories and raises OSError if the target folder is absent, so
# make sure it exists first (a no-op when it is already there).
output_path = Path('../data') / 'synthetic_bank_data.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame's row index out of the CSV
df.to_csv(output_path, index=False)

print("\nSuccessfully saved 'synthetic_bank_data.csv' to the '../data/' folder.")

Total records generated: 20000

Successfully saved 'synthetic_bank_data.csv' to the '../data/' folder.
