In [4]:
import pandas as pd
import numpy as np
from faker import Faker

# Shared Faker instance; generate_synthetic_data draws its random customer
# attributes, transaction dates and transaction amounts from it.
fake = Faker()

# Function to calculate CLV based on features
def calculate_clv(age, income, account_balance, frequency_of_mortgage, frequency_of_loan, frequency_of_credit_card):
    """Return a synthetic customer lifetime value (CLV) score.

    The score starts from 10% of the account balance and is then scaled,
    in this order, by:

    * the remaining-age share ``(80 - age) / 80`` (80 is the assumed
      maximum age),
    * the income share ``income / 100000`` (100000 is the assumed maximum
      income),
    * one plus the combined product frequency divided by 15 (three
      categories, each assumed to be at most 5 per year).

    Inputs are not clamped: an age of 80 gives a score of zero, and with a
    positive balance and income an age above 80 gives a negative score.
    """
    total_frequency = frequency_of_mortgage + frequency_of_loan + frequency_of_credit_card

    # Apply the factors left to right so the floating-point result is stable.
    balance_share = account_balance * 0.1
    scaled_by_age = balance_share * ((80 - age) / 80)
    scaled_by_income = scaled_by_age * (income / 100000)
    return scaled_by_income * (1 + total_frequency / 15)

# Generate synthetic data
def generate_synthetic_data(n_samples=1000):
    """Generate a synthetic, transaction-level customer dataset.

    Each simulated customer gets random demographic, account and
    product-frequency attributes plus between 5 and 50 transactions. The
    result holds one row per transaction; customer-level fields (including
    ``CLV``) are repeated on every row belonging to that customer.

    Args:
        n_samples: Number of customers to simulate. Zero or a negative
            value yields an empty frame that still carries every column.

    Returns:
        pandas.DataFrame with the columns listed in ``columns`` below.
        Customer IDs are a random permutation of ``1..n_samples``, so each
        simulated customer has a unique ID.
    """
    columns = [
        "Customer ID",
        "Age",
        "Gender",
        "Income",
        "Account Type",
        "Account Balance",
        "Transaction Date",
        "Transaction Amount",
        "Frequency of Mortgage",
        "Frequency of Loan",
        "Frequency of Credit Card",
        "CLV",
    ]

    # Draw IDs without replacement: independent random draws would let two
    # different customers share an ID while carrying conflicting attributes.
    customer_ids = np.random.permutation(np.arange(1, n_samples + 1)).tolist()

    rows = []
    for customer_id in customer_ids:
        age = fake.random_int(min=18, max=80)
        gender = fake.random_element(elements=("M", "F"))
        income = fake.random_int(min=20000, max=100000)
        account_type = fake.random_element(elements=("Savings", "Checking", "Credit"))
        account_balance = fake.random_int(min=1000, max=100000)
        num_transactions = fake.random_int(min=5, max=50)
        transaction_dates = [fake.date_this_decade() for _ in range(num_transactions)]
        transaction_amounts = [fake.random_int(min=10, max=1000) for _ in range(num_transactions)]
        frequency_of_mortgage = fake.random_int(min=0, max=5)
        frequency_of_loan = fake.random_int(min=0, max=5)
        frequency_of_credit_card = fake.random_int(min=0, max=5)

        # CLV depends only on customer-level values, so compute it once per
        # customer instead of once per transaction row.
        clv = calculate_clv(
            age,
            income,
            account_balance,
            frequency_of_mortgage,
            frequency_of_loan,
            frequency_of_credit_card,
        )

        for transaction_date, transaction_amount in zip(transaction_dates, transaction_amounts):
            rows.append({
                "Customer ID": customer_id,
                "Age": age,
                "Gender": gender,
                "Income": income,
                "Account Type": account_type,
                "Account Balance": account_balance,
                "Transaction Date": transaction_date,
                "Transaction Amount": transaction_amount,
                "Frequency of Mortgage": frequency_of_mortgage,
                "Frequency of Loan": frequency_of_loan,
                "Frequency of Credit Card": frequency_of_credit_card,
                "CLV": clv,
            })

    # Passing the columns explicitly keeps the schema even when no rows exist.
    return pd.DataFrame(rows, columns=columns)

# Build the transaction-level dataset for 1000 simulated customers
dataset = generate_synthetic_data(1000)

# Preview the first five rows
print(dataset.head(5))

# Write the dataset to disk, leaving out the DataFrame index column
dataset.to_csv("synthetic_clv.csv", index=False)


   Customer ID  Age Gender  Income Account Type  Account Balance  \
0          914   76      F   53312      Savings             8403   
1          914   76      F   53312      Savings             8403   
2          914   76      F   53312      Savings             8403   
3          914   76      F   53312      Savings             8403   
4          914   76      F   53312      Savings             8403   

  Transaction Date  Transaction Amount  Frequency of Mortgage  \
0       2022-12-22                 419                      0   
1       2023-05-01                 757                      0   
2       2022-12-07                 603                      0   
3       2024-01-10                 310                      0   
4       2022-05-23                 648                      0   

   Frequency of Loan  Frequency of Credit Card        CLV  
0                  2                         1  26.878844  
1                  2                         1  26.878844  
2                  2