In [25]:
### Regression Problem


In [26]:
import pandas as pd
import numpy as np
from faker import Faker

# Initialize Faker for generating synthetic names
fake = Faker()

# Set the seed for reproducibility
np.random.seed(42)

# Number of records to generate
num_records = 10000 

# Generate customer_id
customer_id = [i for i in range(1, num_records + 1)]

# Generate customer_name using Faker (unique names)
customer_name = [fake.name() for _ in range(num_records)]

# Generate random age (more realistic distribution: bell curve centered around 40)
age = np.random.normal(loc=40, scale=15, size=num_records).astype(int)
age = np.clip(age, 18, 70)  # Ensure ages are within the range of 18 to 70

# Generate random income_level (with skewed distribution to reflect market trends)
income_level = np.random.choice(['Low', 'Medium', 'High'], p=[0.4, 0.4, 0.2], size=num_records)

# Generate days_since_last_purchase (skewed distribution: most customers will have recent activity)
days_since_last_purchase = np.random.lognormal(mean=1, sigma=1, size=num_records).astype(int)
days_since_last_purchase = np.clip(days_since_last_purchase, 1, 365)

# Generate active_days (realistic spread across the year)
active_days = np.random.randint(30, 366, size=num_records)

# Increase total_number_of_purchases (skewed to simulate the majority of customers buying frequently)
total_number_of_purchases = np.random.normal(loc=1000, scale=1500, size=num_records).astype(int)
total_number_of_purchases = np.clip(total_number_of_purchases, 100, 5000)

# Increase total_amount_spent to a larger range (log-normal distribution for spending behavior)
total_amount_spent = np.random.lognormal(mean=10, sigma=1.5, size=num_records)
total_amount_spent = np.clip(total_amount_spent, 500, 50000)

# Calculate avg_amount_spent (more realistic relationship with total_amount_spent and total_purchases)
avg_amount_spent = total_amount_spent / total_number_of_purchases

# Generate bonus_amount_received as a function of total_purchases, spending, and recent activity
bonus_amount_received = (total_amount_spent * 0.05 / 100) + (total_number_of_purchases * 0.02)  # Adds complexity

# Generate additional features
days_since_first_purchase = np.random.randint(1, 1001, size=num_records)  # Random days since first purchase
purchase_frequency = np.random.uniform(0.5, 2.0, size=num_records)  # Frequency of purchases per year
last_purchase_amount = np.random.uniform(50, 5000, size=num_records)  # Last purchase amount
seasonal_purchase_behavior = np.random.choice([0, 1], size=num_records)  # 1 if the customer buys seasonally
emails_received = np.random.randint(1, 21, size=num_records)  # Number of emails received
emails_opened = np.random.randint(0, 21, size=num_records)  # Number of emails opened
promo_codes_used = np.random.randint(0, 11, size=num_records)  # Number of promo codes used
refunds = np.random.randint(0, 6, size=num_records)  # Number of refunds requested
avg_discount = np.random.uniform(0, 30, size=num_records)  # Average discount percentage received
location = np.random.choice(['Urban', 'Suburban', 'Rural'], size=num_records)  # Location of the customer

# Create the dataframe with new features
synthetic_df = pd.DataFrame({
    'customer_id': customer_id,
    'customer_name': customer_name,
    'age': age,
    'income_level': income_level,
    'days_since_last_purchase': days_since_last_purchase,
    'active_days': active_days,
    'total_number_of_purchases': total_number_of_purchases,
    'total_amount_spent': total_amount_spent,
    'avg_amount_spent': avg_amount_spent,
    'days_since_first_purchase': days_since_first_purchase,
    'purchase_frequency': purchase_frequency,
    'last_purchase_amount': last_purchase_amount,
    'seasonal_purchase_behavior': seasonal_purchase_behavior,
    'emails_received': emails_received,
    'emails_opened': emails_opened,
    'promo_codes_used': promo_codes_used,
    'refunds': refunds,
    'avg_discount': avg_discount,
    'location': location,
    'bonus_amount_received': bonus_amount_received,
})

# Save to CSV
synthetic_df.to_csv('synthetic_customer_data_with_more_features.csv', index=False)



In [27]:
synthetic_df.head()

Unnamed: 0,customer_id,customer_name,age,income_level,days_since_last_purchase,active_days,total_number_of_purchases,total_amount_spent,avg_amount_spent,days_since_first_purchase,purchase_frequency,last_purchase_amount,seasonal_purchase_behavior,emails_received,emails_opened,promo_codes_used,refunds,avg_discount,location,bonus_amount_received
0,1,Elizabeth Hart,47,Low,1,57,1546,4305.489853,2.784922,264,1.094361,2549.57896,0,14,18,5,3,14.644955,Suburban,33.072745
1,2,Andrew Smith,37,Low,1,133,100,50000.0,500.0,36,1.963092,260.602943,1,3,5,6,2,23.716789,Urban,27.0
2,3,Jean Wilson,49,Low,4,125,406,1007.248669,2.480908,107,1.9632,250.25908,1,5,17,4,4,6.621062,Rural,8.623624
3,4,Paula Mays,62,Low,1,222,997,13529.063371,13.569773,25,0.687468,2703.937022,0,15,11,1,3,4.677589,Rural,26.704532
4,5,Andrea Alexander,36,Medium,2,154,641,500.0,0.780031,424,1.219208,1798.340379,1,16,9,0,2,11.95185,Rural,13.07


In [28]:
synthetic_df['bonus_amount_received'].max()

125.0

In [29]:
synthetic_df.shape

(10000, 20)

In [30]:
synthetic_df['bonus_amount_received']


0       33.072745
1       27.000000
2        8.623624
3       26.704532
4       13.070000
          ...    
9995    59.831617
9996    80.256801
9997    54.312928
9998    45.832368
9999    71.407776
Name: bonus_amount_received, Length: 10000, dtype: float64