In [20]:
### Regression Problem


In [21]:
import pandas as pd
import numpy as np
from faker import Faker

# Initialize Faker for generating synthetic names
fake = Faker()

# Set the seed for reproducibility
np.random.seed(50)

# Number of records to generate
num_records = 10000 

# Generate customer_id
customer_id = [i for i in range(1, num_records + 1)]

# Generate customer_name using Faker (unique names)
customer_name = [fake.name() for _ in range(num_records)]

# Generate random age (more realistic distribution: bell curve centered around 40)
age = np.random.normal(loc=40, scale=15, size=num_records).astype(int)
age = np.clip(age, 18, 70)  # Ensure ages are within the range of 18 to 70

# Generate random income_level (with skewed distribution to reflect market trends)
income_level = np.random.choice(['Low', 'Medium', 'High'], p=[0.4, 0.4, 0.2], size=num_records)

# Generate days_since_last_purchase (skewed distribution: most customers will have recent activity)
days_since_last_purchase = np.random.lognormal(mean=1, sigma=1, size=num_records).astype(int)
days_since_last_purchase = np.clip(days_since_last_purchase, 1, 365)

# Generate active_days (realistic spread across the year)
active_days = np.random.randint(30, 366, size=num_records)

# Generate total_number_of_purchases (skewed to simulate frequent buyers)
total_number_of_purchases = np.random.normal(loc=1000, scale=1500, size=num_records).astype(int)
total_number_of_purchases = np.clip(total_number_of_purchases, 100, 5000)

# Generate total_amount_spent (log-normal distribution for spending behavior)
total_amount_spent = np.random.lognormal(mean=10, sigma=1.5, size=num_records)
total_amount_spent = np.clip(total_amount_spent, 500, 50000)

# Calculate avg_amount_spent (ensure relationship with purchases and spending)
avg_amount_spent = total_amount_spent / total_number_of_purchases

# Generate bonus_amount_received as a function of total_purchases, spending, and recent activity
bonus_amount_received = (total_amount_spent * 0.05 / 100) + (total_number_of_purchases * 0.02)

# Generate additional features
days_since_first_purchase = np.random.randint(1, 1001, size=num_records)
purchase_frequency = np.random.uniform(0.5, 2.0, size=num_records)
last_purchase_amount = np.random.uniform(50, 5000, size=num_records)
seasonal_purchase_behavior = np.random.choice([0, 1], size=num_records)  # 1 if seasonal buyer
refunds = np.random.randint(0, 6, size=num_records)
location = np.random.choice(['Urban', 'Suburban', 'Rural'], size=num_records)

# Introduce more realism by correlating features
days_since_last_purchase = np.clip(days_since_first_purchase - active_days, 1, 365)
total_amount_spent = np.where(seasonal_purchase_behavior == 1, 
                               total_amount_spent * np.random.uniform(1.2, 1.5), 
                               total_amount_spent)

# Inject some outliers for realism
outlier_indices = np.random.choice(num_records, size=int(0.01 * num_records), replace=False)
total_amount_spent[outlier_indices] *= 10  # Extreme spenders

# Create the dataframe with new features
synthetic_df = pd.DataFrame({
    'customer_id': customer_id,
    'customer_name': customer_name,
    'age': age,
    'income_level': income_level,
    'days_since_last_purchase': days_since_last_purchase,
    'active_days': active_days,
    'total_number_of_purchases': total_number_of_purchases,
    'total_amount_spent': total_amount_spent,
    'avg_amount_spent': avg_amount_spent,
    'days_since_first_purchase': days_since_first_purchase,
    'purchase_frequency': purchase_frequency,
    'last_purchase_amount': last_purchase_amount,
    'seasonal_purchase_behavior': seasonal_purchase_behavior,
    'refunds': refunds,
    'location': location,
    'bonus_amount_received': bonus_amount_received,
})

# Save to CSV
synthetic_df.to_csv('synthetic_customer_data_final.csv', index=False)

# Print summary
print("Dataset created with shape:", synthetic_df.shape)


Dataset created with shape: (10000, 16)


In [22]:
synthetic_df.head()

Unnamed: 0,customer_id,customer_name,age,income_level,days_since_last_purchase,active_days,total_number_of_purchases,total_amount_spent,avg_amount_spent,days_since_first_purchase,purchase_frequency,last_purchase_amount,seasonal_purchase_behavior,refunds,location,bonus_amount_received
0,1,Kyle Rogers,18,Medium,365,217,100,45652.568412,456.525684,877,0.706352,2987.613718,0,5,Rural,24.826284
1,2,Brian Schwartz,39,Low,365,258,2653,4277.094805,1.612173,911,1.480141,388.023592,0,0,Suburban,55.198547
2,3,Brooke Hoffman,30,Low,1,250,476,48111.691777,101.074983,96,1.678426,176.329616,0,0,Urban,33.575846
3,4,Thomas Allen,18,Medium,365,77,100,47829.067377,478.290674,702,1.511669,2325.794544,0,4,Urban,25.914534
4,5,Joshua Lopez,61,Medium,365,312,948,15231.434496,16.066914,836,0.73104,1578.717741,0,2,Urban,26.575717


In [23]:
synthetic_df['age'].value_counts().sum()

10000

In [24]:
synthetic_df['bonus_amount_received'].max()

125.0

In [25]:
synthetic_df.shape

(10000, 16)

In [26]:
synthetic_df['bonus_amount_received']


0       24.826284
1       55.198547
2       33.575846
3       25.914534
4       26.575717
          ...    
9995    27.000000
9996    67.000000
9997    73.760000
9998    77.510203
9999     7.970000
Name: bonus_amount_received, Length: 10000, dtype: float64