In [1]:
import numpy as np
import pandas as pd

np.random.seed(42)

n_samples = 500

# Core features
age = np.random.randint(18, 65, n_samples)
income = np.random.normal(50000, 15000, n_samples)
experience = age - 18 + np.random.normal(0, 3, n_samples)

gender = np.random.choice(['Male', 'Female'], n_samples)
city = np.random.choice(['New York', 'London', 'Berlin', 'Tokyo'], n_samples)
education = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n_samples)

# Target variable
purchased = np.random.choice([0, 1], n_samples, p=[0.6, 0.4])

# Irrelevant columns
customer_id = np.arange(10000, 10000 + n_samples)
random_noise = np.random.random(n_samples)

# Create DataFrame
df = pd.DataFrame({
    'CustomerID': customer_id,
    'Age': age,
    'Income': income,
    'ExperienceYears': experience,
    'Gender': gender,
    'City': city,
    'Education': education,
    'RandomNoise': random_noise,
    'Purchased': purchased
})

# -----------------------------
# Inject Missing Values
# -----------------------------
for col in ['Age', 'Income', 'Education']:
    df.loc[df.sample(frac=0.1).index, col] = np.nan

# -----------------------------
# Inject Outliers
# -----------------------------
df.loc[df.sample(5).index, 'Income'] = df['Income'] * 8
df.loc[df.sample(5).index, 'Age'] = 120

# -----------------------------
# Inject Duplicates
# -----------------------------
df = pd.concat([df, df.sample(10)], ignore_index=True)

# Shuffle rows
df = df.sample(frac=1).reset_index(drop=True)

df.head()

Unnamed: 0,CustomerID,Age,Income,ExperienceYears,Gender,City,Education,RandomNoise,Purchased
0,10484,18.0,40595.494133,-2.20659,Female,Berlin,High School,0.033676,0
1,10466,33.0,36151.501308,15.992641,Female,New York,,0.324512,0
2,10022,19.0,,1.666401,Male,Tokyo,PhD,0.816779,1
3,10184,56.0,42042.482786,37.879526,Female,Tokyo,Master,0.70114,0
4,10487,21.0,41562.998362,4.827414,Male,New York,High School,0.557337,0


In [None]:
df.to_csv('synthetic_customer_data.csv', index=False)

: 