In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global RNG so the whole cell is reproducible.
# NOTE: every draw below consumes the same global stream in order, so
# reordering, adding, or removing a feature changes the values of every
# feature generated after it.
np.random.seed(42)

# Dataset dimensions: number of rows and number of generated feature columns.
num_samples = 10000
num_features = 30

# Declared schema: the feature columns, in output order
# (finance, health, e-commerce, lifestyle).
feature_names = [
    "Age", "Income", "Credit_Score", "Loan_Amount", "Loan_Term", "Interest_Rate",
    "Monthly_Installment", "Debt_to_Income_Ratio", "Employment_Length", "Savings_Amount",
    "Stock_Investment", "Mutual_Fund_Investment", "Real_Estate_Investment", "Bitcoin_Holdings",
    "Transaction_Frequency", "Online_Purchase_Amount", "Mobile_Usage", "Subscription_Expenses",
    "Insurance_Premium", "Hospital_Visits", "BMI", "Blood_Pressure", "Cholesterol_Level",
    "Exercise_Hours_Per_Week", "Caloric_Intake", "Sleep_Hours", "Social_Media_Usage",
    "Streaming_Subscriptions", "Gaming_Hours", "Work_Hours_Per_Week"
]

# Draw every feature independently and uniformly over a plausible range.
# Features are NOT correlated with each other (e.g. Monthly_Installment is
# not derived from Loan_Amount / Loan_Term / Interest_Rate).
# Upper bounds are exclusive for both np.random.randint and np.random.uniform,
# so e.g. Age spans 18..69 and Credit_Score spans 300..849.
data = {
    "Age": np.random.randint(18, 70, num_samples),
    "Income": np.random.randint(20000, 150000, num_samples),
    "Credit_Score": np.random.randint(300, 850, num_samples),
    "Loan_Amount": np.random.randint(5000, 500000, num_samples),
    "Loan_Term": np.random.choice([12, 24, 36, 48, 60, 72, 84], num_samples),
    "Interest_Rate": np.random.uniform(1.5, 10.0, num_samples),
    "Monthly_Installment": np.random.randint(100, 5000, num_samples),
    "Debt_to_Income_Ratio": np.random.uniform(0.1, 0.5, num_samples),
    "Employment_Length": np.random.randint(0, 40, num_samples),
    "Savings_Amount": np.random.randint(1000, 500000, num_samples),
    "Stock_Investment": np.random.randint(1000, 100000, num_samples),
    "Mutual_Fund_Investment": np.random.randint(1000, 50000, num_samples),
    "Real_Estate_Investment": np.random.randint(0, 5, num_samples),
    "Bitcoin_Holdings": np.random.uniform(0, 10, num_samples),
    "Transaction_Frequency": np.random.randint(1, 100, num_samples),
    "Online_Purchase_Amount": np.random.randint(10, 5000, num_samples),
    "Mobile_Usage": np.random.randint(1, 10, num_samples),
    "Subscription_Expenses": np.random.randint(5, 300, num_samples),
    "Insurance_Premium": np.random.randint(200, 5000, num_samples),
    "Hospital_Visits": np.random.randint(0, 10, num_samples),
    "BMI": np.random.uniform(18, 35, num_samples),
    "Blood_Pressure": np.random.randint(90, 180, num_samples),
    "Cholesterol_Level": np.random.randint(120, 300, num_samples),
    "Exercise_Hours_Per_Week": np.random.uniform(0, 10, num_samples),
    "Caloric_Intake": np.random.randint(1500, 4000, num_samples),
    "Sleep_Hours": np.random.uniform(4, 10, num_samples),
    "Social_Media_Usage": np.random.randint(0, 10, num_samples),
    "Streaming_Subscriptions": np.random.randint(0, 5, num_samples),
    "Gaming_Hours": np.random.uniform(0, 20, num_samples),
    "Work_Hours_Per_Week": np.random.randint(10, 80, num_samples)
}

# Fail fast if the declared schema and the generated data have drifted apart
# (a renamed, missing, duplicated, or extra feature would otherwise go unnoticed).
missing_features = [name for name in feature_names if name not in data]
undeclared_features = [name for name in data if name not in feature_names]
if (
    missing_features
    or undeclared_features
    or len(set(feature_names)) != len(feature_names)
    or len(feature_names) != num_features
):
    raise ValueError(
        f"Feature schema mismatch: expected {num_features} unique features, "
        f"declared {len(feature_names)}; missing from data: {missing_features}; "
        f"not declared in feature_names: {undeclared_features}"
    )

# Convert to DataFrame, pinning the column order to the declared schema
df = pd.DataFrame(data, columns=feature_names)


# Target 1 - loan default flag (classification: 1 = default, 0 = no default).
# Flagged when the credit score is below 600 AND debt-to-income exceeds 0.35.
low_credit_score = df["Credit_Score"].lt(600)
high_debt_ratio = df["Debt_to_Income_Ratio"].gt(0.35)
df["Default_Status"] = np.where(low_credit_score & high_debt_ratio, 1, 0)

# Target 2 - customer churn flag (classification: 1 = churn, 0 = no churn).
# Flagged when all three hold: more than 3 streaming subscriptions,
# subscription expenses above 150, and social media usage below 2.
many_subscriptions = df["Streaming_Subscriptions"].gt(3)
costly_subscriptions = df["Subscription_Expenses"].gt(150)
low_social_usage = df["Social_Media_Usage"].lt(2)
df["Churn_Status"] = np.where(
    many_subscriptions & costly_subscriptions & low_social_usage, 1, 0
)

# Target 3 - future spending (regression: continuous value).
# Each row's online purchase amount is scaled by its own random factor
# drawn uniformly from [0.8, 1.5).
spending_multiplier = np.random.uniform(0.8, 1.5, len(df))
df["Future_Spending"] = df["Online_Purchase_Amount"] * spending_multiplier

# Persist features plus the three targets, without the index column.
df.to_csv('my_dataset.csv', index=False)
