In [2]:
import numpy as np
import pandas as pd

# Set seed for reproducibility
np.random.seed(42)

# Number of samples
n_samples = 500

# Simulate features
age = np.random.randint(30, 80, size=n_samples)  # Ages between 30 and 80
gender = np.random.choice(['Male', 'Female'], size=n_samples)
cholesterol = np.random.normal(loc=200, scale=25, size=n_samples)  # Average cholesterol around 200 mg/dL
systolic_bp = np.random.normal(loc=120, scale=15, size=n_samples)  # Systolic blood pressure
diastolic_bp = np.random.normal(loc=80, scale=10, size=n_samples)   # Diastolic blood pressure
bmi = np.random.normal(loc=27, scale=4, size=n_samples)             # Average BMI
smoker = np.random.choice([0, 1], size=n_samples, p=[0.7, 0.3])     # 30% smokers
diabetes = np.random.choice([0, 1], size=n_samples, p=[0.85, 0.15])   # 15% with diabetes

# Compute a synthetic risk score for heart disease.
# The formula below is arbitrary and serves to create a non-linear, interacting relationship between features.
risk_score = (
    0.03 * age +
    0.02 * cholesterol +
    0.04 * systolic_bp +
    0.05 * bmi +
    0.5 * smoker +
    0.7 * diabetes +
    np.random.normal(0, 1, n_samples)  # Adding some random noise
)

# Determine heart disease outcome by setting a threshold at the 70th percentile of the risk score
threshold = np.percentile(risk_score, 70)
heart_disease = (risk_score > threshold).astype(int)

# Create a DataFrame
data = pd.DataFrame({
    'age': age,
    'gender': gender,
    'cholesterol': cholesterol,
    'systolic_bp': systolic_bp,
    'diastolic_bp': diastolic_bp,
    'bmi': bmi,
    'smoker': smoker,
    'diabetes': diabetes,
    'heart_disease': heart_disease
})

# Introduce missing values to simulate real-world data challenges.
# For example, let's inject missing values in 5% of the 'cholesterol' and 'systolic_bp' entries.
for col in ['cholesterol', 'systolic_bp']:
    missing_indices = data.sample(frac=0.05, random_state=42).index
    data.loc[missing_indices, col] = np.nan

# Display the first few rows of the dataset
print(data.head())
data.to_csv('heart_disease.csv', index:False)

SyntaxError: invalid syntax (2126353973.py, line 57)