In [None]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

# ---------------------------------------------------------------------------
# Synthetic patient dataset: three categorical covariates, three numeric
# covariates and a binary disease outcome, written to a CSV file.
#
# NOTE: every draw below consumes NumPy's *global* random stream, so the
# order of the np.random.* calls is part of the result. Do not reorder them.
# ---------------------------------------------------------------------------

# Number of patient records to simulate
n_patients = 451

# Fixed seed so that each run produces the same sample
np.random.seed(42)

# Categorical covariates: each level is drawn with equal probability
sex = np.random.choice(a=['Homme', 'Femme'], size=n_patients)
smoking = np.random.choice(a=['Non', 'Occasionnel', 'Souvent'], size=n_patients)
weight = np.random.choice(a=['Sous-poids', 'Moyen', 'Surpoids'], size=n_patients)

# Numeric covariates. Blood pressure is built as glucose plus independent
# Gaussian noise, which makes the two positively correlated by construction.
taux_de_glucose = np.random.normal(loc=70, scale=10, size=n_patients)
bruit_pression = np.random.normal(loc=0, scale=5, size=n_patients)
pression_sanguine = taux_de_glucose + bruit_pression
cholesterol_hdl = np.random.normal(loc=50, scale=10, size=n_patients)

# Z-scores of the numeric covariates. They are only used to weight the
# disease probability below; the DataFrame stores the raw values.
taux_de_glucose_z, pression_sanguine_z, cholesterol_hdl_z = map(
    zscore, (taux_de_glucose, pression_sanguine, cholesterol_hdl)
)

# Linear disease score: baseline of 0.2, then one contribution per risk
# factor, accumulated in a fixed order (floating-point sums are order
# sensitive).
p_maladie = np.full(n_patients, 0.2)
p_maladie += 0.1 * (sex == 'Femme')
p_maladie += 0.4 * (smoking == 'Souvent')
p_maladie += 0.3 * (weight == 'Surpoids')
p_maladie += 0.3 * taux_de_glucose_z
p_maladie += 0.3 * pression_sanguine_z
p_maladie -= 0.2 * cholesterol_hdl_z  # higher HDL lowers the score

# The linear score is unbounded, so clamp it into [0, 1] before using it
# as a probability.
p_maladie = p_maladie.clip(0, 1)

# One Bernoulli draw per patient (binomial with a single trial)
maladie = np.random.binomial(n=1, p=p_maladie)

# Assemble the table; column order here is the column order in the CSV
df_corrected = pd.DataFrame(
    data={
        'Sexe': sex,
        'Fumeur': smoking,
        'Poids': weight,
        'Taux_de_glucose': taux_de_glucose,
        'Pression_sanguine': pression_sanguine,
        'Cholesterol_HDL': cholesterol_hdl,
        'Maladie': maladie,
    }
)

# Write the table to disk without the row index, then report the path
file_corrected_path = 'base_patients_corrigee_standardisee.csv'
df_corrected.to_csv(file_corrected_path, index=False)
print(f"File saved to: {file_corrected_path}")


File saved to: base_patients_corrigee_standardisee.csv
