In [18]:
from pathlib import Path

import numpy as np
import pandas as pd

# Pin the global NumPy RNG so repeated runs draw the same samples
np.random.seed(42)

# Vocabulary of values that each categorical column may take.
# The order of possible_diseases matters: np.random.choice indexes into it.
possible_diseases = [
    'Influenza',
    'Common Cold',
    'Eczema',
    'Asthma',
    'Dengue Fever',
    'Hypertension',
    'Diabetes',
    'Heart Disease',
    'Stroke',
    'Pneumonia',
    'Bronchitis',
    'Chronic Kidney Disease',
    'Liver Disease',
    'Malaria',
    'Tuberculosis',
]
possible_binary = ['Yes', 'No']                        # symptom present / absent
possible_genders = ['Male', 'Female']
possible_blood_pressure = ['Low', 'Normal', 'High']    # ordered low -> high
possible_cholesterol = ['Low', 'Normal', 'High']       # ordered low -> high

def generate_gender(disease):
    """
    Draw 'Male' or 'Female' using a disease-specific probability split.

    Parameters
    ----------
    disease : str
        Disease name; names not listed below fall back to a 50/50 split.

    Returns
    -------
    numpy.str_
        Either 'Male' or 'Female', sampled with exactly one call to
        ``np.random.choice`` (global NumPy RNG).
    """
    sexes = ['Male', 'Female']

    # (diseases in the group, [P(Male), P(Female)]) -- first matching group wins.
    odds_by_group = (
        # Respiratory infections: both genders equally affected.
        (('Influenza', 'Common Cold', 'Pneumonia', 'Bronchitis'), [0.5, 0.5]),
        # Cardiovascular and renal diseases: slight male bias.
        (('Heart Disease', 'Hypertension', 'Stroke', 'Chronic Kidney Disease'), [0.55, 0.45]),
        # Allergy / autoimmune related: slight female bias.
        (('Eczema', 'Asthma'), [0.45, 0.55]),
        # Endemic infections: both genders equally affected.
        (('Dengue Fever', 'Malaria', 'Tuberculosis'), [0.5, 0.5]),
        # Almost equal, marginally more females.
        (('Diabetes',), [0.48, 0.52]),
        # Higher male prevalence (lifestyle factors).
        (('Liver Disease',), [0.6, 0.4]),
    )

    for members, odds in odds_by_group:
        if disease in members:
            return np.random.choice(sexes, p=odds)

    # Anything else is treated as a general-population disease.
    return np.random.choice(sexes, p=[0.5, 0.5])

def generate_age(disease):
    """
    Draw an integer age from a disease-specific distribution.

    Parameters
    ----------
    disease : str
        Disease name. Names that match none of the groups below use the
        fallback distribution (uniform over ages 15-59).

    Returns
    -------
    int
        Sampled age. Each call consumes exactly one draw from the global
        NumPy RNG (``np.random.normal`` or ``np.random.choice``).
    """
    def _pick(ages, weights):
        # Single weighted draw, converted from a NumPy scalar to a plain int.
        return int(np.random.choice(ages, p=weights))

    older_adult_diseases = (
        'Diabetes', 'Heart Disease', 'Hypertension', 'Stroke', 'Chronic Kidney Disease'
    )
    if disease in older_adult_diseases:
        # Normal(mean=60, sd=10), truncated toward zero and floored at 0.
        sampled = np.random.normal(60, 10)
        return max(0, int(sampled))

    if disease in ('Eczema', 'Asthma'):
        ages = range(10, 40)
        rest = len(ages) - 1
        # 70% of the mass sits on the first age (10); the remaining 30% is
        # spread evenly over ages 11-39.
        # NOTE(review): a 70% spike on one age looks unintended -- confirm.
        weights = np.array([0.7] + [0.3 / rest] * rest)
        return _pick(ages, weights)

    if disease in ('Influenza', 'Common Cold', 'Pneumonia', 'Bronchitis'):
        ages = range(0, 90)
        # Three bands: ages 0-9, 10-79 and 80-89, weighted toward both ends.
        weights = np.array([0.1 / 10] * 10 + [0.02 / 70] * 70 + [0.08 / 10] * 10)
        weights = weights / weights.sum()  # rescale so the weights sum to 1
        return _pick(ages, weights)

    # Remaining cases share the 15-59 range with flat weights; only the
    # pre-normalisation constant differs between the two groups.
    ages = range(15, 60)
    if disease in ('Malaria', 'Tuberculosis', 'Dengue Fever'):
        total_mass = 0.98
    else:
        total_mass = 0.04
    weights = np.array([total_mass / len(ages)] * len(ages))
    weights = weights / weights.sum()  # rescale so the weights sum to 1
    return _pick(ages, weights)

def _yes_no(p_yes, p_no):
    """Spec for a 'Yes'/'No' feature sampled with the given probabilities."""
    return (['Yes', 'No'], [p_yes, p_no])


# Feature profile per disease. Each feature is either a fixed string or a
# (choices, probabilities) pair that is sampled on every call. Features are
# listed in the order they are emitted, which is also the order in which
# random draws are consumed -- keep it stable to keep seeded runs reproducible.
_DISEASE_PROFILES = {
    'Influenza': {
        'Fever': 'Yes',
        'Cough': _yes_no(0.9, 0.1),                    # Commonly has cough
        'Fatigue': _yes_no(0.85, 0.15),                # Fatigue is common
        'Difficulty Breathing': _yes_no(0.2, 0.8),     # Rarely causes severe breathing issues
        'Blood Pressure': 'Normal',
        'Cholesterol Level': (['Normal', 'Low'], [0.8, 0.2]),
    },
    'Diabetes': {
        'Fever': 'No',
        'Cough': 'No',
        'Fatigue': 'Yes',
        'Difficulty Breathing': _yes_no(0.4, 0.6),
        'Blood Pressure': (['Normal', 'High'], [0.3, 0.7]),
        'Cholesterol Level': 'High',
    },
    'Heart Disease': {
        'Fever': 'No',
        'Cough': 'No',
        'Fatigue': 'Yes',
        'Difficulty Breathing': _yes_no(0.7, 0.3),
        'Blood Pressure': 'High',
        'Cholesterol Level': 'High',
    },
    'Asthma': {
        'Fever': _yes_no(0.1, 0.9),
        'Cough': 'Yes',
        'Fatigue': _yes_no(0.4, 0.6),
        'Difficulty Breathing': 'Yes',
        'Blood Pressure': 'Low',
        'Cholesterol Level': 'Normal',
    },
    'Hypertension': {
        'Fever': 'No',
        'Cough': 'No',
        'Fatigue': _yes_no(0.5, 0.5),
        'Difficulty Breathing': _yes_no(0.3, 0.7),
        'Blood Pressure': 'High',
        'Cholesterol Level': (['Normal', 'High'], [0.4, 0.6]),
    },
    'Pneumonia': {
        'Fever': 'Yes',
        'Cough': 'Yes',
        'Fatigue': _yes_no(0.8, 0.2),
        'Difficulty Breathing': 'Yes',
        'Blood Pressure': (['Normal', 'Low'], [0.6, 0.4]),
        'Cholesterol Level': 'Normal',
    },
    'Malaria': {
        'Fever': 'Yes',
        'Cough': 'No',
        'Fatigue': 'Yes',
        'Difficulty Breathing': _yes_no(0.4, 0.6),
        'Blood Pressure': 'Low',
        'Cholesterol Level': 'Normal',
    },
    'Tuberculosis': {
        'Fever': 'Yes',
        'Cough': 'Yes',
        'Fatigue': 'Yes',
        'Difficulty Breathing': _yes_no(0.8, 0.2),
        'Blood Pressure': 'Low',
        'Cholesterol Level': (['Normal', 'Low'], [0.7, 0.3]),
    },
    'Common Cold': {
        'Fever': _yes_no(0.3, 0.7),
        'Cough': 'Yes',
        'Fatigue': _yes_no(0.5, 0.5),
        'Difficulty Breathing': 'No',
        'Blood Pressure': 'Normal',
        'Cholesterol Level': 'Normal',
    },
    'Chronic Kidney Disease': {
        'Fever': 'No',
        'Cough': 'No',
        'Fatigue': 'Yes',
        'Difficulty Breathing': _yes_no(0.5, 0.5),
        'Blood Pressure': 'High',
        'Cholesterol Level': 'High',
    },
    'Liver Disease': {
        'Fever': 'No',
        'Cough': 'No',
        'Fatigue': 'Yes',
        'Difficulty Breathing': 'No',
        'Blood Pressure': (['Normal', 'Low'], [0.7, 0.3]),
        'Cholesterol Level': (['Normal', 'High'], [0.6, 0.4]),
    },
    'Stroke': {
        'Fever': 'No',
        'Cough': 'No',
        'Fatigue': 'Yes',
        'Difficulty Breathing': _yes_no(0.8, 0.2),
        'Blood Pressure': 'High',
        'Cholesterol Level': 'High',
    },
    'Bronchitis': {
        'Fever': _yes_no(0.4, 0.6),                    # Fever is not always present
        'Cough': 'Yes',                                # Persistent cough is the hallmark symptom
        'Fatigue': _yes_no(0.5, 0.5),
        'Difficulty Breathing': _yes_no(0.6, 0.4),
        'Blood Pressure': 'Normal',
        'Cholesterol Level': 'Normal',
    },
    'Dengue Fever': {
        'Fever': 'Yes',                                # High fever is a key symptom
        'Cough': 'No',                                 # Cough is rare
        'Fatigue': 'Yes',
        'Difficulty Breathing': _yes_no(0.2, 0.8),     # Severe cases may have breathing issues
        'Blood Pressure': 'Low',
        'Cholesterol Level': (['Normal', 'Low'], [0.8, 0.2]),
    },
    'Eczema': {
        'Fever': 'No',
        'Cough': 'No',
        'Fatigue': _yes_no(0.3, 0.7),                  # Uncommon but possible in severe cases
        'Difficulty Breathing': 'No',
        'Blood Pressure': 'Normal',
        'Cholesterol Level': 'Normal',
    },
}


def generate_features_for_disease(disease):
    """
    Build one synthetic symptom/vital-sign record for ``disease``.

    Parameters
    ----------
    disease : str
        One of the keys of ``_DISEASE_PROFILES``.

    Returns
    -------
    dict
        A new dict with the keys 'Fever', 'Cough', 'Fatigue',
        'Difficulty Breathing', 'Blood Pressure' and 'Cholesterol Level',
        in that order. Fixed features are plain strings; sampled features
        are drawn with ``np.random.choice`` (global NumPy RNG), one draw
        per sampled feature, in key order.

    Raises
    ------
    ValueError
        If ``disease`` has no profile. Previously the function silently
        returned ``None``, which only failed later when the caller
        unpacked the result.
    """
    try:
        profile = _DISEASE_PROFILES[disease]
    except (KeyError, TypeError):
        raise ValueError(f"No feature profile defined for disease: {disease!r}") from None

    features = {}
    for feature_name, spec in profile.items():
        if isinstance(spec, str):
            # Fixed value: no random draw is consumed.
            features[feature_name] = spec
        else:
            choices, probabilities = spec
            features[feature_name] = np.random.choice(choices, p=probabilities)
    return features


# Generate synthetic data
num_samples = 23063
# The file name embeds the sample count so the two cannot drift apart.
output_path = f"../data/raw/logical_synthetic_dataset_{num_samples}.csv"

# Collect plain dict records first and build the DataFrame once at the end.
records = []
for _ in range(num_samples):
    # Draw order (disease -> features -> age -> gender -> label flag) determines
    # how the seeded RNG stream is consumed; keep it stable for reproducibility.
    disease = np.random.choice(possible_diseases)
    features = generate_features_for_disease(disease)
    records.append({
        'Disease': disease,
        **features,
        'Age': generate_age(disease),
        'Gender': generate_gender(disease),  # Sex ratio depends on the disease
        'is_labeled': np.random.choice([1, 0], p=[0.7, 0.3])  # 70% labeled, 30% unlabeled
    })

# Convert to DataFrame
synthetic_data = pd.DataFrame(records)

# Save to a CSV file; create the target directory first because to_csv
# does not create missing parent directories.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
synthetic_data.to_csv(output_path, index=False)

print(f"Logical dataset with {num_samples} samples generated and saved to {output_path}")


Logical dataset with 23063 samples generated and saved to ../data/raw/logical_synthetic_dataset_23063.csv


In [19]:
# Print a summary of the generated frame: column names, non-null counts and dtypes.
synthetic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23063 entries, 0 to 23062
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Disease               23063 non-null  object
 1   Fever                 23063 non-null  object
 2   Cough                 23063 non-null  object
 3   Fatigue               23063 non-null  object
 4   Difficulty Breathing  23063 non-null  object
 5   Blood Pressure        23063 non-null  object
 6   Cholesterol Level     23063 non-null  object
 7   Age                   23063 non-null  int64 
 8   Gender                23063 non-null  object
 9   is_labeled            23063 non-null  int64 
dtypes: int64(2), object(8)
memory usage: 1.8+ MB
