In [14]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Define possible values for each column
possible_diseases = [
    'Influenza', 'Common Cold', 'Eczema', 'Asthma', 'Dengue Fever',
    'Hypertension', 'Diabetes', 'Heart Disease', 'Stroke', 'Pneumonia',
    'Bronchitis', 'Chronic Kidney Disease', 'Liver Disease', 'Malaria', 'Tuberculosis'
]
possible_binary = ['Yes', 'No']
possible_genders = ['Male', 'Female']
possible_blood_pressure = ['Low', 'Normal', 'High']
possible_cholesterol = ['Low', 'Normal', 'High']

# Define logical rules for disease predictions
def generate_features_for_disease(disease):
    if disease == 'Influenza':
        return {
            'Fever': 'Yes',
            'Cough': np.random.choice(['Yes', 'No'], p=[0.7, 0.3]),
            'Fatigue': np.random.choice(['Yes', 'No'], p=[0.8, 0.2]),
            'Difficulty Breathing': np.random.choice(['Yes', 'No'], p=[0.3, 0.7]),
            'Blood Pressure': np.random.choice(['Normal', 'High'], p=[0.6, 0.4]),
            'Cholesterol Level': np.random.choice(['Normal', 'Low'], p=[0.8, 0.2])
        }
    elif disease == 'Common Cold':
        return {
            'Fever': np.random.choice(['Yes', 'No'], p=[0.4, 0.6]),
            'Cough': 'Yes',
            'Fatigue': np.random.choice(['Yes', 'No'], p=[0.6, 0.4]),
            'Difficulty Breathing': 'No',
            'Blood Pressure': 'Normal',
            'Cholesterol Level': 'Normal'
        }
    elif disease == 'Eczema':
        return {
            'Fever': 'No',
            'Cough': 'No',
            'Fatigue': np.random.choice(['Yes', 'No'], p=[0.2, 0.8]),
            'Difficulty Breathing': 'No',
            'Blood Pressure': 'Normal',
            'Cholesterol Level': 'Low'
        }
    elif disease == 'Asthma':
        return {
            'Fever': np.random.choice(['Yes', 'No'], p=[0.2, 0.8]),
            'Cough': np.random.choice(['Yes', 'No'], p=[0.5, 0.5]),
            'Fatigue': 'Yes',
            'Difficulty Breathing': 'Yes',
            'Blood Pressure': np.random.choice(['Normal', 'Low'], p=[0.7, 0.3]),
            'Cholesterol Level': 'Normal'
        }
    elif disease == 'Dengue Fever':
        return {
            'Fever': 'Yes',
            'Cough': 'No',
            'Fatigue': 'Yes',
            'Difficulty Breathing': np.random.choice(['Yes', 'No'], p=[0.4, 0.6]),
            'Blood Pressure': 'Low',
            'Cholesterol Level': 'Normal'
        }
    elif disease == 'Hypertension':
        return {
            'Fever': 'No',
            'Cough': 'No',
            'Fatigue': np.random.choice(['Yes', 'No'], p=[0.5, 0.5]),
            'Difficulty Breathing': np.random.choice(['Yes', 'No'], p=[0.6, 0.4]),
            'Blood Pressure': 'High',
            'Cholesterol Level': np.random.choice(['Normal', 'High'], p=[0.4, 0.6])
        }
    elif disease == 'Diabetes':
        return {
            'Fever': 'No',
            'Cough': 'No',
            'Fatigue': 'Yes',
            'Difficulty Breathing': np.random.choice(['Yes', 'No'], p=[0.3, 0.7]),
            'Blood Pressure': np.random.choice(['Normal', 'High'], p=[0.5, 0.5]),
            'Cholesterol Level': 'High'
        }
    elif disease == 'Heart Disease':
        return {
            'Fever': 'No',
            'Cough': 'No',
            'Fatigue': 'Yes',
            'Difficulty Breathing': np.random.choice(['Yes', 'No'], p=[0.7, 0.3]),
            'Blood Pressure': 'High',
            'Cholesterol Level': 'High'
        }
    elif disease == 'Stroke':
        return {
            'Fever': 'No',
            'Cough': 'No',
            'Fatigue': 'Yes',
            'Difficulty Breathing': np.random.choice(['Yes', 'No'], p=[0.8, 0.2]),
            'Blood Pressure': 'High',
            'Cholesterol Level': 'High'
        }
    elif disease == 'Pneumonia':
        return {
            'Fever': 'Yes',
            'Cough': 'Yes',
            'Fatigue': np.random.choice(['Yes', 'No'], p=[0.6, 0.4]),
            'Difficulty Breathing': 'Yes',
            'Blood Pressure': np.random.choice(['Normal', 'Low'], p=[0.7, 0.3]),
            'Cholesterol Level': 'Normal'
        }
    elif disease == 'Bronchitis':
        return {
            'Fever': np.random.choice(['Yes', 'No'], p=[0.3, 0.7]),
            'Cough': 'Yes',
            'Fatigue': np.random.choice(['Yes', 'No'], p=[0.5, 0.5]),
            'Difficulty Breathing': 'Yes',
            'Blood Pressure': 'Normal',
            'Cholesterol Level': 'Normal'
        }
    elif disease == 'Chronic Kidney Disease':
        return {
            'Fever': 'No',
            'Cough': 'No',
            'Fatigue': 'Yes',
            'Difficulty Breathing': np.random.choice(['Yes', 'No'], p=[0.6, 0.4]),
            'Blood Pressure': 'High',
            'Cholesterol Level': 'High'
        }
    elif disease == 'Liver Disease':
        return {
            'Fever': 'No',
            'Cough': 'No',
            'Fatigue': 'Yes',
            'Difficulty Breathing': 'No',
            'Blood Pressure': np.random.choice(['Normal', 'Low'], p=[0.8, 0.2]),
            'Cholesterol Level': np.random.choice(['Normal', 'High'], p=[0.7, 0.3])
        }
    elif disease == 'Malaria':
        return {
            'Fever': 'Yes',
            'Cough': 'No',
            'Fatigue': 'Yes',
            'Difficulty Breathing': np.random.choice(['Yes', 'No'], p=[0.3, 0.7]),
            'Blood Pressure': 'Low',
            'Cholesterol Level': 'Normal'
        }
    elif disease == 'Tuberculosis':
        return {
            'Fever': 'Yes',
            'Cough': 'Yes',
            'Fatigue': 'Yes',
            'Difficulty Breathing': 'Yes',
            'Blood Pressure': 'Low',
            'Cholesterol Level': 'Normal'
        }
    else:
        return {
            'Fever': np.random.choice(possible_binary),
            'Cough': np.random.choice(possible_binary),
            'Fatigue': np.random.choice(possible_binary),
            'Difficulty Breathing': np.random.choice(possible_binary),
            'Blood Pressure': np.random.choice(possible_blood_pressure),
            'Cholesterol Level': np.random.choice(possible_cholesterol)
        }

# Generate synthetic data
num_samples = 1900
synthetic_data = []

for _ in range(num_samples):
    disease = np.random.choice(possible_diseases)
    features = generate_features_for_disease(disease)
    synthetic_data.append({
        'Disease': disease,
        **features,
        'Age': np.random.randint(0, 100),
        'Gender': np.random.choice(possible_genders),
        'is_labeled': np.random.choice([1, 0], p=[0.7, 0.3])  # 70% labeled, 30% unlabeled
    })

# Convert to DataFrame
synthetic_data = pd.DataFrame(synthetic_data)

# Save to a CSV file
output_path = "../data/raw/logical_synthetic_dataset_1500.csv"
synthetic_data.to_csv(output_path, index=False)

print(f"Logical dataset with 1500 samples generated and saved to {output_path}")


Logical dataset with 1500 samples generated and saved to ../data/raw/logical_synthetic_dataset_1500.csv


In [15]:
synthetic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1900 entries, 0 to 1899
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Disease               1900 non-null   object
 1   Fever                 1900 non-null   object
 2   Cough                 1900 non-null   object
 3   Fatigue               1900 non-null   object
 4   Difficulty Breathing  1900 non-null   object
 5   Blood Pressure        1900 non-null   object
 6   Cholesterol Level     1900 non-null   object
 7   Age                   1900 non-null   int64 
 8   Gender                1900 non-null   object
 9   is_labeled            1900 non-null   int64 
dtypes: int64(2), object(8)
memory usage: 148.6+ KB
