### Import dependencies

In [8]:
import math
import random

import numpy as np
import pandas as pd

### Load the data for disease_symptom_df

In [9]:
# Read the disease/symptom table from the resources folder
file_path = "../resources/disease_symptom_data.csv"
disease_symptom_df = pd.read_csv(filepath_or_buffer=file_path)
disease_symptom_df

Unnamed: 0,disease_id,d_name,symptom_id,s_name
0,C0020538,Hypertensive Disease,C0008031,Pain Chest
1,C0020538,Hypertensive Disease,C0392680,Shortness Of Breath
2,C0020538,Hypertensive Disease,C0012833,Dizziness
3,C0020538,Hypertensive Disease,C0004093,Asthenia
4,C0020538,Hypertensive Disease,C0085639,Fall
...,...,...,...,...
1901,C0233472,Affect Labile,C0741453,Bedridden
1902,C0233472,Affect Labile,C0242453,Prostatism
1903,C0011127,Decubitus Ulcer,C0232257,Systolic Murmur
1904,C0011127,Decubitus Ulcer,C0871754,Frail


### One-hot encode the symptom data

In [10]:
# One-hot encoding for symptoms: one indicator column per distinct symptom_id
binary_features = pd.get_dummies(disease_symptom_df['symptom_id'])

# Collapse to one row per disease. Summing the indicators counts how many times
# each symptom was listed for the disease.
disease_symptom_hotcoded = (
    pd.concat([disease_symptom_df['disease_id'], binary_features], axis=1)
    .groupby('disease_id')
    .sum()
    # A (disease, symptom) pair listed more than once would sum to 2 or more;
    # cap at 1 so every cell stays a 0/1 presence flag.
    .clip(upper=1)
    .reset_index()
)

# Display disease_symptom_hotcoded
disease_symptom_hotcoded

Unnamed: 0,disease_id,C0000727,C0000731,C0000737,C0002416,C0002962,C0003123,C0003126,C0003862,C0003962,...,C1320716,C1321756,C1384489,C1384606,C1405524,C1444773,C1456822,C1511606,C1513183,C1517205
0,C0001175,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,C0001418,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,C0001511,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,C0001973,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,C0002395,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,C1258215,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
129,C1456784,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
130,C1510475,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
131,C1565489,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### Generate fake data while keeping in mind some facts:
- We need to fake data representing non-existent patient charts, each diagnosed with a disease and presenting symptoms related to that disease.
- Not every mock patient will have all of the symptoms, but each may have any of the symptoms from the related symptom group.
- Online research suggests that a patient diagnosed with a disease usually presents at least one third of its related symptoms. While the "1/3rd" threshold might not be universally applicable, it reflects the idea that patients often exhibit only a subset of the symptoms associated with a disease. This is consistent with the variability and heterogeneity observed in clinical symptom profiles across different conditions. Diagnostic frameworks like the DSM-5 for mental health conditions and the ACR criteria for lupus often specify a minimum number or subset of symptoms required for diagnosis. DSM-5 examples show that the required subset of symptoms typically ranges from 30% to 50% of the total listed symptoms for a given disorder. The ACR criteria for lupus show that patients typically need to present 36% to 40% of the listed symptoms for diagnosis.
- Our encoded data has one disease per row, with one column per symptom coded 1/0 (True/False) for whether that symptom is associated with the disease. Therefore we randomly generate 1s within each disease's set of related symptoms, while ensuring that at least 33% of that set is 1 in every row so that the row supports the diagnosed disease.

In [None]:
# Function to generate controlled mock data
def generate_minimum_mock_data(hotCoded_df, num_rows, min_percentage, seed=None):
    """Generate synthetic patient rows, cycling through the diseases in ``hotCoded_df``.

    Each generated row belongs to one disease and only ever sets symptom
    columns that are non-zero for that disease in ``hotCoded_df``. At least
    ``min_percentage`` of those symptoms (rounded up, and never fewer than one)
    are set to 1; every other allowed symptom is set to 1 on a fair coin flip.

    Parameters
    ----------
    hotCoded_df : pd.DataFrame
        One row per disease: a 'disease_id' column plus one numeric column per
        symptom, non-zero where the symptom is associated with the disease.
        If a disease_id appears on several rows, the first row is used.
    num_rows : int
        Number of rows to generate. Diseases are assigned round-robin in the
        order they appear in ``hotCoded_df``.
    min_percentage : float
        Fraction (e.g. 0.33) of a disease's symptoms that must be present in
        every generated row. Values above 1 select all symptoms.
    seed : int, optional
        When given, a private ``random.Random(seed)`` is used so the output is
        reproducible. When omitted, the global ``random`` state is used.

    Returns
    -------
    pd.DataFrame
        ``num_rows`` rows with the same columns as ``hotCoded_df`` and 0/1
        symptom values.

    Raises
    ------
    ValueError
        If rows are requested but ``hotCoded_df`` contains no diseases.
    """
    rng = random if seed is None else random.Random(seed)

    # Extract all disease IDs
    disease_id_list = hotCoded_df['disease_id'].tolist()
    if num_rows > 0 and not disease_id_list:
        raise ValueError("hotCoded_df contains no diseases to generate rows for")

    symptom_columns = [col for col in hotCoded_df.columns if col != 'disease_id']

    # Resolve each disease's allowed symptom columns once, instead of scanning
    # the whole frame again for every generated row.
    allowed_by_disease = {}
    symptom_values = hotCoded_df[symptom_columns].to_numpy()
    for disease_id, flags in zip(disease_id_list, symptom_values):
        if disease_id in allowed_by_disease:
            continue  # first row wins for a repeated disease_id
        # `> 0` rather than `== 1`: an encoded count above 1 still means the
        # symptom belongs to the disease.
        allowed_by_disease[disease_id] = [
            col for col, flag in zip(symptom_columns, flags) if flag > 0
        ]

    data = []
    for row_number in range(num_rows):
        # Repeat disease IDs round-robin to reach the desired number of rows
        disease_id = disease_id_list[row_number % len(disease_id_list)]
        allowed_columns = allowed_by_disease[disease_id]

        present = set()
        if allowed_columns:
            # Round UP so the share of 1s never falls below min_percentage
            # (flooring 14 * 0.33 would give 4 of 14, i.e. 28.6%). The small
            # epsilon keeps float noise such as 3.0000000000000004 from
            # bumping an exact result to the next integer.
            required = math.ceil(len(allowed_columns) * min_percentage - 1e-9)
            min_ones = min(len(allowed_columns), max(1, required))
            present.update(rng.sample(allowed_columns, min_ones))

            # Every remaining allowed symptom is kept on a fair coin flip
            for col in allowed_columns:
                if col not in present and rng.choice([True, False]):
                    present.add(col)

        # Generate the row: `1` for selected columns, `0` for others
        symptoms = [1 if col in present else 0 for col in symptom_columns]
        data.append([disease_id] + symptoms)

    # Build with disease_id first (matching the row layout above), then restore
    # the caller's column order in case disease_id was not the first column.
    mock_data_df = pd.DataFrame(data, columns=['disease_id'] + symptom_columns)
    mock_data_df = mock_data_df[list(hotCoded_df.columns)]

    return mock_data_df

# Seed the global RNG so the generated dataset is the same on every
# Restart & Run All (the generator draws from the `random` module).
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Generate data with at least the minimum share of each disease's symptoms present (`1s`)
mock_data_minimum = generate_minimum_mock_data(
    hotCoded_df=disease_symptom_hotcoded,
    # Number of rows to generate
    num_rows=5000,
    # Smallest share of a disease's allowed symptom columns that must be `1`
    min_percentage=0.33
)

# Save the generated data to a new CSV file
mock_data_minimum.to_csv("../resources/disease_symptom_hotcoded_train.csv", index=False)
print("Mock data generated and saved to 'disease_symptom_hotcoded_train.csv'")


Mock data generated and saved to 'disease_symptom_hotcoded_train.csv


### Inspect what we generated

In [None]:
# Count the number of rows for each disease_id
def count_rows_per_disease(df):
    """Return a two-column frame ('disease_id', 'row_count') with one row per disease.

    Rows of ``df`` are grouped on 'disease_id' and the size of each group is
    reported under 'row_count', ordered by disease_id.
    """
    return (
        df.groupby('disease_id')
        .size()
        .reset_index(name='row_count')
    )

# Tally how many generated rows each disease received
disease_row_counts = count_rows_per_disease(df=mock_data_minimum)

disease_row_counts

Unnamed: 0,disease_id,row_count
0,C0001175,38
1,C0001418,38
2,C0001511,38
3,C0001973,38
4,C0002395,38
...,...,...
128,C1258215,37
129,C1456784,37
130,C1510475,37
131,C1565489,37


In [13]:
# Function to inspect all rows for a specific disease_id, filter columns with `1s`, and replace IDs with names
def inspect_disease_with_names(df, disease_id_to_inspect, lookup_df):
    """Show the rows of one disease with readable disease and symptom names.

    Parameters
    ----------
    df : pd.DataFrame
        Encoded data with a 'disease_id' column and one column per symptom ID.
    disease_id_to_inspect : str
        The disease_id whose rows should be returned.
    lookup_df : pd.DataFrame
        Frame with 'disease_id', 'd_name', 'symptom_id' and 's_name' columns,
        used to translate IDs into names.

    Returns
    -------
    pd.DataFrame
        The matching rows, restricted to symptom columns that equal 1 in at
        least one of them. 'disease_id' is replaced by a 'Disease Name' column
        and symptom-ID columns are renamed to their 's_name'. IDs missing from
        ``lookup_df`` are left as-is. An empty DataFrame is returned (and a
        message printed) when ``df`` has no rows for the disease. ``df`` itself
        is never modified.
    """
    # Filter rows for the specified disease_id
    disease_rows = df[df['disease_id'] == disease_id_to_inspect]

    if disease_rows.empty:
        print(f"No rows found for disease_id: {disease_id_to_inspect}")
        return pd.DataFrame()  # Return an empty DataFrame

    # Identify columns with at least one `1` in the filtered rows
    has_one = (disease_rows == 1).any()
    columns_with_ones = has_one[has_one].index.tolist()

    # Include 'disease_id' column for context
    columns_to_display = ['disease_id'] + [col for col in columns_with_ones if col != 'disease_id']

    # Look up the `d_name` for the disease_id; fall back to the raw ID rather
    # than raising when the lookup frame does not know this disease.
    name_matches = lookup_df.loc[lookup_df['disease_id'] == disease_id_to_inspect, 'd_name']
    d_name = disease_id_to_inspect if name_matches.empty else name_matches.iloc[0]

    # Map symptom_id to s_name, keeping only the columns that are displayed
    symptom_mapping = dict(zip(lookup_df['symptom_id'], lookup_df['s_name']))
    column_names = {col: symptom_mapping[col] for col in columns_to_display if col in symptom_mapping}
    column_names['disease_id'] = 'Disease Name'

    # assign/rename build a new frame, so nothing is written into a slice of
    # `df` (the in-place assignment used to trigger SettingWithCopyWarning).
    return (
        disease_rows[columns_to_display]
        .assign(disease_id=d_name)
        .rename(columns=column_names)
    )

# Example usage
# Source of the ID-to-name lookups: needs disease_id, d_name, symptom_id and s_name columns
lookup_df = disease_symptom_df
# Disease whose generated rows should be displayed
disease_id_to_inspect = 'C0001175'

filtered_disease_data_named = inspect_disease_with_names(
    df=mock_data_minimum,
    disease_id_to_inspect=disease_id_to_inspect,
    lookup_df=lookup_df,
)

# Display the DataFrame with disease and symptom names
filtered_disease_data_named


Unnamed: 0,Disease Name,Pleuritic Pain,Cough,Diarrhea,Fever,Muscle Hypotonia,Night Sweat,Decreased Body Weight,Chill,Tachypnea,Spontaneous Rupture Of Membranes,Productive Cough,Hypotonic,Patient Non Compliance,Feeling Suicidal
0,Acquiredimmuno-Deficiency Syndrome,0,0,0,1,1,0,0,1,1,0,1,1,1,0
133,Acquiredimmuno-Deficiency Syndrome,1,1,0,1,1,0,1,0,1,0,0,1,1,1
266,Acquiredimmuno-Deficiency Syndrome,1,1,1,0,1,1,0,1,1,1,1,1,1,1
399,Acquiredimmuno-Deficiency Syndrome,0,0,1,1,1,0,0,0,1,1,0,1,0,1
532,Acquiredimmuno-Deficiency Syndrome,1,0,1,1,0,0,1,0,1,1,0,1,1,0
665,Acquiredimmuno-Deficiency Syndrome,1,0,0,1,0,0,1,1,0,1,1,1,1,0
798,Acquiredimmuno-Deficiency Syndrome,0,1,1,1,1,0,1,0,1,1,1,1,0,1
931,Acquiredimmuno-Deficiency Syndrome,1,1,1,0,1,1,1,0,0,1,1,0,1,1
1064,Acquiredimmuno-Deficiency Syndrome,0,0,0,1,1,1,1,1,0,1,1,1,1,0
1197,Acquiredimmuno-Deficiency Syndrome,1,1,0,1,0,0,1,0,1,1,0,0,0,1
