In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv("participant_info.csv")

In [2]:
df.head()

Unnamed: 0,SID,AGE,GENDER,BMI,OAHI,AHI,Mean_SaO2,Arousal Index,MEDICAL_HISTORY,Sleep_Disorders
0,S002,65.9,M,27.0,19,19,91%,98,"Asthma, Body Pain, GERD, Hypertension, Sleep A...",OSA
1,S003,29.38,F,51.0,34,37,95%,28,,"snoring, sleep apnea, difficulty breathing, sn..."
2,S004,55.66,F,41.0,63,99,89%,109,"Arrhythmia, Body Pain, Depression, Dyspnea, GERD",difficulty breathing
3,S005,49.12,F,43.0,19,20,95%,28,"Asthma, Body Pain, Depression, Diabetes, Dyspn...",OSA
4,S006,36.91,F,22.0,4,5,97%,34,"Depression, Sleep Apnea",OSA


In [3]:
# Clean and preprocess MEDICAL_HISTORY and Sleep_Disorders columns

# Replace 'None' with an empty string for easier processing
df['MEDICAL_HISTORY'] = df['MEDICAL_HISTORY'].fillna('')
df['Sleep_Disorders'] = df['Sleep_Disorders'].fillna('')

# Tokenize and split the medical history and sleep disorders
df['MEDICAL_HISTORY'] = df['MEDICAL_HISTORY'].str.split(', ')
df['Sleep_Disorders'] = df['Sleep_Disorders'].str.split(', ')

# Extract unique conditions and disorders
unique_medical_conditions = set(condition for sublist in df['MEDICAL_HISTORY'] for condition in sublist)
unique_sleep_disorders = set(disorder for sublist in df['Sleep_Disorders'] for disorder in sublist)

# Create binary columns for each unique medical condition and sleep disorder
for condition in unique_medical_conditions:
    df[f'MH_{condition}'] = df['MEDICAL_HISTORY'].apply(lambda x: 1 if condition in x else 0)
for disorder in unique_sleep_disorders:
    df[f'SD_{disorder}'] = df['Sleep_Disorders'].apply(lambda x: 1 if disorder in x else 0)

# Drop the original columns to simplify the dataset for correlation analysis
binary_data = df.drop(columns=['MEDICAL_HISTORY', 'Sleep_Disorders'])

# Drop non-numeric columns before correlation computation
numeric_columns = binary_data.select_dtypes(include=['number'])

# Compute correlation matrix between medical conditions and sleep disorders
correlation_matrix = numeric_columns.corr().filter(like='MH_', axis=0).filter(like='SD_', axis=1)

# Display the updated correlation matrix
#import ace_tools as tools; tools.display_dataframe_to_user(name="Corrected Correlation Matrix", dataframe=correlation_matrix)

print(correlation_matrix)


                 SD_MCI and Sleep apnea       SD_   SD_none  \
MH_                            0.259998  0.027546  0.369564   
MH_Anxiety                    -0.062675  0.123791 -0.089087   
MH_CAD                        -0.035333 -0.088820 -0.050223   
MH_Hypertension               -0.102534 -0.005054 -0.145743   
MH_Asthma                     -0.048676 -0.122362 -0.069189   
MH_Sleep Apnea                -0.054929 -0.138080 -0.078077   
MH_Migraine                   -0.029637 -0.074501 -0.042126   
MH_GERD                       -0.058026  0.048622 -0.082479   
MH_Diabetes                   -0.050252 -0.126323 -0.071429   
MH_Body Pain                  -0.075378  0.073688 -0.107143   
MH_Depression                 -0.080362 -0.202013 -0.114227   
MH_Dyspnea                    -0.035333 -0.088820 -0.050223   
MH_Arrhythmia                 -0.035333 -0.088820 -0.050223   

                 SD_difficulty breathing  SD_diffifulty breathing  SD_snoring  \
MH_                        -8.759772

In [4]:
# Flatten the correlation matrix to make it easier to find the strongest correlations
flattened_corr = correlation_matrix.unstack().reset_index()
flattened_corr.columns = ['Medical_History', 'Sleep_Disorder', 'Correlation']

# Sort by absolute value of correlations to find the strongest positive and negative correlations
flattened_corr['Abs_Correlation'] = flattened_corr['Correlation'].abs()
sorted_corr = flattened_corr.sort_values(by='Abs_Correlation', ascending=False)

# Extract the top positive and negative correlations
top_positive = sorted_corr[sorted_corr['Correlation'] > 0].head(5)
top_negative = sorted_corr[sorted_corr['Correlation'] < 0].head(5)

# Print top positive correlations
print("Top Positive Correlations:")
print(top_positive[['Medical_History', 'Sleep_Disorder', 'Correlation']])

# Print top negative correlations
print("\nTop Negative Correlations:")
print(top_negative[['Medical_History', 'Sleep_Disorder', 'Correlation']])



Top Positive Correlations:
    Medical_History  Sleep_Disorder  Correlation
83           SD_OSA  MH_Sleep Apnea     0.471870
26          SD_none             MH_     0.369564
136      SD_fatigue     MH_Migraine     0.340825
344  SD_hypersomnia     MH_Migraine     0.340825
261      SD_bruxism      MH_Anxiety     0.327327

Top Negative Correlations:
          Medical_History   Sleep_Disorder  Correlation
239                SD_EDS   MH_Sleep Apnea    -0.291946
70             SD_snoring   MH_Sleep Apnea    -0.222100
263            SD_bruxism  MH_Hypertension    -0.208248
23                    SD_    MH_Depression    -0.202013
99   SD_morning headaches      MH_Diabetes    -0.175781
