In [10]:
import pandas as pd
import numpy as np

# Read the ICH cohort table (path is relative to the notebook's working directory)
filepath = '../ich_data_w_scores_modified.csv'
ich_data = pd.read_csv(filepath)

# Outcome-based subgroups, each selected with a boolean row mask:
#   deceased     -> MORT90 equals 1 (patients who died within 90 days)
#   good_outcome -> MRS90 of 3 or less (good functional outcome)
#   poor_outcome -> MRS90 above 3 (poor functional outcome)
deceased = ich_data.loc[ich_data['MORT90'].eq(1)]
good_outcome = ich_data.loc[ich_data['MRS90'].le(3)]
poor_outcome = ich_data.loc[ich_data['MRS90'].gt(3)]

# Function to calculate and print statistics for a given dataset
def calculate_statistics(data, title):
    """Print a descriptive summary of one patient table.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the columns AGE, GENDER, SMOKING, HTNADM, DMADM,
        GCSADM, VOLUME1, IVH, HTNTYPE, DYSLIPADM, STROKE, AF, CAD, DMTYPE,
        HbA1c, CHOL, TRIG, HDL, LDL, PLT, INR and APTT.
    title : str
        Heading printed above the summary.

    Returns
    -------
    None
        Everything is written to standard output.

    Notes
    -----
    Percentages are expressed relative to ``len(data)`` and rounded to two
    decimals. All statistics are computed before any of them is printed.
    """
    n_rows = len(data)
    print(f"\n{title} (N = {n_rows})")
    print("-" * 40)

    def tally(column):
        """Return (counts, percentage of all rows) for one column as dicts."""
        freq = data[column].value_counts()
        share = (freq / n_rows * 100).round(2)
        return freq.to_dict(), share.to_dict()

    def tally_counts(column):
        """Return the value counts of one column as a dict."""
        return data[column].value_counts().to_dict()

    # --- Demographics ---
    ages = data['AGE']
    mean_age = ages.mean()
    sd_age = ages.std()
    n_80_plus = (ages >= 80).sum()
    n_under_80 = (ages < 80).sum()
    gender_n, gender_pct = tally('GENDER')

    # --- Health behaviours ---
    smoking_n, smoking_pct = tally('SMOKING')

    # --- Comorbidities ---
    htn_n, htn_pct = tally('HTNADM')
    dm_n, dm_pct = tally('DMADM')

    # --- Clinical measures ---
    gcs_by_score = data['GCSADM'].value_counts().sort_index().to_dict()
    volumes = data['VOLUME1']
    volume_median = volumes.median()
    volume_iqr = volumes.quantile(0.75) - volumes.quantile(0.25)
    ivh_n, ivh_pct = tally('IVH')

    # --- Additional clinical history (counts only) ---
    history_counts = [
        tally_counts(column)
        for column in ('HTNTYPE', 'DYSLIPADM', 'STROKE', 'AF', 'CAD')
    ]
    dm_type_n = tally_counts('DMTYPE')

    # --- Lab results: printed label -> mean of the source column ---
    lab_columns = (
        ('HbA1c', 'HbA1c'),
        ('Cholesterol', 'CHOL'),
        ('Triglycerides', 'TRIG'),
        ('HDL', 'HDL'),
        ('LDL', 'LDL'),
        ('Platelets', 'PLT'),
        ('INR', 'INR'),
        ('APTT', 'APTT'),
    )
    lab_results = {label: data[column].mean() for label, column in lab_columns}

    # --- Report ---
    print(f"Age Mean: {mean_age:.1f}, Std Dev: {sd_age:.1f}")
    print(f"Age ≥80 Count: {n_80_plus}, <80 Count: {n_under_80}")
    print(f"Gender Counts: {gender_n}")
    print(f"Gender Percentages: {gender_pct}")
    print(f"Smoking Counts: {smoking_n}, Percentages: {smoking_pct}")
    print(f"Hypertension Counts: {htn_n}, Percentages: {htn_pct}")
    print(f"Diabetes Counts: {dm_n}, Percentages: {dm_pct}")
    print(f"GCS Distribution: {gcs_by_score}")
    print(f"ICH Volume Median: {volume_median}, IQR: {volume_iqr}")
    print(f"IVH Counts: {ivh_n}, Percentages: {ivh_pct}")
    print(f"HTN Type, Dyslipidemia, Stroke, AF, CAD Counts:")
    print(*history_counts)
    print(f"Diabetes Type Counts: {dm_type_n}")
    print("Lab Results:", lab_results)

# Summary of the whole cohort
calculate_statistics(data=ich_data, title="Overall Patient Statistics")

# Same summary for each outcome subgroup
calculate_statistics(data=deceased, title="Deceased Patients")
calculate_statistics(data=good_outcome, title="Patients with Good Outcome")
calculate_statistics(data=poor_outcome, title="Patients with Poor Outcome")



Overall Patient Statistics (N = 1501)
----------------------------------------
Age Mean: 50.8, Std Dev: 13.0
Age ≥80 Count: 55, <80 Count: 1446
Gender Counts: {1: 1257, 0: 244}
Gender Percentages: {1: 83.74, 0: 16.26}
Smoking Counts: {0: 1342, 1: 159}, Percentages: {0: 89.41, 1: 10.59}
Hypertension Counts: {1: 1258, 0: 243}, Percentages: {1: 83.81, 0: 16.19}
Diabetes Counts: {0: 988, 1: 513}, Percentages: {0: 65.82, 1: 34.18}
GCS Distribution: {3: 87, 4: 30, 5: 26, 6: 39, 7: 54, 8: 50, 9: 61, 10: 49, 11: 54, 12: 49, 13: 109, 14: 146, 15: 747}
ICH Volume Median: 7.5, IQR: 12.600000000000001
IVH Counts: {0: 1010, 1: 491}, Percentages: {0: 67.29, 1: 32.71}
HTN Type, Dyslipidemia, Stroke, AF, CAD Counts:
{1: 953, 2: 305, 0: 243} {0: 1074, 1: 427} {0: 1400, 1: 101} {0: 1452, 1: 49} {0: 1422, 1: 79}
Diabetes Type Counts: {0: 755, 1: 513, 2: 233}
Lab Results: {'HbA1c': 6.47771900025583, 'Cholesterol': 4.632579652549634, 'Triglycerides': 1.5994532554083942, 'HDL': 1.0611537674750167, 'LDL': 2

In [6]:

# Age Mean: 50.7981345769487, Std Dev: 13.022233161707595
# Age ≥80 Count: 55, <80 Count: 1446
# Gender Counts: GENDER
# 1    1257
# 0     244
# Name: count, dtype: int64
# Gender Percentages: GENDER
# 1    83.744171
# 0    16.255829
# Name: count, dtype: float64
# Smoking Counts: SMOKING
# 0    1342
# 1     159
# Name: count, dtype: int64, Percentages: SMOKING
# 0    89.407062
# 1    10.592938
# Name: count, dtype: float64
# Hypertension Counts: HTNADM
# 1    1258
# 0     243
# Name: count, dtype: int64, Percentages: HTNADM
# 1    83.810793
# 0    16.189207
# Name: count, dtype: float64
# Diabetes Counts: DMADM
# 0    988
# 1    513
# Name: count, dtype: int64, Percentages: DMADM
# 0    65.822785
# 1    34.177215
# Name: count, dtype: float64
# GCS Distribution: GCSADM
# 3      87
# 4      30
# 5      26
# 6      39
# 7      54
# 8      50
# 9      61
# 10     49
# 11     54
# 12     49
# 13    109
# 14    146
# 15    747
# Name: count, dtype: int64
# ICH Volume Median: 7.5, IQR: 12.600000000000001
# IVH Counts: IVH
# 0    1010
# 1     491
# Name: count, dtype: int64, Percentages: IVH
# 0    67.288474
# 1    32.711526
# Name: count, dtype: float64
# HTN Type, Dyslipidemia, Stroke, AF, CAD Counts:
# HTNTYPE
# 1    953
# 2    305
# 0    243
# Name: count, dtype: int64 DYSLIPADM
# 0    1074
# 1     427
# Name: count, dtype: int64 STROKE
# 0    1400
# 1     101
# Name: count, dtype: int64 AF
# 0    1452
# 1      49
# Name: count, dtype: int64 CAD
# 0    1422
# 1      79
# Name: count, dtype: int64
# Diabetes Type Counts: DMTYPE
# 0    755
# 1    513
# 2    233
# Name: count, dtype: int64
# Mean Blood Test Results:
# HbA1c: 6.47771900025583, Cholesterol: 4.632579652549634, Triglycerides: 1.5994532554083942
# HDL: 1.0611537674750167, LDL: 2.892067339593604, Platelets: 257.85191249307127
# INR: 1.0689456603857428, APTT: 27.875339503957363

In [9]:
import pandas as pd
import numpy as np

# Read the ICH cohort table (path is relative to the notebook's working directory)
filepath = '../ich_data_w_scores_modified.csv'
ich_data = pd.read_csv(filepath)

# Outcome-based subgroups, each selected with a boolean row mask
died_within_90d = ich_data['MORT90'].eq(1)   # patients who died within 90 days
mrs_at_most_3 = ich_data['MRS90'].le(3)      # mRS score <= 3, good functional outcome
mrs_above_3 = ich_data['MRS90'].gt(3)        # mRS score > 3, poor functional outcome
deceased = ich_data[died_within_90d]
good_outcome = ich_data[mrs_at_most_3]
poor_outcome = ich_data[mrs_above_3]

# Function to calculate statistics for a given dataset
def calculate_statistics(data, title):
    """Print age, sex, comorbidity, smoking, ICH-volume and IVH summaries.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the columns AGE, GENDER, HTNADM, DMADM, SMOKING,
        VOLUME1 and IVH.
    title : str
        Heading printed above the summary.

    Returns
    -------
    None
        Everything is written to standard output; count and percentage
        tables are printed as pandas Series.

    Notes
    -----
    Percentages are expressed relative to ``len(data)``.
    """
    n_rows = len(data)
    print(f"\n{title} (N = {n_rows})")
    print("-" * 40)

    def counts_and_share(column):
        """Return (value counts, counts as a percentage of all rows)."""
        freq = data[column].value_counts()
        return freq, freq / n_rows * 100

    # Age
    ages = data['AGE']
    mean_age, sd_age = ages.mean(), ages.std()
    print(f"Age Mean: {mean_age:.1f}, Std Dev: {sd_age:.1f}")

    # Gender
    sex_n, sex_pct = counts_and_share('GENDER')
    print(f"Gender Counts: {sex_n}\nGender Percentages: {sex_pct}")

    # Comorbidities: both are tallied before either is reported
    htn_n, htn_pct = counts_and_share('HTNADM')
    dm_n, dm_pct = counts_and_share('DMADM')
    print(f"Hypertension Counts: {htn_n}, Percentages: {htn_pct}")
    print(f"Diabetes Counts: {dm_n}, Percentages: {dm_pct}")

    # Smoking status
    smoke_n, smoke_pct = counts_and_share('SMOKING')
    print(f"Smoking Counts: {smoke_n}, Percentages: {smoke_pct}")

    # ICH volume: median and interquartile range (Q3 - Q1)
    volumes = data['VOLUME1']
    volume_median = volumes.median()
    volume_iqr = volumes.quantile(0.75) - volumes.quantile(0.25)
    print(f"ICH Volume Median: {volume_median}, IQR: {volume_iqr}")

    # IVH presence
    ivh_n, ivh_pct = counts_and_share('IVH')
    print(f"IVH Counts: {ivh_n}, Percentages: {ivh_pct}")

# Summary of the whole cohort
calculate_statistics(data=ich_data, title="Overall Patient Statistics")

# Same summary for each outcome subgroup
calculate_statistics(data=deceased, title="Deceased Patients")
calculate_statistics(data=good_outcome, title="Patients with Good Outcome")
calculate_statistics(data=poor_outcome, title="Patients with Poor Outcome")



Overall Patient Statistics (N = 1501)
----------------------------------------
Age Mean: 50.8, Std Dev: 13.0
Gender Counts: GENDER
1    1257
0     244
Name: count, dtype: int64
Gender Percentages: GENDER
1    83.744171
0    16.255829
Name: count, dtype: float64
Hypertension Counts: HTNADM
1    1258
0     243
Name: count, dtype: int64, Percentages: HTNADM
1    83.810793
0    16.189207
Name: count, dtype: float64
Diabetes Counts: DMADM
0    988
1    513
Name: count, dtype: int64, Percentages: DMADM
0    65.822785
1    34.177215
Name: count, dtype: float64
Smoking Counts: SMOKING
0    1342
1     159
Name: count, dtype: int64, Percentages: SMOKING
0    89.407062
1    10.592938
Name: count, dtype: float64
ICH Volume Median: 7.5, IQR: 12.600000000000001
IVH Counts: IVH
0    1010
1     491
Name: count, dtype: int64, Percentages: IVH
0    67.288474
1    32.711526
Name: count, dtype: float64

Deceased Patients (N = 222)
----------------------------------------
Age Mean: 52.1, Std Dev: 15.7
Gen

In [14]:
import pandas as pd
import numpy as np
import scipy.stats as stats

# Read the ICH cohort table (path is relative to the notebook's working directory)
filepath = '../ich_data_w_scores_modified.csv'
ich_data = pd.read_csv(filepath)

# Outcome-based subgroups, each selected with a boolean row mask
deceased = ich_data.loc[ich_data['MORT90'].eq(1)]      # MORT90 equals 1
good_outcome = ich_data.loc[ich_data['MRS90'].le(3)]   # MRS90 of 3 or less
poor_outcome = ich_data.loc[ich_data['MRS90'].gt(3)]   # MRS90 above 3

# Function to calculate statistics for a given dataset
def calculate_statistics(data, title, total_data=None):
    """Print descriptive statistics for ``data`` and optionally test it against the rest of a cohort.

    For each of AGE, GENDER, HTNADM, DMADM, SMOKING, VOLUME1 and IVH the
    column is treated as categorical when it has object dtype or at most 10
    distinct values in ``data``; otherwise it is treated as continuous.

    * Categorical: value counts and percentages of ``len(data)`` are printed.
    * Continuous: mean, standard deviation, median and IQR are printed.

    When ``total_data`` is given, ``data`` is compared with the rows of
    ``total_data`` that are not in ``data``:

    * Categorical: chi-squared test on the (in subgroup?) x (category)
      contingency table built over all of ``total_data``.
    * Continuous: two-sample t-test (``scipy.stats.ttest_ind``, NaNs omitted).

    The resulting p-values are printed as a Markdown table.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to summarise. When ``total_data`` is given, its index labels
        must all be present in ``total_data``.
    title : str
        Heading printed above the summary.
    total_data : pandas.DataFrame, optional
        Full cohort that ``data`` was selected from. ``None`` (default)
        disables the comparative tests.

    Returns
    -------
    None
        Everything is written to standard output.

    Raises
    ------
    KeyError
        If ``data`` has index labels that are missing from ``total_data``
        (raised by ``DataFrame.drop``), or a required column is absent.
    """
    compare = total_data is not None
    if compare:
        # Rows of the cohort that do not belong to the subgroup.
        excluded_data = total_data.drop(data.index)
        # Membership flag, one entry per cohort row. The contingency table has
        # to be built from this flag: cross-tabulating data[var] against
        # excluded_data[var] aligns two disjoint indices and is always empty.
        in_subgroup = total_data.index.isin(data.index)

    total_patients = len(data)
    print(f"\n{title} (N = {total_patients})")
    print("-" * 40)

    variables = ['AGE', 'GENDER', 'HTNADM', 'DMADM', 'SMOKING', 'VOLUME1', 'IVH']
    results = []
    for var in variables:
        if data[var].dtype == 'O' or len(data[var].unique()) <= 10:  # Categorical data
            counts = data[var].value_counts()
            percentages = counts / total_patients * 100
            print(f"{var} Counts: {counts}\n{var} Percentages: {percentages}")
            if compare:
                # Plain arrays are passed so the table is built positionally,
                # independent of how the cohort's index looks.
                contingency_table = pd.crosstab(
                    in_subgroup,
                    total_data[var].to_numpy(),
                    rownames=['in_subgroup'],
                    colnames=[var],
                )
                # Need both groups and at least two observed categories;
                # crosstab only emits observed levels, so no row/column is empty.
                if min(contingency_table.shape) < 2:
                    print(f"{var}: Insufficient data for chi-squared test")
                else:
                    _, p_value, _, _ = stats.chi2_contingency(contingency_table)
                    results.append([var, p_value, "Chi-squared"])
        else:  # Continuous data
            mean = data[var].mean()
            std = data[var].std()
            median = data[var].median()
            iqr = data[var].quantile(0.75) - data[var].quantile(0.25)
            print(f"{var} Mean: {mean:.2f}, Std: {std:.2f}, Median: {median}, IQR: {iqr}")
            if compare:
                _, p_value = stats.ttest_ind(data[var], excluded_data[var], nan_policy='omit')
                results.append([var, p_value, "T-test"])

    if compare and results:
        markdown_table = "| Variable | P-Value | Test Type |\n|---|---|---|\n"
        for name, p_value, test_name in results:
            markdown_table += f"| {name} | {p_value:.3f} | {test_name} |\n"
        print(markdown_table)

# Whole cohort: descriptive statistics only, no comparative tests
calculate_statistics(data=ich_data, title="Overall Patient Statistics")

# Outcome subgroups: descriptive statistics plus comparison against the full cohort
calculate_statistics(data=deceased, title="Deceased Patients", total_data=ich_data)
calculate_statistics(data=good_outcome, title="Patients with Good Outcome", total_data=ich_data)
calculate_statistics(data=poor_outcome, title="Patients with Poor Outcome", total_data=ich_data)



Overall Patient Statistics (N = 1501)
----------------------------------------
AGE Mean: 50.80, Std: 13.02, Median: 49.0, IQR: 16.0
GENDER Counts: GENDER
1    1257
0     244
Name: count, dtype: int64
GENDER Percentages: GENDER
1    83.744171
0    16.255829
Name: count, dtype: float64
HTNADM Counts: HTNADM
1    1258
0     243
Name: count, dtype: int64
HTNADM Percentages: HTNADM
1    83.810793
0    16.189207
Name: count, dtype: float64
DMADM Counts: DMADM
0    988
1    513
Name: count, dtype: int64
DMADM Percentages: DMADM
0    65.822785
1    34.177215
Name: count, dtype: float64
SMOKING Counts: SMOKING
0    1342
1     159
Name: count, dtype: int64
SMOKING Percentages: SMOKING
0    89.407062
1    10.592938
Name: count, dtype: float64
VOLUME1 Mean: 11.85, Std: 12.86, Median: 7.5, IQR: 12.600000000000001
IVH Counts: IVH
0    1010
1     491
Name: count, dtype: int64
IVH Percentages: IVH
0    67.288474
1    32.711526
Name: count, dtype: float64

Deceased Patients (N = 222)
-----------------