In [70]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization


In [71]:
# Load the Synthea COVID-19 CSV exports used throughout this notebook.
# The directory is named once so the dataset location can be changed in a
# single place instead of in every read_csv call.
DATA_DIR = '10k_synthea_covid19_csv'

conditions = pd.read_csv(f'{DATA_DIR}/conditions.csv')
immunizations = pd.read_csv(f'{DATA_DIR}/immunizations.csv')
medications = pd.read_csv(f'{DATA_DIR}/medications.csv')
observations = pd.read_csv(f'{DATA_DIR}/observations.csv')
patients = pd.read_csv(f'{DATA_DIR}/patients.csv')
allergies = pd.read_csv(f'{DATA_DIR}/allergies.csv')
careplans = pd.read_csv(f'{DATA_DIR}/careplans.csv')

# The patients table may expose its key as 'Id'; rename it so that every
# table can be grouped and joined on the same 'PATIENT' column.
if 'Id' in patients.columns:
    patients = patients.rename(columns={'Id': 'PATIENT'})

# Every table loaded above is later grouped or merged on 'PATIENT', so fail
# fast (with the table name) if any of them lacks that column, rather than
# hitting a KeyError deep inside the feature-building cells.
for dataset_name, dataset in {
    'patients': patients,
    'observations': observations,
    'conditions': conditions,
    'medications': medications,
    'careplans': careplans,
    'immunizations': immunizations,
    'allergies': allergies,
}.items():
    assert 'PATIENT' in dataset.columns, f"PATIENT column missing in {dataset_name} dataset"


In [72]:

# Step 1: Consolidate all data into a single row per patient
# Each event table (observations, medications, ...) has one row per event.
# The same two-step recipe is applied to every table, so it lives in two small
# helpers instead of being copy-pasted six times.


def _encode_descriptions(records, prefix):
    """One-hot encode the DESCRIPTION column of an event table.

    Parameters
    ----------
    records : pandas.DataFrame
        Event table containing at least 'PATIENT' and 'DESCRIPTION' columns.
    prefix : str
        Prefix for the generated indicator columns (e.g. 'OBSERVATION'
        produces columns named 'OBSERVATION_<description>').

    Returns
    -------
    pandas.DataFrame
        One row per input row: 'PATIENT' plus one indicator column per
        distinct DESCRIPTION value.
    """
    return pd.get_dummies(
        records[['PATIENT', 'DESCRIPTION']],
        columns=['DESCRIPTION'],
        prefix=prefix,
    )


def _count_per_patient(onehot):
    """Collapse a one-hot event table to one row per patient.

    Summing the indicator columns within each 'PATIENT' group turns them into
    per-patient occurrence counts. 'PATIENT' is returned as a regular column.
    """
    return onehot.groupby('PATIENT').sum().reset_index()


# Observations
observations_onehot = _encode_descriptions(observations, 'OBSERVATION')
observations_features = _count_per_patient(observations_onehot)

# Medications
medications_onehot = _encode_descriptions(medications, 'MEDICATION')
medications_features = _count_per_patient(medications_onehot)

# Care Plans
careplans_onehot = _encode_descriptions(careplans, 'CAREPLAN')
careplans_features = _count_per_patient(careplans_onehot)

# Immunizations
immunizations_onehot = _encode_descriptions(immunizations, 'IMMUNIZATION')
immunizations_features = _count_per_patient(immunizations_onehot)

# Conditions
conditions_onehot = _encode_descriptions(conditions, 'CONDITION')
conditions_features = _count_per_patient(conditions_onehot)

# Allergies
allergies_onehot = _encode_descriptions(allergies, 'ALLERGY')
allergies_features = _count_per_patient(allergies_onehot)


In [78]:
# Step 2: Merge all features into a single DataFrame
# No-op when 'Id' was already renamed; keeps this cell runnable on a freshly loaded patients table.
patients = patients.rename(columns={'Id': 'PATIENT'})

# Left-join each per-patient count table onto the demographics. A patient with
# no rows in a given table gets NaN in that table's columns (filled below).
merged_features = patients.copy()
for feature_table in (
    observations_features,
    medications_features,
    careplans_features,
    immunizations_features,
    conditions_features,
    allergies_features,
):
    merged_features = merged_features.merge(feature_table, on='PATIENT', how='left')

# Parse the date columns BEFORE filling missing values. Filling first would
# replace a living patient's missing DEATHDATE with the integer 0, which
# to_datetime can interpret as the Unix epoch (1970-01-01) instead of NaT.
# That would make living patients look deceased and corrupt AGE_AT_DEATH.
date_columns = ['BIRTHDATE', 'DEATHDATE']
for date_column in date_columns:
    merged_features[date_column] = pd.to_datetime(merged_features[date_column], errors='coerce')

# Fill missing values with 0 everywhere except the date columns, where a
# missing value (NaT) is meaningful and must be preserved.
fill_values = {column: 0 for column in merged_features.columns if column not in date_columns}
merged_features = merged_features.fillna(value=fill_values)

# Convert date fields to meaningful numeric features
# AGE is measured in whole calendar years relative to the current year, so it
# depends on when the notebook is run. Rows without a birth date get 0.
merged_features['AGE'] = (pd.Timestamp.now().year - merged_features['BIRTHDATE'].dt.year).fillna(0).astype(int)
# AGE_AT_DEATH is 0 for patients without a death date. It is a direct proxy
# for the death outcome and must not be used as a model input.
merged_features['AGE_AT_DEATH'] = (merged_features['DEATHDATE'].dt.year - merged_features['BIRTHDATE'].dt.year).fillna(0).astype(int)

# Remove unnecessary columns (identifiers and free-text address fields)
irrelevant_columns = ['SSN', 'DRIVERS', 'PASSPORT', 'PREFIX', 'FIRST', 'LAST', 'SUFFIX', 'MAIDEN', 'MARITAL', 'ADDRESS', 'CITY', 'STATE', 'ZIP']
merged_features = merged_features.drop(columns=irrelevant_columns, errors='ignore')

# One-hot encode remaining categorical columns, including RACE, GENDER, and ETHNICITY, but exclude 'PATIENT'
categorical_columns = merged_features.select_dtypes(include=['object', 'category']).columns
categorical_columns = categorical_columns[categorical_columns != 'PATIENT']  # Exclude 'PATIENT' column
print(f"Categorical columns being encoded: {categorical_columns}")  # Debugging step

# drop_first=True drops one level per categorical column to avoid redundant indicator columns.
merged_features = pd.get_dummies(merged_features, columns=categorical_columns, drop_first=True)


Categorical columns being encoded: Index(['RACE', 'ETHNICITY', 'GENDER', 'BIRTHPLACE', 'COUNTY'], dtype='object')


In [79]:

# Step 3: Add target variable
# Determine COVID-19 status
# 840539006 is the SNOMED CT concept id for COVID-19 (it is not an ICD-10 code).
covid_code = "840539006"
# pandas reads an all-digit CODE column as integers, and comparing integers
# with a string never matches, which would label every patient as COVID-free.
# Compare numerically so the match works whether CODE was read as int, float or str.
is_covid_condition = pd.to_numeric(conditions['CODE'], errors='coerce') == int(covid_code)
covid_patients = conditions.loc[is_covid_condition, 'PATIENT'].unique()
assert 'PATIENT' in merged_features.columns, "PATIENT column missing in merged_features"
merged_features['COVID'] = merged_features['PATIENT'].isin(covid_patients).astype(int)

# Determine death status
# Use the raw patients table: a missing DEATHDATE there means "alive". The
# merged frame is not a safe source, because a placeholder written into a
# missing DEATHDATE upstream would register as a real death date.
deceased_patients = patients.loc[patients['DEATHDATE'].notna(), 'PATIENT']
merged_features['DIED'] = merged_features['PATIENT'].isin(deceased_patients).astype(int)

# Classify patients into four categories
# Vectorised equivalent of a row-by-row if/else chain over (COVID, DIED).
had_covid = merged_features['COVID'].astype(bool)
died = merged_features['DIED'].astype(bool)
merged_features['CLASS'] = np.select(
    [had_covid & died, had_covid & ~died, ~had_covid & died],
    ['had_covid_died', 'had_covid_lived', 'no_covid_died'],
    default='no_covid_lived',
)

In [80]:

# Step 4: Prepare dataset for training
# Columns that restate the label must not be fed to the model:
#   * AGE_AT_DEATH is non-zero only for deceased patients (leaks DIED).
#   * The count column for the COVID-19 condition is the very fact the COVID
#     label is derived from (leaks COVID).
# NOTE(review): other COVID-specific observation / care-plan / medication
# columns may leak the label as well; audit them before trusting the metrics.
covid_condition_columns = [
    f'CONDITION_{description}'
    for description in conditions.loc[
        pd.to_numeric(conditions['CODE'], errors='coerce') == int(covid_code),
        'DESCRIPTION',
    ].dropna().unique()
]
leaky_columns = ['AGE_AT_DEATH'] + covid_condition_columns

X = merged_features.drop(columns=['PATIENT', 'COVID', 'DIED', 'BIRTHDATE', 'DEATHDATE', 'CLASS'])
X = X.drop(columns=leaky_columns, errors='ignore')
y = merged_features['CLASS']

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split into training and testing sets
# Split BEFORE scaling so that the scaler never sees test rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Standardize numerical features
# Fit on the training split only, then apply the same transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with code that expects the full scaled matrix;
# it uses the training-set statistics.
X_scaled = scaler.transform(X)


In [None]:

# Step 5: Define the neural network
# This is a fully connected feed-forward classifier (Dense layers only); it is
# not a convolutional network.

# Seed NumPy and TensorFlow so weight initialisation, dropout masks and batch
# shuffling are repeatable from one run to the next.
np.random.seed(42)
tf.random.set_seed(42)

# Three hidden blocks of Dense -> BatchNormalization -> Dropout with shrinking
# width and dropout rate, followed by a softmax over the encoded classes.
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile model
# sparse_categorical_crossentropy matches the integer labels produced by LabelEncoder.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train model
# validation_split holds out 20% of the training data for per-epoch validation.
history = model.fit(X_train, y_train, validation_split=0.2, epochs=20, batch_size=32, verbose=1)

# Evaluate model
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

# Save model
model.save('covid_mortality_full_model.h5')
print("Model saved as 'covid_mortality_full_model.h5'")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30

KeyboardInterrupt: 



ValueError: X has 4 features, but StandardScaler is expecting 7 features as input.