In [1]:
import pandas as pd


# CSV files for the training, validation and test splits.
_DATA_DIR = '../data'
TRAIN_DATA = f'{_DATA_DIR}/train.csv'
VAL_DATA = f'{_DATA_DIR}/val.csv'
TEST_DATA = f'{_DATA_DIR}/test.csv'

# Columns routed to StandardScaler by train_and_predict.
NUM_FEATURES = [
    'BMI',
    'HDL cholesterol', 'LDL cholesterol', 'Total cholesterol',
    'Triglycerides',
    'Diastolic blood pressure',
]

# Columns routed to OneHotEncoder by train_and_predict.
CAT_FEATURES = [
    'Age', 'Sex',
    'Ever smoked',
    'Snoring', 'Insomnia', 'Daytime napping', 'Chronotype', 'Sleep duration',
]

# Label columns; one baseline and one expanded model is trained per entry.
DISEASES = [
    'Asthma', 'Cataract', 'Diabetes', 'GERD',
    'Hay-fever & Eczema',
    'Major depression', 'Myocardial infarction',
    'Osteoarthritis', 'Pneumonia', 'Stroke',
]

In [2]:
# Read the three CSV splits into DataFrames in train / val / test order.
DF_TRAIN, DF_VAL, DF_TEST = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DATA, VAL_DATA, TEST_DATA)
)

In [4]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score
import json
import os


def train_and_predict(X_train, y_train, X_val, y_val, features, disease_name):
    """Fit a logistic-regression pipeline and score it on the validation data.

    Entries of ``features`` found in ``NUM_FEATURES`` are standardized with
    ``StandardScaler``; entries found in ``CAT_FEATURES`` are one-hot encoded
    (categories unseen during fitting are ignored at prediction time). The
    transformed columns feed a ``LogisticRegression(random_state=42)``.

    Args:
        X_train: Training table; indexed with ``features`` to select inputs.
        y_train: Training labels passed to ``fit``.
        X_val: Validation table; indexed with ``features`` to select inputs.
        y_val: Validation labels used to compute AUROC and AUPRC.
        features: Column names to use as model inputs. Every entry must be
            listed in ``NUM_FEATURES`` or ``CAT_FEATURES``.
        disease_name: Label used only in the printed report.

    Returns:
        Tuple ``(model, y_pred, auroc, auprc)``: the fitted ``Pipeline``, the
        second column of ``predict_proba`` on the validation inputs
        (presumably the positive-class probability for 0/1 labels), and the
        two validation metrics.

    Raises:
        ValueError: If ``features`` contains a name that is in neither
            ``NUM_FEATURES`` nor ``CAT_FEATURES``.
    """
    # Separate numeric and categorical features
    numeric_features = [col for col in features if col in NUM_FEATURES]
    categorical_features = [col for col in features if col in CAT_FEATURES]

    # ColumnTransformer discards columns that no transformer claims (its
    # default is remainder='drop'), so an unclassified feature would silently
    # vanish from the model. Fail loudly instead.
    unassigned = [
        col for col in features
        if col not in NUM_FEATURES and col not in CAT_FEATURES
    ]
    if unassigned:
        raise ValueError(
            f"Features not listed in NUM_FEATURES or CAT_FEATURES: {unassigned}"
        )

    # Create preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

    # Create and train model
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=42))
    ])

    model.fit(X_train[features], y_train)

    # Predict on validation set (column 1 = second entry of model.classes_)
    y_pred = model.predict_proba(X_val[features])[:, 1]

    # Calculate AUROC and AUPRC
    auroc = roc_auc_score(y_val, y_pred)
    auprc = average_precision_score(y_val, y_pred)

    # Print AUROC and AUPRC
    print(f"Disease: {disease_name}")
    print(f"  AUROC on validation set: {auroc:.4f}")
    print(f"  AUPRC on validation set: {auprc:.4f}")

    return model, y_pred, auroc, auprc


def save_scores(eids, y_true, y_scores, disease, feature_set, out_dir):
    """Write ids, labels and scores for one disease to a JSON file.

    The file is ``<out_dir>/<feature_set>/rs_<trait>.json`` where ``<trait>``
    is ``disease`` lower-cased with spaces replaced by hyphens. The target
    directory is created if it does not exist, and the final path is printed.

    Args:
        eids: Array-like exposing ``.tolist()``; stored under ``"eids"``.
        y_true: Array-like exposing ``.tolist()``; stored under ``"y_true"``.
        y_scores: Array-like exposing ``.tolist()``; stored under ``"y_scores"``.
        disease: Disease name used to derive the file name.
        feature_set: Sub-directory name under ``out_dir``.
        out_dir: Root output directory.
    """
    target_dir = os.path.join(out_dir, feature_set)
    os.makedirs(target_dir, exist_ok=True)

    slug = disease.lower().replace(" ", "-")
    full_path = os.path.join(target_dir, f"rs_{slug}.json")

    with open(full_path, "w") as handle:
        # Convert to plain Python lists so the json module can serialize them.
        payload = {
            "eids": eids.tolist(),
            "y_true": y_true.tolist(),
            "y_scores": y_scores.tolist(),
        }
        json.dump(payload, handle)
    print(f"Scores saved successfully: {full_path}\n")

In [5]:
# Two feature sets: a three-column baseline and every numeric + categorical column.
baseline_features = ['Age', 'Sex', 'BMI']
expanded_features = NUM_FEATURES + CAT_FEATURES

# Root directory for the score files, and an in-memory record of each
# fitted model with its validation metrics, keyed by scenario then disease.
out_dir = '../scores/log_reg'
results = {'baseline': {}, 'expanded': {}}

for disease in DISEASES:
    print(f"Training models for {disease}")

    # Labels for this disease and the validation ids written next to the scores.
    train_labels = DF_TRAIN[disease]
    val_labels = DF_VAL[disease]

    # Baseline: fit on train, score on validation, persist validation scores.
    baseline_model, baseline_pred, baseline_auroc, baseline_auprc = train_and_predict(
        DF_TRAIN, train_labels, DF_VAL, val_labels,
        baseline_features, f"{disease} (Baseline)",
    )
    results['baseline'][disease] = dict(
        model=baseline_model, auroc=baseline_auroc, auprc=baseline_auprc
    )
    save_scores(
        DF_VAL['eid'].values, val_labels.values, baseline_pred,
        disease, 'baseline', out_dir,
    )

    # Expanded: same procedure with the full feature list.
    expanded_model, expanded_pred, expanded_auroc, expanded_auprc = train_and_predict(
        DF_TRAIN, train_labels, DF_VAL, val_labels,
        expanded_features, f"{disease} (Expanded)",
    )
    results['expanded'][disease] = dict(
        model=expanded_model, auroc=expanded_auroc, auprc=expanded_auprc
    )
    save_scores(
        DF_VAL['eid'].values, val_labels.values, expanded_pred,
        disease, 'expanded', out_dir,
    )

# Recap of the validation metrics, grouped by scenario.
print("Summary of AUROC and AUPRC scores:")
for scenario in ('baseline', 'expanded'):
    print(f"\n{scenario.capitalize()} scenario:")
    for disease, result in results[scenario].items():
        print(f"  {disease}:")
        print(f"    AUROC: {result['auroc']:.4f}")
        print(f"    AUPRC: {result['auprc']:.4f}")

print(f"\nAll scores have been saved in the '{out_dir}' directory.")

Training models for Asthma


Disease: Asthma (Baseline)
  AUROC on validation set: 0.5502
  AUPRC on validation set: 0.1816
Scores saved successfully: ../scores/log_reg/baseline/rs_asthma.json

Disease: Asthma (Expanded)
  AUROC on validation set: 0.5754
  AUPRC on validation set: 0.1940
Scores saved successfully: ../scores/log_reg/expanded/rs_asthma.json

Training models for Cataract
Disease: Cataract (Baseline)
  AUROC on validation set: 0.7451
  AUPRC on validation set: 0.3353
Scores saved successfully: ../scores/log_reg/baseline/rs_cataract.json

Disease: Cataract (Expanded)
  AUROC on validation set: 0.7479
  AUPRC on validation set: 0.3404
Scores saved successfully: ../scores/log_reg/expanded/rs_cataract.json

Training models for Diabetes
Disease: Diabetes (Baseline)
  AUROC on validation set: 0.7507
  AUPRC on validation set: 0.2670
Scores saved successfully: ../scores/log_reg/baseline/rs_diabetes.json

Disease: Diabetes (Expanded)
  AUROC on validation set: 0.8098
  AUPRC on validation set: 0.3804
Scores s