In [2]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import TimeSeriesSplit
# from tsfresh import extract_features, select_features
# from tsfresh.utilities.dataframe_functions import impute
# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# from sklearn.metrics import roc_auc_score, average_precision_score, f1_score
# import json

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, classification_report
from sklearn.model_selection import GroupShuffleSplit
# from xgboost import XGBClassifier

In [3]:
# ============================================
# AUTOIMMUNE CONDITION VOCABULARY
# ============================================
# Lower-case condition names, spelling variants and abbreviations, grouped by
# body system. The groups are flattened, in declaration order, into the flat
# `autoimmune_conditions` list defined at the bottom of this cell.

_conditions_by_system = {
    "endocrine": (
        "type 1 diabetes",
        "diabetes type 1",
        "graves' disease",
        "graves disease",
        "hashimoto's thyroiditis",
        "hashimoto's disease",
        "hashimoto",
        "addison's disease",
        "addison disease",
        "autoimmune thyroiditis",
        "thyroiditis",
    ),
    "rheumatologic / connective tissue": (
        "rheumatoid arthritis",
        "rheumatoid",
        "systemic lupus erythematosus",
        "lupus",
        "sle",
        "discoid lupus",
        "cutaneous lupus",
        "scleroderma",
        "systemic sclerosis",
        "limited scleroderma",
        "crest syndrome",
        "sjogren's syndrome",
        "sjogren",
        "sjögren",
        "mixed connective tissue disease",
        "mctd",
        "undifferentiated connective tissue disease",
        "uctd",
        "polymyositis",
        "dermatomyositis",
        "polymyalgia rheumatica",
        "ankylosing spondylitis",
        "axial spondyloarthritis",
        "psoriatic arthritis",
        "reactive arthritis",
        "enteropathic arthritis",
        "palindromic rheumatism",
        "adult still's disease",
        "stills disease",
        "relapsing polychondritis",
        "behcet's disease",
        "behcet's syndrome",
        "behcets",
    ),
    "vasculitis": (
        "vasculitis",
        "granulomatosis with polyangiitis",
        "wegener's granulomatosis",
        "gpa",
        "microscopic polyangiitis",
        "eosinophilic granulomatosis with polyangiitis",
        "churg-strauss syndrome",
        "takayasu's arteritis",
        "giant cell arteritis",
        "temporal arteritis",
        "henoch-schonlein purpura",
        "henoch-schönlein purpura",
        "iga vasculitis",
        "kawasaki disease",
        "polyarteritis nodosa",
    ),
    "gastrointestinal": (
        "crohn's disease",
        "crohn disease",
        "crohns",
        "ulcerative colitis",
        "inflammatory bowel disease",
        "ibd",
        "celiac disease",
        "celiac",
        "coeliac",
        "autoimmune hepatitis",
        "primary biliary cholangitis",
        "primary biliary cirrhosis",
        "pbc",
        "primary sclerosing cholangitis",
        "psc",
        "autoimmune pancreatitis",
        "autoimmune gastritis",
        "pernicious anemia",
        "autoimmune atrophic gastritis",
    ),
    "neurological": (
        "multiple sclerosis",
        "guillain-barre syndrome",
        "guillain barre",
        "chronic inflammatory demyelinating polyneuropathy",
        "cidp",
        "myasthenia gravis",
        "lambert-eaton myasthenic syndrome",
        "stiff person syndrome",
        "transverse myelitis",
        "neuromyelitis optica",
        "autoimmune encephalitis",
        "anti-nmda receptor encephalitis",
        "hashimoto's encephalopathy",
    ),
    "hematologic": (
        "autoimmune hemolytic anemia",
        "immune thrombocytopenic purpura",
        "itp",
        "immune thrombocytopenia",
        "antiphospholipid syndrome",
        "antiphospholipid antibody syndrome",
        "aps",
        "aplastic anemia",
        "autoimmune neutropenia",
    ),
    "dermatologic": (
        "psoriasis",
        "vitiligo",
        "alopecia areata",
        "pemphigus vulgaris",
        "pemphigus",
        "bullous pemphigoid",
        "pemphigoid",
        "dermatitis herpetiformis",
        "lichen planus",
        "lichen sclerosus",
        "morphea",
        "linear scleroderma",
    ),
    "renal": (
        "goodpasture syndrome",
        "iga nephropathy",
        "lupus nephritis",
        "anti-gbm disease",
    ),
    "cardiac": (
        "autoimmune myocarditis",
        "rheumatic fever",
        "rheumatic heart disease",
    ),
    "pulmonary": (
        "sarcoidosis",
        "eosinophilic pneumonia",
    ),
    "ophthalmologic": (
        "autoimmune uveitis",
        "graves' ophthalmopathy",
        "thyroid eye disease",
    ),
    "other / multi-system": (
        "igg4-related disease",
        "autoimmune autonomic neuropathy",
        "autoimmune inner ear disease",
        "raynaud's disease",
        "autoimmune progesterone dermatitis",
        "autoimmune urticaria",
        "chronic autoimmune urticaria",
        "pans",
        "pandas",
    ),
    "general indicators": (
        "autoimmune",
        "auto-immune",
        "autoinflammatory",
        "auto-inflammatory",
    ),
}

# Flat list, in the same order as the groups above (dicts preserve insertion order).
autoimmune_conditions = [
    term
    for system_terms in _conditions_by_system.values()
    for term in system_terms
]

In [4]:
# ============================================
# LOAD ENRICHED FEATURE MATRIX
# ============================================

# The path is relative to the notebook's working directory.
df = pd.read_csv("../../data/processed/flaredown/feature_matrix_enriched.csv")

# ============================================
# PREPARE DATA
# ============================================
# Identifier, date and target columns are never used as features.
cols_to_drop = ['user_id', 'checkin_date', 'autoimmune_label']

# Remove every column that is named after an autoimmune condition.
# The vocabulary is all lower-case, so column names are normalised (trimmed and
# lower-cased) before the lookup; otherwise a column such as "Lupus" or "lupus "
# would stay in the feature matrix.
# NOTE(review): presumably these columns encode the diagnosis the label is
# derived from - confirm against the feature-engineering step.
condition_names = {name.strip().lower() for name in autoimmune_conditions}
autoimmune_condition_cols = [
    col for col in df.columns
    if str(col).strip().lower() in condition_names
]
print(f"\nRemoving {len(autoimmune_condition_cols)} autoimmune condition columns")

# errors='ignore' lets the drop succeed when an optional column (e.g. checkin_date)
# is absent; the target and group columns are still required by the lookups below.
X = df.drop(columns=cols_to_drop + autoimmune_condition_cols, errors='ignore')
y = df['autoimmune_label']
groups = df['user_id']

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target distribution:\n{y.value_counts()}")
print(f"Positive rate: {y.mean():.2%}")

# ============================================
# PATIENT-LEVEL TRAIN/TEST SPLIT
# ============================================
# Split on `groups` (user_id) so that all rows of one user land on the same side.

gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_idx, test_idx = list(gss.split(X, y, groups))[0]

X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]

# One-hot encode object-dtype columns. Train and test are encoded separately and
# the test frame is then aligned to the train columns: categories seen only in
# test are dropped, categories seen only in train become all-zero columns.
# NOTE(review): the recorded output shows 'country' and 'max_symptom_value' being
# encoded here, which is why the matrix grows to 5248 columns - confirm that
# 'max_symptom_value' is really meant to be categorical.
categorical_cols = list(X_train.select_dtypes(include=['object']).columns)
if len(categorical_cols) > 0:
    print(f"\nEncoding {len(categorical_cols)} categorical columns: {categorical_cols}")
    X_train = pd.get_dummies(X_train, columns=categorical_cols)
    X_test = pd.get_dummies(X_test, columns=categorical_cols).reindex(
        columns=X_train.columns, fill_value=0
    )

print(f"\nAfter encoding - Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# Negative-to-positive ratio in the training labels (class-imbalance weight).
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
scale_pos_weight = n_negative / n_positive
print(f"Scale pos weight: {scale_pos_weight:.2f}")

# ============================================
# TEST 2: WITHOUT has_autoimmune FEATURE
# ============================================

banner = "=" * 60
print("\n" + banner)
print("TEST 2: WITHOUT has_autoimmune feature")
print(banner)

# Every encoded column whose name contains "has_autoimmune" (case-insensitive)
# is removed from both matrices.
has_autoimmune_cols = [
    name for name in X_train.columns
    if 'has_autoimmune' in name.lower()
]
print(f"\nRemoving {len(has_autoimmune_cols)} has_autoimmune columns: {has_autoimmune_cols}")

X_train_no_ai = X_train.drop(columns=has_autoimmune_cols, errors='ignore')
X_test_no_ai = X_test.drop(columns=has_autoimmune_cols, errors='ignore')

print(f"New shape - Train: {X_train_no_ai.shape}, Test: {X_test_no_ai.shape}")
# X_train_no_ai.head(5)

# # train Random Forest without has_autoimmune
# rf_model_without = RandomForestClassifier(
#     n_estimators=200, 
#     max_depth=20, 
#     class_weight='balanced',
#     n_jobs=-1, 
#     random_state=42
# )
# rf_model_without.fit(X_train_no_ai, y_train)

# y_pred_prob_rf_without = rf_model_without.predict_proba(X_test_no_ai)[:, 1]
# auc_rf_without = roc_auc_score(y_test, y_pred_prob_rf_without)

# print(f"\nRandom Forest AUC (without has_autoimmune): {auc_rf_without:.4f}")
# print(f"AUC difference: {auc_rf_with - auc_rf_without:.4f}")

# # train XGBoost without has_autoimmune
# xgb_model_without = XGBClassifier(
#     n_estimators=200,
#     max_depth=6,
#     learning_rate=0.05,
#     scale_pos_weight=scale_pos_weight,
#     random_state=42,
#     n_jobs=-1,
#     eval_metric='auc'
# )
# xgb_model_without.fit(X_train_no_ai, y_train)

# y_pred_prob_xgb_without = xgb_model_without.predict_proba(X_test_no_ai)[:, 1]
# auc_xgb_without = roc_auc_score(y_test, y_pred_prob_xgb_without)

# print(f"XGBoost AUC (without has_autoimmune): {auc_xgb_without:.4f}")
# print(f"AUC difference: {auc_xgb_with - auc_xgb_without:.4f}")

# # feature importance without has_autoimmune
# rf_importance_without = pd.DataFrame({
#     'feature': X_train_no_ai.columns,
#     'importance': rf_model_without.feature_importances_
# }).sort_values('importance', ascending=False)

# xgb_importance_without = pd.DataFrame({
#     'feature': X_train_no_ai.columns,
#     'importance': xgb_model_without.feature_importances_
# }).sort_values('importance', ascending=False)

# print("\n" + "="*60)
# print("Top 20 Random Forest features (WITHOUT has_autoimmune):")
# print("="*60)
# print(rf_importance_without.head(20).to_string(index=False))

# print("\n" + "="*60)
# print("Top 20 XGBoost features (WITHOUT has_autoimmune):")
# print("="*60)
# print(xgb_importance_without.head(20).to_string(index=False))




Removing 3 autoimmune condition columns

Feature matrix shape: (359391, 129)
Target distribution:
autoimmune_label
0    333007
1     26384
Name: count, dtype: int64
Positive rate: 7.34%

Encoding 4 categorical columns: ['sex', 'country', 'max_symptom_value', 'has_autoimmune']

After encoding - Train shape: (250559, 5248), Test shape: (108832, 5248)
Scale pos weight: 12.09

TEST 2: WITHOUT has_autoimmune feature

Removing 2 has_autoimmune columns: ['has_autoimmune_False', 'has_autoimmune_True']
New shape - Train: (250559, 5246), Test: (108832, 5246)


In [5]:
# Names of the numeric feature columns left after dropping has_autoimmune.
numeric_cols = list(X_train_no_ai.select_dtypes(include=['number']).columns)
print("Numeric columns:", numeric_cols)

# Names of any columns that are still object dtype (i.e. were not encoded).
string_cols = list(X_train_no_ai.select_dtypes(include=['object']).columns)
print("String columns:", string_cols)

Numeric columns: ['age', 'num_conditions', 'avg_condition_value', 'max_condition_value', 'num_symptoms', 'avg_symptom_value', 'num_unique_treatments_x', 'ate_food', 'num_unique_tags_x', 'num_unique_symptoms', 'total_symptom_reports', 'avg_symptom_severity', 'max_symptom_severity', 'std_symptom_severity', 'has_fatigue', 'has_tired', 'has_exhaustion', 'has_weakness', 'has_joint_pain', 'has_joint_ache', 'has_arthritis', 'has_joint_swelling', 'has_muscle_ache', 'has_muscle_pain', 'has_myalgia', 'has_inflammation', 'has_swelling', 'has_swollen', 'has_rash', 'has_skin_rash', 'has_hives', 'has_fever', 'has_low_grade_fever', 'has_stiffness', 'has_stiff_joints', 'has_morning_stiffness', 'has_brain_fog', 'has_cognitive', 'has_memory', 'has_numbness', 'has_tingling', 'has_neuropathy', 'has_dry_eyes', 'has_dry_mouth', 'has_sensitivity', 'has_sun_sensitivity', 'has_photosensitivity', 'has_hair_loss', 'has_alopecia', 'has_weight_loss', 'has_weight_gain', 'has_night_sweats', 'num_autoimmune_symptoms'

In [6]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

# Report any object-dtype columns still present before fitting.
string_cols = list(X_train_no_ai.select_dtypes(include=['object']).columns)
print("String columns:", string_cols)

# Single-step pipeline wrapping the gradient-boosting classifier.
hgb_classifier = HistGradientBoostingClassifier(random_state=42, max_iter=200)
pipeline = Pipeline(steps=[('hgb', hgb_classifier)])

# Fit on the training matrix without has_autoimmune columns, then predict hard
# labels for the held-out users.
pipeline.fit(X_train_no_ai, y_train)
y_pred = pipeline.predict(X_test_no_ai)

String columns: []


In [9]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

# Hard-label metrics are currently disabled:
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)

# Column 1 of predict_proba is the probability of label 1 (labels are 0/1).
test_probabilities = pipeline.predict_proba(X_test_no_ai)
y_pred_prob_without = test_probabilities[:, 1]

# ROC AUC of the model trained without has_autoimmune columns.
auc_without = roc_auc_score(y_test, y_pred_prob_without)
print("AUC without AI features:", auc_without)

# Disabled: the same score on the matrix that still contains has_autoimmune.
# y_prob = pipeline.predict_proba(X_test)[:, 1]
# auroc = roc_auc_score(y_test, y_prob)
# print("AUROC:", auroc)

AUC without AI features: 0.7797926682192018


In [None]:
# ============================================
# COMPARISON SUMMARY
# ============================================
# The only model fitted in this notebook is the HistGradientBoosting pipeline
# trained WITHOUT the has_autoimmune columns (y_pred_prob_without / auc_without).
# The XGBoost / Random Forest variables exist only in commented-out code, so this
# cell must not read them: doing so raises NameError on a fresh kernel.

MODEL_NAME = "HistGradientBoosting"

# Baseline for the comparison: an unfitted copy of the same pipeline, trained on
# the encoded matrices that still contain the has_autoimmune columns.
# This is a second full model fit, so the cell takes about as long as the first fit.
pipeline_with = clone(pipeline)
pipeline_with.fit(X_train, y_train)
y_pred_prob_with = pipeline_with.predict_proba(X_test)[:, 1]
auc_with = roc_auc_score(y_test, y_pred_prob_with)

print("\n" + "="*60)
print("SUMMARY: AUC COMPARISON")
print("="*60)

results_summary = pd.DataFrame({
    'Model': [MODEL_NAME],
    'With has_autoimmune': [auc_with],
    'Without has_autoimmune': [auc_without],
    'Difference': [auc_with - auc_without],
})

print(results_summary.to_string(index=False))

# ============================================
# DETAILED METRICS FOR THE MODEL WITHOUT has_autoimmune
# ============================================

print("\n" + "="*60)
print(f"DETAILED METRICS ({MODEL_NAME} without has_autoimmune)")
print("="*60)

# Hard labels at the default 0.5 probability cut-off.
y_pred_class = (y_pred_prob_without > 0.5).astype(int)
print(classification_report(y_test, y_pred_class, target_names=['No Autoimmune', 'Autoimmune']))

# ============================================
# INTERPRETATION GUIDE
# ============================================

print("\n" + "="*60)
print("INTERPRETATION GUIDE")
print("="*60)

if auc_without >= 0.85:
    print("EXCELLENT: AUC ≥ 0.85")
    print("   Model is strong even without condition features")
    print("   Can predict autoimmune from symptoms/medications alone.")

elif auc_without >= 0.75:
    print("GOOD: 0.75 ≤ AUC < 0.85")
    print("   Model is useful for clinical screening.")
    print("   Better than original 0.644 AUC!")

elif auc_without >= 0.65:
    print("FAIR: 0.65 ≤ AUC < 0.75")
    print("   Model has predictive value but needs improvement.")

else:
    print("POOR: AUC < 0.65")
    print("   has_autoimmune carrying most of the weight.")
    print("   Need significant feature engineering improvements.")

# How much AUC is lost by withholding the has_autoimmune columns.
auc_drop = auc_with - auc_without
if auc_drop < 0.05:
    print(f"Small AUC drop ({auc_drop:.4f}): has_autoimmune wasn't critical")
elif auc_drop < 0.15:
    print(f"Moderate AUC drop ({auc_drop:.4f}): has_autoimmune helped but model is still strong")
else:
    print(f"Large AUC drop ({auc_drop:.4f}): has_autoimmune was very important")