In [1]:
!pip install tsfresh
!pip install xgboost



In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score
import json

In [3]:
# load data
feature_matrix = pd.read_csv("/Users/Andria/Desktop/symsense-mit-challenge-2025-Jupyter-first/data/processed/flaredown/feature_matrix_date.csv")

symptoms_grouped = pd.read_csv("/Users/Andria/Desktop/symsense-mit-challenge-2025-Jupyter-first/data/processed/flaredown/symptoms_grouped.csv")
foods_grouped = pd.read_csv("/Users/Andria/Desktop/symsense-mit-challenge-2025-Jupyter-first/data/processed/flaredown/foods_grouped.csv")
tags_grouped = pd.read_csv("/Users/Andria/Desktop/symsense-mit-challenge-2025-Jupyter-first/data/processed/flaredown/tags_grouped.csv")
treatments_grouped = pd.read_csv("/Users/Andria/Desktop/symsense-mit-challenge-2025-Jupyter-first/data/processed/flaredown/treatments_grouped.csv")
conditions_grouped=pd.read_csv("/Users/Andria/Desktop/symsense-mit-challenge-2025-Jupyter-first/data/processed/flaredown/conditions_grouped.csv")

import ast

def extract_condition_names(x):
    try:
        parsed = ast.literal_eval(x)  
        if isinstance(parsed, list):
            return [item[0].lower().strip() for item in parsed if isinstance(item, (list, tuple))]
        elif isinstance(parsed, tuple):
            return [parsed[0].lower().strip()]
    except Exception:
        return []
        
conditions_grouped['clean_conditions'] = conditions_grouped['conditions_observed'].apply(extract_condition_names)

unique_conditions = (
    conditions_grouped['clean_conditions']
    .explode()
    .dropna()
    .unique()
)
print(len(unique_conditions), "unique clean condition names found")
print(unique_conditions[:50])  

autoimmune_conditions = [
    "graves' disease",
    "rheumatoid arthritis",
    "inflammatory bowel disease (ibd)",
    "ulcerative colitis",
    "lupus",
    "systemic lupus erythematosus",
    "ankylosing spondylitis",
    "hashimoto's thyroiditis",
    "type 1 diabetes",
    "mast cell activation syndrome",
    "ehlers-danlos syndrome",
    "myalgic encephalomyelitis"
]


9219 unique clean condition names found
["graves' disease" 'food allergies' "meniere's disease" 'anxiety'
 'hypermobility' 'fibromyalgia' 'leaky gut' 'migraine'
 'thoracic outlet syndrome' 'rheumatoid arthritis' 'dysautonomia'
 'postural orthostatic tachycardia syndrome (pots)' 'narcolepsy'
 'stomach upset' 'back pain' 'thirst' 'hunger' 'muscle soreness' 'fatigue'
 'pots' 'gastroparesis' 'inflammatory bowel disease (ibd)'
 'mast cell activation syndrome' 'ehlers-danlos syndrome'
 'myalgic encephalomyelitis' 'lupus' 'systemic lupus erythematosus'
 'stress' 'ankylosing spondylitis' 'stomach pain' 'type 1 diabetes'
 "hashimoto's thyroiditis" 'diabetes' 'polycystic ovary syndrome'
 'depression' 'joint pain' 'muscle aches' 'hypothyroidism'
 'traumatic brain injury' 'glaucoma' 'adult add' 'ulcerative colitis'
 'polycystic ovary syndrome (pcos)' 'lyme disease' 'me/cfs'
 'bi-lateral sciatica' 'tension headache' 'acid reflux' 'sinus'
 'neck pain']


In [None]:
def has_autoimmune(conditions):
    if pd.isna(conditions):
        return 0
    for c in autoimmune_conditions:
        if c.lower() in conditions.lower():
            return 1
    return 0

conditions_grouped["autoimmune_label"] = conditions_grouped["conditions_observed"].apply(has_autoimmune)

Leakage-safe CV with TimeSeriesSplit → use feature_matrix_date.csv as your timestamp. Group by user_id. Save folds. 

Windowed feature extraction (tsfresh) → if you have repeated measurements per user (e.g., daily symptom or food entries), you can create rolling 90-day windows and extract features. 

Modeling → RF/XGB on tsfresh features, then optional TFT if you want sequence modeling. For TFT, you can just use symptoms, foods, tags, treatments as “observed reals” or categorical variables. 

Irregular features → for your dataset, you can apply masking/decay on symptom/food occurrence counts or flags. Labs won’t exist, but the same logic applies to other time-varying signals. 

Interpretability → SHAP on RF/XGB; attention + variable selection for TFT. 

In [None]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, classification_report

# Load data
df = pd.read_csv("/Users/Andria/Desktop/symsense-mit-challenge-2025-Jupyter-first/data/processed/flaredown/feature_matrix_date.csv")

# Encode any object columns (except user_id / checkin_date / target)
X = df.drop(
    columns=['user_id', 'checkin_date', 'autoimmune_label'] + autoimmune_conditions,
    errors='ignore'
)
y = df['autoimmune_label']

groups = df['user_id']

# Patient-level train/test split
gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# --- Encode categorical columns ---
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
X_train = pd.get_dummies(X_train, columns=categorical_cols)
X_test = pd.get_dummies(X_test, columns=categorical_cols)

# Align test columns with train (in case some categories are missing in test)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Train Random Forest
model = RandomForestClassifier(n_estimators=100, max_depth=20, n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

# Evaluate AUC
y_pred_prob = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)

print(f"AUC: {auc:.3f}")

AUC: 0.644
11    0
12    0
13    0
14    0
17    0
Name: autoimmune_label, dtype: int64


In [21]:
print(classification_report(y_test, y_pred_prob))

ValueError: Classification metrics can't handle a mix of binary and continuous targets