## Initial Model Training for Flaredown Dataset

In [12]:
import sys, os, platform
print("Python:", sys.version)
print("Platform:", platform.platform())
try:
    import numpy as np
    import pandas as pd 
    import sklearn
    print("NumPy:", np.__version__)
    print("pandas:", pd.__version__)
    print("scikit-learn:", sklearn.__version__)
except Exception as e:
    print("Import error:", e)

Python: 3.12.6 (main, Sep  6 2024, 19:03:47) [Clang 16.0.0 (clang-1600.0.26.3)]
Platform: macOS-15.5-arm64-arm-64bit
NumPy: 2.3.3
pandas: 2.3.3
scikit-learn: 1.7.2


In [13]:
data_dir = "data/processed/flaredown"

path = '../../data/processed/flaredown'
if not os.path.exists(path):
    raise FileNotFoundError(f"Path not found: {os.path.abspath(path)}")

conditions = pd.read_csv("../../data/processed/flaredown/conditions_grouped.csv")
feature_mat = pd.read_csv("../../data/processed/flaredown/feature_matrix.csv")
foods = pd.read_csv("../../data/processed/flaredown/foods_grouped.csv")
symptoms = pd.read_csv("../../data/processed/flaredown/symptoms_grouped.csv")
tags = pd.read_csv("../../data/processed/flaredown/tags_grouped.csv")
treatments = pd.read_csv("../../data/processed/flaredown/treatments_grouped.csv")

print("Conditions data shape:", conditions.shape)
print("Feature matrix shape:", feature_mat.shape)
print("Foods data shape:", foods.shape)
print("Symptoms data shape:", symptoms.shape)
print("Tags data shape:", tags.shape)
print("Treatments data shape:", treatments.shape)


Conditions data shape: (281482, 3)
Feature matrix shape: (42283, 19)
Foods data shape: (108807, 3)
Symptoms data shape: (309190, 3)
Tags data shape: (154712, 3)
Treatments data shape: (197552, 3)


In [None]:
import ast

def extract_condition_names(x):
    try:
        parsed = ast.literal_eval(x)  
        if isinstance(parsed, list):
            return [item[0].lower().strip() for item in parsed if isinstance(item, (list, tuple))]
        elif isinstance(parsed, tuple):
            return [parsed[0].lower().strip()]
    except Exception:
        return []
        
conditions['clean_conditions'] = conditions['conditions_observed'].apply(extract_condition_names)

unique_conditions = (
    conditions['clean_conditions']
    .explode()
    .dropna()
    .unique()
)
print(len(unique_conditions), "unique clean condition names found")
print(unique_conditions[:50])  

autoimmune_conditions = [
    "graves' disease",
    "rheumatoid arthritis",
    "inflammatory bowel disease (ibd)",
    "ulcerative colitis",
    "lupus",
    "systemic lupus erythematosus",
    "ankylosing spondylitis",
    "hashimoto's thyroiditis",
    "type 1 diabetes",
    "mast cell activation syndrome",
    "ehlers-danlos syndrome",
    "myalgic encephalomyelitis"
]

# X = df.drop(columns=['user_id', 'check-in_date','autoimmune_label'] + autoimmune_conditions)

9219 unique clean condition names found
["graves' disease" 'food allergies' "meniere's disease" 'anxiety'
 'hypermobility' 'fibromyalgia' 'leaky gut' 'migraine'
 'thoracic outlet syndrome' 'rheumatoid arthritis' 'dysautonomia'
 'postural orthostatic tachycardia syndrome (pots)' 'narcolepsy'
 'stomach upset' 'back pain' 'thirst' 'hunger' 'muscle soreness' 'fatigue'
 'pots' 'gastroparesis' 'inflammatory bowel disease (ibd)'
 'mast cell activation syndrome' 'ehlers-danlos syndrome'
 'myalgic encephalomyelitis' 'lupus' 'systemic lupus erythematosus'
 'stress' 'ankylosing spondylitis' 'stomach pain' 'type 1 diabetes'
 "hashimoto's thyroiditis" 'diabetes' 'polycystic ovary syndrome'
 'depression' 'joint pain' 'muscle aches' 'hypothyroidism'
 'traumatic brain injury' 'glaucoma' 'adult add' 'ulcerative colitis'
 'polycystic ovary syndrome (pcos)' 'lyme disease' 'me/cfs'
 'bi-lateral sciatica' 'tension headache' 'acid reflux' 'sinus'
 'neck pain']


In [15]:
# find the 100 most common symptoms in the symptoms dataframe
symptoms['clean_symptoms'] = symptoms['symptoms_observed'].apply(lambda x:
    [symptom[0].lower().strip() for symptom in ast.literal_eval(x)] if pd.notnull(x) else []
)

print(symptoms['clean_symptoms'].explode().value_counts().head(20))

clean_symptoms
headache                 108569
fatigue                  107634
nausea                    89564
brain fog                 73191
joint pain                64940
fatigue and tiredness     63395
anxiety                   61602
diarrhea                  52423
dizziness                 50151
depression                43422
constipation              40894
back pain                 37834
neck pain                 37769
abdominal pain            33275
migraine                  31960
muscle pain               29918
lower back pain           29193
stomach pain              27195
insomnia                  26832
bloating                  24002
Name: count, dtype: int64


In [16]:
top_100_symptoms = symptoms['clean_symptoms'].explode().value_counts().head(100).index.tolist()
def convert_symptoms_and_severity(symptoms, top_100_symptoms):
    # need to store the symptoms severity for the top 100 symptoms as well
    patient_symptoms = np.zeros(len(top_100_symptoms))

    for symp, sev in symptoms:
        if symp in top_100_symptoms:
            idx = top_100_symptoms.index(symp)
            try:
                patient_symptoms[idx] = float(sev)
            except ValueError:
                patient_symptoms[idx] = 0.0 

    return patient_symptoms

symptoms_sev_matrix = np.vstack(symptoms['symptoms_observed'].apply(
    lambda x: convert_symptoms_and_severity(ast.literal_eval(x), top_100_symptoms))
)

symptom_sev_df = pd.DataFrame(
    symptoms_sev_matrix,
    columns=top_100_symptoms
)

symptoms = pd.concat([symptoms[['user_id', 'checkin_date']], symptom_sev_df], axis=1)
symptoms.head(5)


Unnamed: 0,user_id,checkin_date,headache,fatigue,nausea,brain fog,joint pain,fatigue and tiredness,anxiety,diarrhea,...,unrefreshing sleep,panic attack,numbness,tension headache,muscle cramps,dry eyes,upset stomach,mouth sores,sleepiness,tiredness
0,QEVuQwEA++2fi8XAwh3BnSEwL172Dg==,2019-08-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
1,QEVuQwEA++2fi8XAwh3BnSEwL172Dg==,2019-08-29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2,QEVuQwEA++2fi8XAwh3BnSEwL172Dg==,2019-09-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,QEVuQwEA++PmSIrPm0/GE+l1QxEh1g==,2019-03-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
4,QEVuQwEA++PmSIrPm0/GE+l1QxEh1g==,2019-03-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0


# Need to merge the dataset so that model can be trained on it

In [17]:
merged = conditions.merge(feature_mat, on = ["user_id"], how = "left")
# merged = merged.merge(foods, on = ["user_id", "checkin_date"], how="left")
merged = merged.merge(symptoms, on = ["user_id", "checkin_date"], how="left")
# merged = merged.merge(tags, on = ["user_id", "checkin_date"], how="left")
# merged = merged.merge(treatments, on = ["user_id", "checkin_date"], how="left")

In [25]:
merged.head(10)
col_names = merged.columns

for col in col_names:
    print(col)

user_id
age
ate_food
num_symptoms
avg_symptom_value
max_symptom_value
num_conditions
avg_condition_value
max_condition_value
num_treatment_days
num_unique_treatments
num_tag_days
num_unique_tags
ulcerative colitis
rheumatoid arthritis
lupus
autoimmune_label
headache
fatigue
nausea
brain fog
joint pain
fatigue and tiredness
anxiety
diarrhea
dizziness
depression
constipation
back pain
neck pain
abdominal pain
migraine
muscle pain
lower back pain
stomach pain
insomnia
bloating
chest pain
hip pain
sore throat
shortness of breath
knee pain
shoulder pain
chronic pain
irritability
muscle weakness
lightheadedness
weakness
leg pain
vomiting
heart palpitations
muscle spasms
night sweats
light sensitivity
vertigo
hand pain
foot pain
stress
difficulty concentrating
joint stiffness
gas
headaches
tinnitus
menstrual cramps
pelvic pain
exhaustion
wrist pain
stomach cramps
upper back pain
acid reflux
nerve pain
jaw pain
mood swings
heartburn
chronic fatigue
tachycardia
fever
hot flashes
muscle twitchin

In [None]:
from sklearn.model_selection import GroupShuffleSplit
# merged.drop(['conditions_observed', 'clean_conditions','country','checkin_date'],inplace=True, axis=1)
# merged = pd.get_dummies(merged, columns=['sex'], drop_first=True) # one-hot encode sex

print(merged.columns)
label_columns = ['ulcerative colitis','rheumatoid arthritis','lupus','autoimmune_label'] + autoimmune_conditions
X = merged.drop(columns=label_columns, axis=1)
X = merged.drop(columns=['ulcerative colitis','rheumatoid arthritis','lupus','autoimmune_label'] + autoimmune_conditions, axis=1)
y = merged['autoimmune_label']
patient_group = merged['user_id']

gss = GroupShuffleSplit(n_splits = 5, test_size=0.3, random_state=123)
for i, (train_idx, test_idx) in enumerate(gss.split(X, y, groups=patient_group)):
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

print(X.head())

# print numeric column names
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()
print("Numeric columns:", numeric_cols)

# print string column names
string_cols = X.select_dtypes(include=['object']).columns.tolist()
print("String columns:", string_cols)


Index(['user_id', 'age', 'ate_food', 'num_symptoms', 'avg_symptom_value',
       'max_symptom_value', 'num_conditions', 'avg_condition_value',
       'max_condition_value', 'num_treatment_days',
       ...
       'muscle cramps', 'dry eyes', 'upset stomach', 'mouth sores',
       'sleepiness', 'tiredness', 'sex_Female', 'sex_Male', 'sex_Other',
       'sex_Unknown'],
      dtype='object', length=121)


KeyError: '["graves\' disease", \'inflammatory bowel disease (ibd)\', \'systemic lupus erythematosus\', \'ankylosing spondylitis\', "hashimoto\'s thyroiditis", \'type 1 diabetes\', \'mast cell activation syndrome\', \'ehlers-danlos syndrome\', \'myalgic encephalomyelitis\'] not found in axis'

In [None]:
#drop user_id from X_train and X_test
X_train = X_train.drop('user_id', axis=1)
X_test = X_test.drop('user_id', axis=1)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier

string_cols = X.select_dtypes(include=['object']).columns.tolist()
print("String columns:", string_cols)

pipeline = Pipeline([
    ('hgb', HistGradientBoostingClassifier(random_state=42, max_iter=200))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

String columns: ['user_id']


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

y_prob = pipeline.predict_proba(X_test)[:, 1]
auroc = roc_auc_score(y_test, y_prob)
print("AUROC:", auroc)

              precision    recall  f1-score   support

           0       0.91      1.00      0.95     76658
           1       0.36      0.01      0.02      7914

    accuracy                           0.91     84572
   macro avg       0.63      0.50      0.49     84572
weighted avg       0.86      0.91      0.86     84572

[[76508   150]
 [ 7831    83]]
Accuracy: 0.9056307051979379
AUROC: 0.5017056251201762


In [None]:
# implement k-nearest neighbors classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer

knn_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('knn', KNeighborsClassifier(n_neighbors=5))
])
knn_pipeline.fit(X_train, y_train)
y_pred_knn = knn_pipeline.predict(X_test)

y_prob = knn_pipeline.predict_proba(X_test)[:, 1]
auroc = roc_auc_score(y_test, y_prob)
print("AUROC:", auroc)

AUROC: 0.5031870910047102
