EDA (exploratory data analysis)
Summary
1. Checked for null values
2. Encoded categorical features
3. Removed identifier columns (Patient_ID) 
4. Train-only preprocessing (log1p + RobustScaler for heavy-tailed features, StandardScaler for the others)

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, FunctionTransformer, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

In [None]:
# Directory that holds the raw CSV. Set the SYMSENSE_DATA environment variable
# to override it; the fallback below is a machine-specific absolute path.
DATA_ROOT = os.environ.get(
    "SYMSENSE_DATA",
    "/Users/hongmei_yu/symsense-mit-challenge-2025-Jupyter-first/symsense-mit-challenge-2025-Jupyter-first/data/raw",
)

# Load the full dataset into a DataFrame and preview the first rows.
df = pd.read_csv(
    os.path.join(DATA_ROOT, "Complete_Updated_Autoimmune_Disorder_Dataset2.csv")
)
df.head()

In [None]:
# General overview of the dataset. Output recorded when this cell was run:
# 13812 entries per column, 79 columns; dtypes float64(29), int64(48), object(2).
df.info()

In [None]:
# Count the null values in each column, listing the columns with the most nulls first.
df.isnull().sum().sort_values(ascending=False)

In [None]:
# Explore the categorical variables (gender and diagnosis) by printing how
# often each distinct value occurs.
for categorical_col in ('Gender', 'Diagnosis'):
    print(df[categorical_col].value_counts())

In [None]:
# Encode the gender and diagnosis columns:
#   - Gender    -> one-hot indicator columns prefixed with "Gender", cast to int
#   - Diagnosis -> integer codes produced by LabelEncoder

# Drop the raw Gender column and append its indicator columns at the end.
df = pd.concat(
    [
        df.drop(columns=['Gender']),
        pd.get_dummies(df['Gender'], prefix='Gender').astype(int),
    ],
    axis=1,
)

# Keep the fitted encoder around so the integer codes can be related back to
# the original diagnosis labels printed below.
diag = LabelEncoder()
df['Diagnosis'] = diag.fit_transform(df['Diagnosis'])
print(diag.classes_)

df.head()

In [None]:
# Drop the Patient_ID identifier column from the frame.
df = df.drop('Patient_ID', axis=1)
df.head()

In [None]:
# Separate the target from the features before any scaling is applied:
# y is the Diagnosis column, X is every other column in its original order.
y = df["Diagnosis"]
X = df.loc[:, df.columns != "Diagnosis"]

X.head()

In [None]:
# Heavy-tailed counts and acute-phase reactants: log1p first, then RobustScaler.
log_robust_feats = [
    'CRP',
    'ESR',
    'WBC_Count',
    'PLT_Count',
    'Reticulocyte_Count',
    'Neutrophils',
    'Lymphocytes',
    'Monocytes',
    'Eosinophils',
    'Basophils',
    'MBL_Level',
    'Esbach',
]

# The other listed features are scaled with StandardScaler.
std_feats = [
    'Age',
    'Sickness_Duration_Months',
    'RBC_Count',
    'Hemoglobin',
    'Hematocrit',
    'MCV',
    'MCH',
    'MCHC',
    'RDW',
    'MPV',
    'C3',
    'C4',
]

In [None]:
# Draw one 50-bin histogram per log/robust feature so the shape of each
# distribution can be inspected visually.
for col in log_robust_feats:
    if col not in df.columns:
        continue  # feature name not present in the frame; nothing to plot

    plt.figure(figsize=(6, 4))
    ax = df[col].hist(bins=50)  # draws on the figure just created
    ax.set_title(f"Histogram of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    plt.show()

In [None]:
def _clip_log1p(X):
    """Return log1p of X after clipping negative entries up to 0.

    Written as a named module-level function instead of a lambda: a
    FunctionTransformer built on a lambda cannot be pickled, which would make
    the fitted pipeline impossible to save with pickle/joblib.
    """
    return np.log1p(np.clip(X, a_min=0, a_max=None))


# Safe log1p: clip negatives to 0, then log1p.
# feature_names_out='one-to-one' keeps the output feature names equal to the inputs.
safe_log1p = FunctionTransformer(
    func=_clip_log1p,
    feature_names_out='one-to-one',
)

# Heavy-tailed features: median-impute, log-compress, then scale with RobustScaler.
log_robust_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('log1p',  safe_log1p),
    ('scale',  RobustScaler()),
])

# Remaining listed features: median-impute, then standardize.
std_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale',  StandardScaler()),
])

# Route each feature list to its own sub-pipeline.
# NOTE(review): remainder='drop' discards every column that is in neither list,
# which includes the one-hot Gender_* columns. Switch to 'passthrough' if those
# columns should reach the model.
preprocess = ColumnTransformer(
    transformers=[
        ('log_robust', log_robust_pipe, log_robust_feats),
        ('std',        std_pipe,        std_feats),
    ],
    remainder='drop',  # or 'passthrough' for any extras
)

In [None]:
# Example full pipeline with a classifier:
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the rows for evaluation; random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The preprocessing is a pipeline step, so it is fitted on X_train only and
# merely applied to X_test when scoring.
clf = Pipeline(steps=[('prep', preprocess),
                     ('model', LogisticRegression(max_iter=2000))])

# Fit exactly once; a second identical fit would only repeat the same work.
clf.fit(X_train, y_train)  # y_train = Diagnosis

# Score of the fitted pipeline on the held-out split.
print(clf.score(X_test, y_test))