EDA (exploratory data analysis)
Summary
1. Checked for null values
2. Encoded categorical features (Gender: one-hot encoding; Diagnosis: label encoding)
3. Removed identifier columns (Patient_ID)
4. Scaled continuous features (RBC_Count, Hemoglobin, WBC_Count, etc.)

Future considerations for feature selection
1. Dropping features with low variance

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
# Raw-data directory: read from the SYMSENSE_DATA environment variable,
# falling back to the default location below when the variable is unset.
DATA_ROOT = os.getenv(
    "SYMSENSE_DATA",
    "/Users/hongmei_yu/symsense-mit-challenge-2025-Jupyter-first/symsense-mit-challenge-2025-Jupyter-first/data/raw",
)

# Load the dataset CSV from that directory and preview the first rows.
dataset_path = os.path.join(DATA_ROOT, "Complete_Updated_Autoimmune_Disorder_Dataset2.csv")
df = pd.read_csv(dataset_path)
df.head()

In [None]:
# Print a general overview of the dataset (column dtypes and non-null counts).
# Recorded by the original author for this dataset: 13812 entries per column,
# 79 columns -> float64(29), int64(48), object(2).
df.info()

In [None]:
# Count missing values per column and list the columns with the most nulls first.
null_counts = df.isnull().sum()
null_counts.sort_values(ascending=False)

In [None]:
# Inspect the class distribution of the categorical (object) columns:
# Gender first, then Diagnosis.
for categorical_col in ('Gender', 'Diagnosis'):
    print(df[categorical_col].value_counts())

In [None]:
# Encode the two categorical columns:
#   - Gender    -> one-hot indicator columns (prefixed "Gender_"), stored as ints
#   - Diagnosis -> integer class ids via LabelEncoder

# Build the indicator columns first, then swap them in for the original
# Gender column (appended at the end of the frame).
gender_dummies = pd.get_dummies(df['Gender'], prefix='Gender').astype(int)
df = pd.concat([df.drop(columns='Gender'), gender_dummies], axis=1)

# Fit the encoder on the diagnosis labels and replace them with their ids.
# The encoder is kept in `diag` so ids can be mapped back to label names.
diag = LabelEncoder().fit(df['Diagnosis'])
df['Diagnosis'] = diag.transform(df['Diagnosis'])
print(diag.classes_)

df.head()

In [None]:
# Patient_ID is only an identifier, so drop it from the frame.
df = df.drop('Patient_ID', axis=1)
df.head()

In [None]:
# Identify continuous variables.
#
# Columns are split by their number of distinct values:
#   - binary_cols : exactly 2 distinct values (flags / indicator columns)
#   - numeric_cols: more than 2 distinct values (treated as continuous)
#
# 'Diagnosis' was label-encoded earlier in this notebook, so it is an integer
# column whose values are arbitrary class ids rather than measurements. It is
# kept out of numeric_cols so the scaling step does not standardize the labels.
label_col = 'Diagnosis'

candidate_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Compute the distinct-value count once per column instead of once per list.
n_unique = df[candidate_cols].nunique()

binary_cols = [col for col in candidate_cols if n_unique[col] == 2]
numeric_cols = [
    col for col in candidate_cols
    if col != label_col and n_unique[col] > 2
]

print(numeric_cols)

In [None]:
# Standardize the continuous columns with StandardScaler.
# The result goes into a copy (df_scaled) so the unscaled frame stays available in df.
# NOTE(review): the scaler is fit on the whole frame; if a train/test split is
# made later, consider fitting on the training portion only - confirm.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_cols])

df_scaled = df.copy()
df_scaled[numeric_cols] = scaled_values

df_scaled.head()