In [65]:
# Environment report: print the interpreter, OS and core library versions so
# the outputs below can be tied to a specific setup.
import os
import platform
import sys

print(f"Python: {sys.version}")
print(f"Platform: {platform.platform()}")
try:
    import numpy as np
    import pandas as pd
    import sklearn
    print(f"NumPy: {np.__version__}")
    print(f"pandas: {pd.__version__}")
    print(f"scikit-learn: {sklearn.__version__}")
except Exception as err:
    # Report the problem instead of stopping the notebook at the first cell.
    print(f"Import error: {err}")

Python: 3.12.6 (main, Sep  6 2024, 19:03:47) [Clang 16.0.0 (clang-1600.0.26.3)]
Platform: macOS-15.5-arm64-arm-64bit
NumPy: 2.3.3
pandas: 2.3.3
scikit-learn: 1.7.2


In [66]:
import os
import pandas as pd

# The raw-data directory can be overridden with the SYMSENSE_DATA environment
# variable; otherwise the relative default below is used.
DATA_ROOT = os.getenv("SYMSENSE_DATA", "../data/raw")
print("DATA_ROOT:", DATA_ROOT)

# Load the Excel workbook when it is present; otherwise report where it is expected.
dataset = os.path.join(DATA_ROOT, "Rheumatic and Autoimmune Disease Dataset.xlsx")
if not os.path.exists(dataset):
    print("Place a small Excel file at:", dataset)
else:
    df = pd.read_excel(dataset)
    display(df.head())

DATA_ROOT: ../data/raw


Unnamed: 0,Age,Gender,ESR,CRP,RF,Anti-CCP,HLA-B27,ANA,Anti-Ro,Anti-La,Anti-dsDNA,Anti-Sm,C3,C4,Disease
0,70,Male,39.0,18.6,34.2,29.9,Positive,Negative,Positive,Negative,Positive,Positive,,27.0,Rheumatoid Arthritis
1,39,Female,26.0,21.7,35.5,28.9,Negative,,Positive,,Positive,,100.0,66.0,Rheumatoid Arthritis
2,36,Female,41.0,15.6,21.3,21.3,Negative,Negative,,Positive,Negative,,158.0,12.0,Rheumatoid Arthritis
3,35,Male,43.0,23.4,26.0,39.0,,,Positive,Positive,,,119.0,41.0,Rheumatoid Arthritis
4,37,Female,30.0,,38.1,30.8,Positive,Negative,Positive,Negative,Positive,Negative,144.0,49.0,Rheumatoid Arthritis


In [67]:
# A cell only renders its final expression, so the label values are displayed
# explicitly; previously they were computed and silently discarded.
display(df['Disease'].unique())
# TODO: should I use ranges for the age? maybe in intervals of 5
df['Age'].unique()

array([70, 39, 36, 35, 37, 41, 32, 60, 51, 68, 24, 62, 57, 78, 43, 23, 20,
       30, 47, 46, 22, 44, 33, 73, 29, 45, 64, 67, 79, 61, 74, 54, 31, 72,
       71, 75, 27, 52, 53, 42, 58, 28, 50, 49, 76, 55, 40, 38, 25, 26, 65,
       77, 63, 66, 48, 21, 69, 80, 56, 34, 59])

In [68]:
# Number of missing values per column. DataFrame.sum() reduces column-wise by
# default, whereas np.sum(df.isnull()) is dispatched to DataFrame.sum(axis=None),
# which pandas has deprecated and warns about.
nan_count = df.isnull().sum()
nan_count

  return reduction(axis=axis, out=out, **passkwargs)


Age              0
Gender           0
ESR           1088
CRP           2417
RF            1329
Anti-CCP      3263
HLA-B27       1934
ANA           3746
Anti-Ro       2900
Anti-La       3021
Anti-dsDNA    4713
Anti-Sm       5197
C3            1692
C4            2054
Disease          0
dtype: int64

In [69]:
# Number of missing features in each row.
missing_count = df.isnull().sum(axis=1)

# num_missing[i] is the number of rows with MORE than (i + 1) missing features,
# for thresholds 1 through 15.
num_missing = [len(df[missing_count > threshold]) for threshold in range(1, 16)]
print("Number of rows with more than X missing features:", num_missing)

# Female patients among the rows that have more than 6 missing features.
is_female = df['Gender'] == 'Female'
women_with_over_6_missing = df[(missing_count > 6) & is_female]
num_women_over_6_missing = len(women_with_over_6_missing)
print(f"Number of women with more than 6 missing features: {num_women_over_6_missing}")

# Exact number of missing features for each of those women.
women_missing_counts = missing_count[women_with_over_6_missing.index].tolist()
print(f"Missing feature counts for these women: {women_missing_counts}")

Number of rows with more than X missing features: [9837, 6633, 3442, 1315, 396, 83, 14, 1, 0, 0, 0, 0, 0, 0, 0]
Number of women with more than 6 missing features: 38
Missing feature counts for these women: [7, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 7, 8, 8, 7, 7, 7, 7, 9, 7, 7, 7, 7, 7, 8, 7, 7, 8, 7, 7, 7, 7, 7]


Observation: 38 of the 83 people with more than 6 missing features are women.
<br> On average, these women are missing 7-8 of the features.

## One-Hot Encoding
This is done on the categorical variables, such as gender or labs that are reported as positive or negative. Since missingness is also a feature, it is encoded as well: if someone is missing a value, that is recorded in a separate indicator column.

In [70]:
# Object-typed columns are the candidates for one-hot encoding.
object_columns = df.select_dtypes(include=['object']).columns
to_encode = object_columns.tolist()
# 'Disease' is the label, so it must not be one-hot encoded.
to_encode.remove('Disease')
df[to_encode].nunique()

Gender        2
HLA-B27       2
ANA           2
Anti-Ro       2
Anti-La       2
Anti-dsDNA    2
Anti-Sm       2
dtype: int64

In [71]:
# One-hot encode the categorical columns. dummy_na=True adds a `<column>_nan`
# indicator per column so missingness is kept as a feature; drop_first=True
# drops the first level of each encoded column.
df_encoded = pd.get_dummies(
    df,
    columns=to_encode,
    dummy_na=True,
    drop_first=True,
)
print(df_encoded.shape)
print(df_encoded.columns)
df_encoded.head()

(12085, 22)
Index(['Age', 'ESR', 'CRP', 'RF', 'Anti-CCP', 'C3', 'C4', 'Disease',
       'Gender_Male', 'Gender_nan', 'HLA-B27_Positive', 'HLA-B27_nan',
       'ANA_Positive', 'ANA_nan', 'Anti-Ro_Positive', 'Anti-Ro_nan',
       'Anti-La_Positive', 'Anti-La_nan', 'Anti-dsDNA_Positive',
       'Anti-dsDNA_nan', 'Anti-Sm_Positive', 'Anti-Sm_nan'],
      dtype='object')


Unnamed: 0,Age,ESR,CRP,RF,Anti-CCP,C3,C4,Disease,Gender_Male,Gender_nan,...,ANA_Positive,ANA_nan,Anti-Ro_Positive,Anti-Ro_nan,Anti-La_Positive,Anti-La_nan,Anti-dsDNA_Positive,Anti-dsDNA_nan,Anti-Sm_Positive,Anti-Sm_nan
0,70,39.0,18.6,34.2,29.9,,27.0,Rheumatoid Arthritis,True,False,...,False,False,True,False,False,False,True,False,True,False
1,39,26.0,21.7,35.5,28.9,100.0,66.0,Rheumatoid Arthritis,False,False,...,False,True,True,False,False,True,True,False,False,True
2,36,41.0,15.6,21.3,21.3,158.0,12.0,Rheumatoid Arthritis,False,False,...,False,False,False,True,True,False,False,False,False,True
3,35,43.0,23.4,26.0,39.0,119.0,41.0,Rheumatoid Arthritis,True,False,...,False,True,True,False,True,False,False,True,False,True
4,37,30.0,,38.1,30.8,144.0,49.0,Rheumatoid Arthritis,False,False,...,False,False,True,False,False,False,True,False,False,False


In [72]:
# Percentage of rows flagged in each `<column>_nan` indicator, to decide
# whether any indicator can be dropped.
num_rows = df_encoded.shape[0]
nan_indicator_cols = [col for col in df_encoded.columns if col.endswith('_nan')]
for col in nan_indicator_cols:
    percent_missing = (df_encoded[col].sum() / num_rows) * 100
    print(f"{col}: {percent_missing:.2f}%")

# Drop only the indicators that are never set (0% missing): they carry no
# information. Selecting them from the data, rather than hard-coding
# 'Gender_nan', keeps the cell safe to re-run and avoids discarding an
# indicator that does contain missingness.
empty_nan_cols = [col for col in nan_indicator_cols if not df_encoded[col].any()]
df_encoded = df_encoded.drop(columns=empty_nan_cols)


Gender_nan: 0.00%
HLA-B27_nan: 16.00%
ANA_nan: 31.00%
Anti-Ro_nan: 24.00%
Anti-La_nan: 25.00%
Anti-dsDNA_nan: 39.00%
Anti-Sm_nan: 43.00%


## Impute Numerical Data
Use KNN imputation (`KNNImputer`) so that imputed values are based on the values of similar patients.

In [73]:
# Numeric feature columns. They are selected from the raw frame `df` rather
# than from `df_encoded`, because `df_encoded` later gains integer-typed
# indicator columns; selecting from it on a re-run would wrongly treat those
# indicators as numerical features.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Missing value analysis for numerical data.
# The per-column count uses its own name so it does not overwrite the per-row
# `missing_count` Series computed earlier in the notebook.
print("\nMissing values in numerical features:")
for col in numerical_cols:
    n_missing = df_encoded[col].isnull().sum()
    percent_missing = (n_missing / len(df_encoded)) * 100
    print(f"{col}: {n_missing} missing ({percent_missing:.1f}%)")

# Check data types and ranges
print("\nNumerical data summary:")
df_encoded[numerical_cols].describe()


Missing values in numerical features:
Age: 0 missing (0.0%)
ESR: 1088 missing (9.0%)
CRP: 2417 missing (20.0%)
RF: 1329 missing (11.0%)
Anti-CCP: 3263 missing (27.0%)
C3: 1692 missing (14.0%)
C4: 2054 missing (17.0%)

Numerical data summary:


Unnamed: 0,Age,ESR,CRP,RF,Anti-CCP,C3,C4
count,12085.0,10997.0,9668.0,10756.0,8822.0,10393.0,10031.0
mean,49.905751,24.214331,13.32501,19.68518,19.579245,131.910324,38.178547
std,17.649957,14.368259,10.392953,11.517631,11.601712,36.425864,20.050244
min,20.0,0.0,0.1,0.0,0.0,50.0,5.0
25%,35.0,10.0,1.9,9.7,9.5,105.0,21.0
50%,50.0,28.0,15.6,19.2,19.0,133.0,38.0
75%,65.0,36.0,22.6,29.7,29.6,161.0,55.0
max,80.0,49.0,30.0,40.0,40.0,205.0,74.0


In [74]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# Impute on a copy so the result can be checked before it is merged back.
df_knn = df_encoded.copy()

# Raw (unscaled, unimputed) numerical values. They are read from `df`, which is
# never modified, instead of from `df_encoded`: a later cell overwrites
# df_encoded's numerical columns with imputed values, so re-running this cell
# against df_encoded would silently set every `_missing` flag to 0.
raw_numeric = df[numerical_cols]

# Record which values were originally missing (1 = missing, 0 = present).
for col in numerical_cols:
    df_encoded[f'{col}_missing'] = raw_numeric[col].isnull().astype(int)

# Standardize the numerical columns -> necessary for KNN, which is distance based
scaler = StandardScaler()
df_knn[numerical_cols] = scaler.fit_transform(raw_numeric)

# KNN imputation
knn_imputer = KNNImputer(n_neighbors=5)
df_knn[numerical_cols] = knn_imputer.fit_transform(df_knn[numerical_cols])

print(f"num missing in original dataset: {raw_numeric.isnull().sum().sum()}")
print(f"num missing after KNN imputation: {df_knn[numerical_cols].isnull().sum().sum()}")

num missing in original dataset: 11843
num missing after KNN imputation: 0


In [75]:
# The imputation was confirmed above, so copy the scaled + imputed numerical
# columns from df_knn back into df_encoded. Both frames get their columns
# sorted alphabetically first.
# NOTE: after this cell the numerical features (lab numbers, age) in
# df_encoded hold the scaled values, not the original units.
df_knn = df_knn.sort_index(axis=1)
df_encoded = df_encoded.sort_index(axis=1)
df_encoded[numerical_cols] = df_knn[numerical_cols]

print(df_encoded.shape)
print(df_encoded.columns)

df_encoded.head()

(12085, 28)
Index(['ANA_Positive', 'ANA_nan', 'Age', 'Age_missing', 'Anti-CCP',
       'Anti-CCP_missing', 'Anti-La_Positive', 'Anti-La_nan',
       'Anti-Ro_Positive', 'Anti-Ro_nan', 'Anti-Sm_Positive', 'Anti-Sm_nan',
       'Anti-dsDNA_Positive', 'Anti-dsDNA_nan', 'C3', 'C3_missing', 'C4',
       'C4_missing', 'CRP', 'CRP_missing', 'Disease', 'ESR', 'ESR_missing',
       'Gender_Male', 'HLA-B27_Positive', 'HLA-B27_nan', 'RF', 'RF_missing'],
      dtype='object')


Unnamed: 0,ANA_Positive,ANA_nan,Age,Age_missing,Anti-CCP,Anti-CCP_missing,Anti-La_Positive,Anti-La_nan,Anti-Ro_Positive,Anti-Ro_nan,...,CRP,CRP_missing,Disease,ESR,ESR_missing,Gender_Male,HLA-B27_Positive,HLA-B27_nan,RF,RF_missing
0,False,False,1.138534,0,0.889639,0,False,False,True,False,...,0.507581,0,Rheumatoid Arthritis,1.029098,0,True,True,False,1.260285,0
1,False,True,-0.617917,0,0.80344,0,False,True,True,False,...,0.805875,0,Rheumatoid Arthritis,0.124284,0,False,False,False,1.37316,0
2,False,False,-0.787896,0,0.148327,0,True,False,False,True,...,0.218909,0,Rheumatoid Arthritis,1.1683,0,False,False,False,0.140211,0
3,False,True,-0.844555,0,1.674051,0,True,False,True,False,...,0.969456,0,Rheumatoid Arthritis,1.307502,0,True,False,True,0.5483,0
4,False,False,-0.731236,0,0.967219,0,False,False,True,False,...,0.261247,1,Rheumatoid Arthritis,0.402688,0,False,True,False,1.598912,0


In [76]:
# Check that the numerical values have been scaled: the means should be close
# to 0 and the standard deviations close to 1.
print(df_encoded[numerical_cols].mean().round(3))
print(df_encoded[numerical_cols].std().round(3))

# Percentage of rows flagged by each `<column>_missing` indicator.
num_rows = df_encoded.shape[0]
for col in df_encoded.columns:
    if col.endswith('_missing'):
        percent_missing = (df_encoded[col].sum() / num_rows) * 100
        print(f"{col}: {percent_missing:.2f}%")

# Drop Age_missing since Age has no missing values. errors='ignore' keeps the
# cell re-runnable: without it a second run raises KeyError because the column
# is already gone.
df_encoded = df_encoded.drop(columns='Age_missing', errors='ignore')
df_encoded.shape

Age         0.000
ESR        -0.008
CRP         0.006
RF          0.008
Anti-CCP    0.001
C3         -0.000
C4         -0.008
dtype: float64
Age         1.000
ESR         0.974
CRP         0.954
RF          0.957
Anti-CCP    0.899
C3          0.948
C4          0.934
dtype: float64
Age_missing: 0.00%
Anti-CCP_missing: 27.00%
C3_missing: 14.00%
C4_missing: 17.00%
CRP_missing: 20.00%
ESR_missing: 9.00%
RF_missing: 11.00%


(12085, 27)

## Ensure all types are numeric

In [None]:
# Cast every boolean column to 0/1 integers so all features are numeric.
# The column NAMES are collected here; the previous loop iterated over the
# values of a nunique() Series (counts such as 2) and used those as column keys.
bool_cols = df_encoded.select_dtypes(include='bool').columns
df_encoded = df_encoded.astype({col: int for col in bool_cols})

print(df_encoded.dtypes)

Series([], dtype: float64)
ANA_Positive             int64
ANA_nan                  int64
Age                    float64
Anti-CCP               float64
Anti-CCP_missing         int64
Anti-La_Positive         int64
Anti-La_nan              int64
Anti-Ro_Positive         int64
Anti-Ro_nan              int64
Anti-Sm_Positive         int64
Anti-Sm_nan              int64
Anti-dsDNA_Positive      int64
Anti-dsDNA_nan           int64
C3                     float64
C3_missing               int64
C4                     float64
C4_missing               int64
CRP                    float64
CRP_missing              int64
Disease                 object
ESR                    float64
ESR_missing              int64
Gender_Male              int64
HLA-B27_Positive         int64
HLA-B27_nan              int64
RF                     float64
RF_missing               int64
dtype: object


## Convert dataframe to csv

In [None]:
# Write the cleaned feature table to disk. The output directory is created
# first because to_csv fails when the target directory does not exist.
CLEANED_CSV_PATH = "../data/cleaned/cleaned_rheumatic_and_autoimmune_disease.csv"
os.makedirs(os.path.dirname(CLEANED_CSV_PATH), exist_ok=True)
df_encoded.to_csv(CLEANED_CSV_PATH, index=False)