In [3]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

In [4]:
PROJECT_ROOT = os.getcwd()  # current folder as project root
RAW_CSV = os.path.join(PROJECT_ROOT, "data", "raw", "dataset.csv")
PROCESSED_DIR = os.path.join(PROJECT_ROOT, "data", "processed")
os.makedirs(PROCESSED_DIR, exist_ok=True)
PROCESSED_CSV = os.path.join(PROCESSED_DIR, "clean_sepsis.csv")

In [5]:
# Cell 3: Load the dataset
print("Loading raw dataset...")
df = pd.read_csv("D:/Mini Project 2025-26/Mini Project N.Sepsis/PROJECT_EXE/federated-sepsis/data/raw/Dataset.csv")
print("Original dataset shape:", df.shape)
df.head()


Loading raw dataset...
Original dataset shape: (1552210, 44)


Unnamed: 0.1,Unnamed: 0,Hour,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,...,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,Patient_ID
0,0,0,,,,,,,,,...,,,68.54,0,,,-0.02,1,0,17072
1,1,1,65.0,100.0,,,72.0,,16.5,,...,,,68.54,0,,,-0.02,2,0,17072
2,2,2,78.0,100.0,,,42.5,,,,...,,,68.54,0,,,-0.02,3,0,17072
3,3,3,73.0,100.0,,,,,17.0,,...,,,68.54,0,,,-0.02,4,0,17072
4,4,4,70.0,100.0,,129.0,74.0,69.0,14.0,,...,,330.0,68.54,0,,,-0.02,5,0,17072


In [6]:
# Cell 4: Keep only relevant columns
columns_to_keep = [
    # Vitals
    "HR","O2Sat","Temp","SBP","MAP","DBP","Resp","EtCO2",
    # Labs
    "BaseExcess","HCO3","FiO2","pH","PaCO2","SaO2","AST","BUN",
    "Alkalinephos","Calcium","Chloride","Creatinine","Bilirubin_direct",
    "Glucose","Lactate","Magnesium","Phosphate","Potassium","Bilirubin_total",
    "TroponinI","Hct","Hgb","PTT","WBC","Fibrinogen","Platelets",
    # Demographics
    "Age","Gender",
    # Target
    "SepsisLabel",
    # Patient ID (for federated split later)
    "Patient_ID"
]

df = df[columns_to_keep]
df.head()


Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Hct,Hgb,PTT,WBC,Fibrinogen,Platelets,Age,Gender,SepsisLabel,Patient_ID
0,,,,,,,,,,,...,,,,,,,68.54,0,0,17072
1,65.0,100.0,,,72.0,,16.5,,,,...,,,,,,,68.54,0,0,17072
2,78.0,100.0,,,42.5,,,,,,...,,,,,,,68.54,0,0,17072
3,73.0,100.0,,,,,17.0,,,,...,,,,,,,68.54,0,0,17072
4,70.0,100.0,,129.0,74.0,69.0,14.0,,,26.0,...,29.7,9.5,30.6,11.3,,330.0,68.54,0,0,17072


In [7]:
df


Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Hct,Hgb,PTT,WBC,Fibrinogen,Platelets,Age,Gender,SepsisLabel,Patient_ID
0,,,,,,,,,,,...,,,,,,,68.54,0,0,17072
1,65.0,100.0,,,72.0,,16.5,,,,...,,,,,,,68.54,0,0,17072
2,78.0,100.0,,,42.5,,,,,,...,,,,,,,68.54,0,0,17072
3,73.0,100.0,,,,,17.0,,,,...,,,,,,,68.54,0,0,17072
4,70.0,100.0,,129.0,74.0,69.0,14.0,,,26.0,...,29.7,9.5,30.6,11.3,,330.0,68.54,0,0,17072
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552205,83.0,99.0,,121.0,77.0,54.0,22.0,,,,...,,,,,,,88.00,0,0,113911
1552206,80.0,92.0,,102.0,73.0,51.0,24.0,,,,...,,,,,,,88.00,0,0,113911
1552207,95.0,97.0,36.7,128.5,83.0,58.5,25.0,,,,...,,,,,,,88.00,0,0,113911
1552208,104.0,99.0,,127.0,85.0,59.0,24.0,,,,...,,,,,,,88.00,0,0,113911


In [8]:
df.columns

Index(['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
       'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
       'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
       'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
       'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets', 'Age', 'Gender', 'SepsisLabel',
       'Patient_ID'],
      dtype='object')

In [9]:
df.describe()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Hct,Hgb,PTT,WBC,Fibrinogen,Platelets,Age,Gender,SepsisLabel,Patient_ID
count,1398811.0,1349474.0,525226.0,1325945.0,1358940.0,1065656.0,1313875.0,57636.0,84145.0,65028.0,...,137433.0,114591.0,45699.0,99447.0,10242.0,92209.0,1552210.0,1552210.0,1552210.0,1552210.0
mean,84.58144,97.19395,36.977228,123.7505,82.4001,63.83056,18.7265,32.957657,-0.689919,24.075481,...,30.794093,10.430833,41.231193,11.446405,287.385706,196.013911,62.00947,0.559269,0.01798468,59201.48
std,17.32524,2.936924,0.770014,23.23156,16.34175,13.95601,5.098194,7.951662,4.294297,4.376504,...,5.491749,1.968661,26.217669,7.731013,153.002908,103.635366,16.38622,0.4964749,0.1328956,50248.19
min,20.0,20.0,20.9,20.0,20.0,20.0,1.0,10.0,-32.0,0.0,...,5.5,2.2,12.5,0.1,34.0,1.0,14.0,0.0,0.0,1.0
25%,72.0,96.0,36.5,107.0,71.0,54.0,15.0,28.0,-3.0,22.0,...,27.0,9.1,27.8,7.6,184.0,126.0,51.68,0.0,0.0,9990.0
50%,83.5,98.0,37.0,121.0,80.0,62.0,18.0,33.0,0.0,24.0,...,30.3,10.3,32.4,10.3,250.0,181.0,64.0,1.0,0.0,19965.0
75%,95.5,99.5,37.5,138.0,92.0,72.0,21.5,38.0,1.0,26.8,...,34.1,11.7,42.8,13.8,349.0,244.0,74.0,1.0,0.0,109878.0
max,280.0,100.0,50.0,300.0,300.0,300.0,100.0,100.0,100.0,55.0,...,71.7,32.0,250.0,440.0,1760.0,2322.0,100.0,1.0,1.0,120000.0


In [10]:
df.isnull().sum()   

HR                   153399
O2Sat                202736
Temp                1026984
SBP                  226265
MAP                  193270
DBP                  486554
Resp                 238335
EtCO2               1494574
BaseExcess          1468065
HCO3                1487182
FiO2                1422845
pH                  1444637
PaCO2               1465909
SaO2                1498649
AST                 1527027
BUN                 1445642
Alkalinephos        1527269
Calcium             1460879
Chloride            1481744
Creatinine          1457594
Bilirubin_direct    1549220
Glucose             1286694
Lactate             1510764
Magnesium           1454259
Phosphate           1489909
Potassium           1407685
Bilirubin_total     1529069
TroponinI           1537429
Hct                 1414777
Hgb                 1437619
PTT                 1506511
WBC                 1452763
Fibrinogen          1541968
Platelets           1460001
Age                       0
Gender              

In [11]:
df.shape

(1552210, 38)

In [12]:
# Cell 4: Drop duplicates
df = df.drop_duplicates()
print("After dropping duplicates:", df.shape)


After dropping duplicates: (1458867, 38)


In [13]:
# Cell 6: Convert types (numeric)
for c in df.columns:
    # Attempt conversion; ignore errors
    df[c] = pd.to_numeric(df[c], errors="coerce")
print("After type conversion: \n", df.dtypes)  

After type conversion: 
 HR                  float64
O2Sat               float64
Temp                float64
SBP                 float64
MAP                 float64
DBP                 float64
Resp                float64
EtCO2               float64
BaseExcess          float64
HCO3                float64
FiO2                float64
pH                  float64
PaCO2               float64
SaO2                float64
AST                 float64
BUN                 float64
Alkalinephos        float64
Calcium             float64
Chloride            float64
Creatinine          float64
Bilirubin_direct    float64
Glucose             float64
Lactate             float64
Magnesium           float64
Phosphate           float64
Potassium           float64
Bilirubin_total     float64
TroponinI           float64
Hct                 float64
Hgb                 float64
PTT                 float64
WBC                 float64
Fibrinogen          float64
Platelets           float64
Age                 flo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = pd.to_numeric(df[c], errors="coerce")


In [14]:
# Ensure df is not a view
df = df.copy()

# Fill missing numeric values with median
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
for c in numeric_cols:
    median = df[c].median()
    df.loc[:, c] = df[c].fillna(median)



In [15]:
for c in numeric_cols:
    lower = df[c].quantile(0.01)
    upper = df[c].quantile(0.99)
    df[c] = np.clip(df[c], lower, upper)

print("✅ Outliers clipped (1st–99th percentile).")


✅ Outliers clipped (1st–99th percentile).


In [16]:
# 2️⃣ Feature Scaling / Normalization
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

print("✅ Numeric features standardized (mean=0, std=1).")

✅ Numeric features standardized (mean=0, std=1).


In [17]:
# 3️⃣ Shuffle Dataset
df = shuffle(df, random_state=42).reset_index(drop=True)
print("✅ Dataset shuffled and index reset.")

✅ Dataset shuffled and index reset.


In [None]:
df.to_csv("federated-sepsis/data/processed/cleaned_dataset.csv", index=False)
print("📁 Saved preprocessed dataset to data/processed/cleaned_scaled_shuffled.csv")

KeyboardInterrupt: 

In [20]:
df.isna().sum()

HR                  0
O2Sat               0
Temp                0
SBP                 0
MAP                 0
DBP                 0
Resp                0
EtCO2               0
BaseExcess          0
HCO3                0
FiO2                0
pH                  0
PaCO2               0
SaO2                0
AST                 0
BUN                 0
Alkalinephos        0
Calcium             0
Chloride            0
Creatinine          0
Bilirubin_direct    0
Glucose             0
Lactate             0
Magnesium           0
Phosphate           0
Potassium           0
Bilirubin_total     0
TroponinI           0
Hct                 0
Hgb                 0
PTT                 0
WBC                 0
Fibrinogen          0
Platelets           0
Age                 0
Gender              0
SepsisLabel         0
Patient_ID          0
dtype: int64