In [5]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Pre-processing the dataset
# --------------------------
# Reads the raw CSV, marks invalid zeros as missing, standardises every
# feature, fills the gaps with KNN imputation and writes the result
# (features plus the untouched "Outcome" target) to the processed-data folder.

# Input/output locations, relative to the working directory of this notebook.
RAW_CSV = Path("../../data/raw/FC212039_Healthcare-Diabetes.csv")
PROCESSED_CSV = Path(
    "../../data/processed/FC212039_Healthcare-Diabetes-Preprocessed.csv"
)

# Loading the raw dataset; the "Id" column is dropped before anything else.
df = pd.read_csv(RAW_CSV).drop(columns=["Id"])

# Checking for duplicates: every repeated row (all occurrences after the
# first) is collected in a named frame so it can actually be inspected.
# The rows are NOT removed from df.
duplicate_rows = df.loc[df.duplicated(keep="first")]

# Separating the features (X); the target "Outcome" is re-attached unchanged
# once the features have been preprocessed.
X = df.drop(columns="Outcome")

# Column names where 0 is invalid and is treated as a missing value.
cols_with_zeros = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Replacing 0s with np.nan so the imputer below sees them as missing.
X[cols_with_zeros] = X[cols_with_zeros].replace(0, np.nan)

# Wide range differences in the dataset:
#   Insulin: 0 – 846
#   Glucose: 0 – 199
#   BMI: 0 – 80.6
#   DiabetesPedigreeFunction: 0.078 – 2.42
# Therefore scaling comes first and imputation second.
# StandardScaler (z-score) and KNN imputation were selected (for the
# LogisticRegression model).
# StandardScaler disregards NaNs when fitting and keeps them in transform, so
# the missing entries reach KNNImputer intact.
# NOTE(review): the pipeline is fitted on the full dataset; if a train/test
# split is applied downstream, fit on the training part only - confirm.
prepdata_pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("imputer", KNNImputer(n_neighbors=5)),
    ]
)

# Fitting and transforming the features (X); the result is a plain ndarray.
X_prep = prepdata_pipeline.fit_transform(X)

# Converting back to a DataFrame with the original column names.
X_prep = pd.DataFrame(X_prep, columns=X.columns)

# Adding back the target column (positionally, hence the raw array).
X_prep["Outcome"] = df["Outcome"].to_numpy()

# Displaying the first few rows.
print(X_prep.head())

# Saving to a new CSV file; create the target folder first so a fresh
# checkout without data/processed/ does not fail on write.
PROCESSED_CSV.parent.mkdir(parents=True, exist_ok=True)
X_prep.to_csv(PROCESSED_CSV, index=False)

print(f"Preprocessed dataset saved as '{PROCESSED_CSV.name}'")


   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0     0.679232  0.853233      -0.032943       0.533241 -0.175091  0.140273   
1    -0.825341 -1.205921      -0.522090      -0.027046 -0.895302 -0.838377   
2     1.281062  1.997207      -0.685139       0.514564  1.162948 -1.299740   
3    -0.825341 -1.075181      -0.522090      -0.587333 -0.531666 -0.628666   
4    -1.126256  0.493698      -2.641726       0.533241  0.121466  1.468441   

   DiabetesPedigreeFunction       Age  Outcome  
0                  0.478509  1.432495        1  
1                 -0.369130 -0.181079        0  
2                  0.616712 -0.096154        1  
3                 -0.934224 -1.030329        0  
4                  5.579704 -0.011229        1  
Preprocessed dataset saved as 'FC212039_Healthcare-Diabetes-Preprocessed.csv'
