In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("data/ckd.csv")

# Display first few rows
print(df.head())

# Check data types and missing values
print(df.info())
print(df.isnull().sum())


   id   age    bp     sg   al   su     rbc        pc         pcc          ba  \
0   0  48.0  80.0  1.020  1.0  0.0     NaN    normal  notpresent  notpresent   
1   1   7.0  50.0  1.020  4.0  0.0     NaN    normal  notpresent  notpresent   
2   2  62.0  80.0  1.010  2.0  3.0  normal    normal  notpresent  notpresent   
3   3  48.0  70.0  1.005  4.0  0.0  normal  abnormal     present  notpresent   
4   4  51.0  80.0  1.010  2.0  0.0  normal    normal  notpresent  notpresent   

   ...  pcv    wc   rc  htn   dm  cad appet   pe  ane classification  
0  ...   44  7800  5.2  yes  yes   no  good   no   no            ckd  
1  ...   38  6000  NaN   no   no   no  good   no   no            ckd  
2  ...   31  7500  NaN   no  yes   no  poor   no  yes            ckd  
3  ...   32  6700  3.9  yes   no   no  poor  yes  yes            ckd  
4  ...   35  7300  4.6   no   no   no  good   no   no            ckd  

[5 rows x 26 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 39

In [2]:
# Fill missing numerical values with column mean
num_cols = df.select_dtypes(include=['number']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())


In [3]:
# Fill missing categorical values with mode
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].apply(lambda x: x.fillna(x.mode()[0]))


In [4]:
# Strip whitespace from classification labels
df["classification"] = df["classification"].str.strip()

# Replace text labels with numerical values
df["classification"] = df["classification"].replace({"ckd": 1, "notckd": 0, "ckd\t": 1})

  df["classification"] = df["classification"].replace({"ckd": 1, "notckd": 0, "ckd\t": 1})


In [5]:
from sklearn.preprocessing import LabelEncoder

# Convert categorical columns to numeric
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoders for later use


In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])


In [7]:
df.to_csv("data/cleaned_ckd.csv", index=False)
print("Data preprocessing completed and saved as cleaned_ckd.csv")


Data preprocessing completed and saved as cleaned_ckd.csv
