<a href="https://colab.research.google.com/github/Venkatesh3152/machine-learning/blob/main/ML_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
dataset = pd.read_csv('/content/kidney_disease.csv')

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [4]:
dataset.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd


In [5]:
dataset.columns

Index(['id', 'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
       'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [6]:
dataset[["al"]].value_counts()

Unnamed: 0_level_0,count
al,Unnamed: 1_level_1
0.0,199
1.0,44
2.0,43
3.0,43
4.0,24
5.0,1


In [7]:
dataset[["al"]].isnull().sum()

Unnamed: 0,0
al,46


In [8]:
from sklearn.preprocessing import StandardScaler
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["al"]])

In [9]:
# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

In [10]:
# Step 3: Inverse scaling back to original scale
data_imputed = scaler.inverse_transform(data_imputed)

In [11]:
data_imputed

array([[1.        ],
       [4.        ],
       [2.        ],
       [4.        ],
       [2.        ],
       [3.        ],
       [0.        ],
       [2.        ],
       [3.        ],
       [2.        ],
       [2.        ],
       [3.        ],
       [3.        ],
       [1.01694915],
       [3.        ],
       [3.        ],
       [2.        ],
       [1.01694915],
       [0.        ],
       [1.        ],
       [2.        ],
       [1.01694915],
       [4.        ],
       [0.        ],
       [4.        ],
       [0.        ],
       [0.        ],
       [3.        ],
       [1.        ],
       [1.        ],
       [1.01694915],
       [3.        ],
       [1.        ],
       [2.        ],
       [1.        ],
       [2.        ],
       [1.        ],
       [1.01694915],
       [3.        ],
       [2.        ],
       [2.        ],
       [0.        ],
       [0.        ],
       [1.        ],
       [3.        ],
       [3.        ],
       [0.        ],
       [3.   

In [12]:
# Step 4: Convert back to DataFrame
imputed_df = pd.DataFrame(data_imputed, columns=["al"])

In [13]:
imputed_df

Unnamed: 0,al
0,1.0
1,4.0
2,2.0
3,4.0
4,2.0
...,...
395,0.0
396,0.0
397,0.0
398,0.0


In [14]:
imputed_df.value_counts()

Unnamed: 0_level_0,count
al,Unnamed: 1_level_1
0.0,199
1.016949,46
1.0,44
2.0,43
3.0,43
4.0,24
5.0,1


In [15]:

import numpy as np

In [16]:
# Round & clip 'al'
dataset[["al"]] = np.clip(np.round(imputed_df), 0, 5).astype(int)

In [17]:
dataset[["al"]].value_counts()

Unnamed: 0_level_0,count
al,Unnamed: 1_level_1
0,199
1,90
2,43
3,43
4,24
5,1


In [18]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              400 non-null    int64  
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [19]:
from sklearn.preprocessing import StandardScaler
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["age"]])

In [20]:
# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

In [21]:

data_imputed

array([[-2.03139024e-01],
       [-2.59412412e+00],
       [ 6.13294910e-01],
       [-2.03139024e-01],
       [-2.81888954e-02],
       [ 4.96661491e-01],
       [ 9.63195168e-01],
       [-1.60274005e+00],
       [ 3.01278142e-02],
       [ 8.84445238e-02],
       [-8.65056050e-02],
       [ 6.71611620e-01],
       [ 9.63195168e-01],
       [ 9.63195168e-01],
       [ 9.63195168e-01],
       [-6.69672701e-01],
       [-2.61455734e-01],
       [-2.61455734e-01],
       [ 4.96661491e-01],
       [ 6.13294910e-01],
       [ 5.54978201e-01],
       [ 4.96661491e-01],
       [-2.03139024e-01],
       [-1.77769018e+00],
       [-5.53039282e-01],
       [ 5.54978201e-01],
       [ 1.37141214e+00],
       [ 1.02151188e+00],
       [ 1.37141214e+00],
       [ 9.63195168e-01],
       [ 9.99484667e-17],
       [ 1.25477872e+00],
       [ 5.54978201e-01],
       [ 4.96661491e-01],
       [ 1.07982859e+00],
       [ 7.88245039e-01],
       [ 1.42972884e+00],
       [ 1.19646201e+00],
       [ 1.0

In [22]:
# Step 3: Inverse scaling back to original scale
data_imputed = scaler.inverse_transform(data_imputed)

In [23]:
data_imputed

array([[48.        ],
       [ 7.        ],
       [62.        ],
       [48.        ],
       [51.        ],
       [60.        ],
       [68.        ],
       [24.        ],
       [52.        ],
       [53.        ],
       [50.        ],
       [63.        ],
       [68.        ],
       [68.        ],
       [68.        ],
       [40.        ],
       [47.        ],
       [47.        ],
       [60.        ],
       [62.        ],
       [61.        ],
       [60.        ],
       [48.        ],
       [21.        ],
       [42.        ],
       [61.        ],
       [75.        ],
       [69.        ],
       [75.        ],
       [68.        ],
       [51.48337596],
       [73.        ],
       [61.        ],
       [60.        ],
       [70.        ],
       [65.        ],
       [76.        ],
       [72.        ],
       [69.        ],
       [82.        ],
       [46.        ],
       [45.        ],
       [47.        ],
       [35.        ],
       [54.        ],
       [54

In [24]:
# Step 4: Convert back to DataFrame
imputed_df = pd.DataFrame(data_imputed, columns=["age"])

In [25]:
imputed_df

Unnamed: 0,age
0,48.0
1,7.0
2,62.0
3,48.0
4,51.0
...,...
395,55.0
396,42.0
397,12.0
398,17.0


In [26]:
dataset[["age"]] =np.round(imputed_df).astype(int)

In [27]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48,80.0,1.020,1,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7,50.0,1.020,4,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62,80.0,1.010,2,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48,70.0,1.005,4,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51,80.0,1.010,2,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80.0,1.020,0,0.0,normal,normal,notpresent,notpresent,140.0,49.0,0.5,150.0,4.9,15.7,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42,70.0,1.025,0,0.0,normal,normal,notpresent,notpresent,75.0,31.0,1.2,141.0,3.5,16.5,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12,80.0,1.020,0,0.0,normal,normal,notpresent,notpresent,100.0,26.0,0.6,137.0,4.4,15.8,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17,60.0,1.025,0,0.0,normal,normal,notpresent,notpresent,114.0,50.0,1.0,135.0,4.9,14.2,51,7200,5.9,no,no,no,good,no,no,notckd


In [28]:
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["bp"]])

In [29]:
# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

In [30]:
data_imputed

array([[ 2.58373317e-01],
       [-1.93685691e+00],
       [ 2.58373317e-01],
       [-4.73370093e-01],
       [ 2.58373317e-01],
       [ 9.90116727e-01],
       [-4.73370093e-01],
       [-2.38068442e-16],
       [ 1.72186014e+00],
       [ 9.90116727e-01],
       [-1.20511350e+00],
       [-4.73370093e-01],
       [-4.73370093e-01],
       [-4.73370093e-01],
       [ 2.58373317e-01],
       [ 2.58373317e-01],
       [-4.73370093e-01],
       [ 2.58373317e-01],
       [ 1.72186014e+00],
       [-1.20511350e+00],
       [ 2.58373317e-01],
       [ 9.90116727e-01],
       [ 2.58373317e-01],
       [-4.73370093e-01],
       [ 1.72186014e+00],
       [-1.20511350e+00],
       [ 2.58373317e-01],
       [-4.73370093e-01],
       [-4.73370093e-01],
       [-4.73370093e-01],
       [-4.73370093e-01],
       [ 9.90116727e-01],
       [ 9.90116727e-01],
       [ 1.72186014e+00],
       [-4.73370093e-01],
       [ 9.90116727e-01],
       [-4.73370093e-01],
       [ 2.58373317e-01],
       [ 2.5

In [31]:
# Step 3: Inverse scaling back to original scale
data_imputed = scaler.inverse_transform(data_imputed)

In [32]:
data_imputed

array([[ 80.        ],
       [ 50.        ],
       [ 80.        ],
       [ 70.        ],
       [ 80.        ],
       [ 90.        ],
       [ 70.        ],
       [ 76.46907216],
       [100.        ],
       [ 90.        ],
       [ 60.        ],
       [ 70.        ],
       [ 70.        ],
       [ 70.        ],
       [ 80.        ],
       [ 80.        ],
       [ 70.        ],
       [ 80.        ],
       [100.        ],
       [ 60.        ],
       [ 80.        ],
       [ 90.        ],
       [ 80.        ],
       [ 70.        ],
       [100.        ],
       [ 60.        ],
       [ 80.        ],
       [ 70.        ],
       [ 70.        ],
       [ 70.        ],
       [ 70.        ],
       [ 90.        ],
       [ 90.        ],
       [100.        ],
       [ 70.        ],
       [ 90.        ],
       [ 70.        ],
       [ 80.        ],
       [ 80.        ],
       [ 80.        ],
       [ 90.        ],
       [ 70.        ],
       [100.        ],
       [ 80

In [33]:
# Step 4: Convert back to DataFrame
imputed_df = pd.DataFrame(data_imputed, columns=["bp"])

In [34]:
imputed_df

Unnamed: 0,bp
0,80.0
1,50.0
2,80.0
3,70.0
4,80.0
...,...
395,80.0
396,70.0
397,80.0
398,60.0


In [35]:
dataset[["bp"]] =np.round(imputed_df).astype(int)

In [36]:
dataset[["bp"]].value_counts()

Unnamed: 0_level_0,count
bp,Unnamed: 1_level_1
80,116
70,112
60,71
90,53
100,25
76,12
50,5
110,3
120,1
140,1


In [37]:
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["sg"]])

In [38]:
# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

In [39]:
data_imputed

array([[ 4.54070529e-01],
       [ 4.54070529e-01],
       [-1.29769883e+00],
       [-2.17358351e+00],
       [-1.29769883e+00],
       [-4.21814152e-01],
       [-1.29769883e+00],
       [-4.21814152e-01],
       [-4.21814152e-01],
       [ 4.54070529e-01],
       [-1.29769883e+00],
       [-1.29769883e+00],
       [-4.21814152e-01],
       [ 2.41544273e-15],
       [-1.29769883e+00],
       [-4.21814152e-01],
       [-4.21814152e-01],
       [ 2.41544273e-15],
       [ 1.32995521e+00],
       [-4.21814152e-01],
       [-4.21814152e-01],
       [ 2.41544273e-15],
       [ 1.32995521e+00],
       [-1.29769883e+00],
       [-4.21814152e-01],
       [ 1.32995521e+00],
       [-4.21814152e-01],
       [-1.29769883e+00],
       [ 2.41544273e-15],
       [-2.17358351e+00],
       [ 2.41544273e-15],
       [-4.21814152e-01],
       [-1.29769883e+00],
       [ 4.54070529e-01],
       [-1.29769883e+00],
       [ 4.54070529e-01],
       [-4.21814152e-01],
       [ 2.41544273e-15],
       [ 4.5

In [40]:
data_imputed = scaler.inverse_transform(data_imputed)

In [41]:
imputed_df

Unnamed: 0,bp
0,80.0
1,50.0
2,80.0
3,70.0
4,80.0
...,...
395,80.0
396,70.0
397,80.0
398,60.0


In [42]:
dataset[["sg"]].value_counts()

Unnamed: 0_level_0,count
sg,Unnamed: 1_level_1
1.02,106
1.01,84
1.025,81
1.015,75
1.005,7


In [43]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48,80,1.020,1,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7,50,1.020,4,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62,80,1.010,2,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48,70,1.005,4,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51,80,1.010,2,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,0.0,normal,normal,notpresent,notpresent,140.0,49.0,0.5,150.0,4.9,15.7,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42,70,1.025,0,0.0,normal,normal,notpresent,notpresent,75.0,31.0,1.2,141.0,3.5,16.5,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12,80,1.020,0,0.0,normal,normal,notpresent,notpresent,100.0,26.0,0.6,137.0,4.4,15.8,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17,60,1.025,0,0.0,normal,normal,notpresent,notpresent,114.0,50.0,1.0,135.0,4.9,14.2,51,7200,5.9,no,no,no,good,no,no,notckd


In [44]:
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["su"]])

In [45]:
# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

In [46]:
data_imputed

array([[-0.41010611],
       [-0.41010611],
       [ 2.32306944],
       [-0.41010611],
       [-0.41010611],
       [-0.41010611],
       [-0.41010611],
       [ 3.23412796],
       [-0.41010611],
       [-0.41010611],
       [ 3.23412796],
       [-0.41010611],
       [ 0.5009524 ],
       [ 0.        ],
       [ 1.41201092],
       [-0.41010611],
       [-0.41010611],
       [ 0.        ],
       [ 2.32306944],
       [-0.41010611],
       [-0.41010611],
       [ 0.        ],
       [-0.41010611],
       [-0.41010611],
       [-0.41010611],
       [-0.41010611],
       [-0.41010611],
       [ 3.23412796],
       [ 2.32306944],
       [-0.41010611],
       [ 0.        ],
       [-0.41010611],
       [ 0.5009524 ],
       [-0.41010611],
       [-0.41010611],
       [ 0.5009524 ],
       [-0.41010611],
       [ 0.        ],
       [-0.41010611],
       [ 1.41201092],
       [-0.41010611],
       [-0.41010611],
       [-0.41010611],
       [-0.41010611],
       [-0.41010611],
       [-0

In [47]:
# Step 3: Inverse scaling back to original scale
data_imputed = scaler.inverse_transform(data_imputed)

In [48]:
data_imputed

array([[0.        ],
       [0.        ],
       [3.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [4.        ],
       [0.        ],
       [0.        ],
       [4.        ],
       [0.        ],
       [1.        ],
       [0.45014245],
       [2.        ],
       [0.        ],
       [0.        ],
       [0.45014245],
       [3.        ],
       [0.        ],
       [0.        ],
       [0.45014245],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [4.        ],
       [3.        ],
       [0.        ],
       [0.45014245],
       [0.        ],
       [1.        ],
       [0.        ],
       [0.        ],
       [1.        ],
       [0.        ],
       [0.45014245],
       [0.        ],
       [2.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.   

In [49]:
# Step 4: Convert back to DataFrame
imputed_df = pd.DataFrame(data_imputed, columns=["su"])

In [50]:
imputed_df.value_counts()

Unnamed: 0_level_0,count
su,Unnamed: 1_level_1
0.0,290
0.450142,49
2.0,18
3.0,14
1.0,13
4.0,13
5.0,3


In [51]:
dataset[["su"]] =np.round(imputed_df).astype(int)

In [52]:
dataset[["su"]].value_counts()

Unnamed: 0_level_0,count
su,Unnamed: 1_level_1
0,339
2,18
3,14
1,13
4,13
5,3


In [53]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48,80,1.020,1,0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7,50,1.020,4,0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62,80,1.010,2,3,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48,70,1.005,4,0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51,80,1.010,2,0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,0,normal,normal,notpresent,notpresent,140.0,49.0,0.5,150.0,4.9,15.7,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42,70,1.025,0,0,normal,normal,notpresent,notpresent,75.0,31.0,1.2,141.0,3.5,16.5,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12,80,1.020,0,0,normal,normal,notpresent,notpresent,100.0,26.0,0.6,137.0,4.4,15.8,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17,60,1.025,0,0,normal,normal,notpresent,notpresent,114.0,50.0,1.0,135.0,4.9,14.2,51,7200,5.9,no,no,no,good,no,no,notckd


In [55]:
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["bu"]])

In [56]:
# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

In [57]:
# Step 3: Inverse scaling back to original scale
data_imputed = scaler.inverse_transform(data_imputed)

In [58]:
data_imputed

array([[ 36.        ],
       [ 18.        ],
       [ 53.        ],
       [ 56.        ],
       [ 26.        ],
       [ 25.        ],
       [ 54.        ],
       [ 31.        ],
       [ 60.        ],
       [107.        ],
       [ 55.        ],
       [ 60.        ],
       [ 72.        ],
       [ 86.        ],
       [ 90.        ],
       [162.        ],
       [ 46.        ],
       [ 87.        ],
       [ 27.        ],
       [ 31.        ],
       [148.        ],
       [180.        ],
       [163.        ],
       [ 57.42572178],
       [ 50.        ],
       [ 75.        ],
       [ 45.        ],
       [ 87.        ],
       [ 31.        ],
       [ 28.        ],
       [155.        ],
       [ 33.        ],
       [ 39.        ],
       [ 55.        ],
       [153.        ],
       [ 39.        ],
       [ 29.        ],
       [ 65.        ],
       [103.        ],
       [ 70.        ],
       [ 80.        ],
       [ 20.        ],
       [ 29.        ],
       [202

In [59]:
# Step 4: Convert back to DataFrame
imputed_df = pd.DataFrame(data_imputed, columns=["bu"])

In [60]:
imputed_df.value_counts()

Unnamed: 0_level_0,count
bu,Unnamed: 1_level_1
57.425722,19
46.000000,15
25.000000,13
19.000000,11
40.000000,10
...,...
235.000000,1
241.000000,1
309.000000,1
322.000000,1


In [61]:
dataset[["bu"]] =np.round(imputed_df).astype(int)

In [62]:
dataset[["bu"]].value_counts()

Unnamed: 0_level_0,count
bu,Unnamed: 1_level_1
57,20
46,15
25,13
19,11
50,10
...,...
235,1
241,1
309,1
322,1


In [63]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48,80,1.020,1,0,,normal,notpresent,notpresent,121.0,36,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7,50,1.020,4,0,,normal,notpresent,notpresent,,18,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62,80,1.010,2,3,normal,normal,notpresent,notpresent,423.0,53,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48,70,1.005,4,0,normal,abnormal,present,notpresent,117.0,56,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51,80,1.010,2,0,normal,normal,notpresent,notpresent,106.0,26,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,0,normal,normal,notpresent,notpresent,140.0,49,0.5,150.0,4.9,15.7,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42,70,1.025,0,0,normal,normal,notpresent,notpresent,75.0,31,1.2,141.0,3.5,16.5,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12,80,1.020,0,0,normal,normal,notpresent,notpresent,100.0,26,0.6,137.0,4.4,15.8,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17,60,1.025,0,0,normal,normal,notpresent,notpresent,114.0,50,1.0,135.0,4.9,14.2,51,7200,5.9,no,no,no,good,no,no,notckd
