# import libraries

In [61]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer, KNNImputer


# read dataset and clean it

In [3]:
# Load the diabetes dataset from the CSV file that sits next to the notebook.
df = pd.read_csv('diabetes.csv')

# Preview the first five records, transposed so each feature is shown as a row.
df.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
Pregnancies,6.0,1.0,8.0,1.0,0.0
Glucose,148.0,85.0,183.0,89.0,137.0
BloodPressure,72.0,66.0,64.0,66.0,40.0
SkinThickness,35.0,29.0,0.0,23.0,35.0
Insulin,0.0,0.0,0.0,94.0,168.0
BMI,33.6,26.6,23.3,28.1,43.1
DiabetesPedigreeFunction,0.627,0.351,0.672,0.167,2.288
Age,50.0,31.0,32.0,21.0,33.0
Outcome,1.0,0.0,1.0,0.0,1.0


In [4]:
# Print column names, non-null counts and dtypes for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
# Summary statistics per column; check the `min` row for zeros in columns
# where a zero is not a plausible measurement.
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


Count the number of occurrences of $0$s in each column (in these columns a zero stands in for a missing measurement)

In [27]:
# Columns in which a recorded 0 is treated as a missing measurement.
missing_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Number of zero entries per column.
zero_counts = {name: (df[name] == 0).sum() for name in missing_columns}

# Per-column summary: absolute zero count and its share of all rows, in percent.
missing_info = {
    name: {
        'missing_count': n_zeros,
        'missing_percentage': (n_zeros / df[name].shape[0]) * 100,
    }
    for name, n_zeros in zero_counts.items()
}

print("missing data count and percentage:\n")
# One column per feature, one row per statistic.
missing_df = pd.DataFrame(missing_info)
missing_df

missing data count and percentage:



Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI
missing_count,5.0,35.0,227.0,374.0,11.0
missing_percentage,0.651042,4.557292,29.557292,48.697917,1.432292


replace 0s with NaN

In [28]:
# Mark zeros in the affected columns as NaN so the imputers below treat them
# as missing. This rewrites `df` itself: once it has run, a zero count on `df`
# for these columns will report 0.
df[missing_columns] = df[missing_columns].replace(to_replace=0, value=np.nan)

count nan in each column

In [32]:
# NaN count per column (`isna` is the same check as `isnull`).
df.isna().sum()


Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

# Imputations

Strategy A: fill each column's missing values with that column's overall mean

In [35]:
# Work on a copy so the NaN-marked `df` stays available for the other strategies.
df_strategy_A = df.copy()

# One mean imputer per affected column, keyed by column name.
imputers = {column: SimpleImputer(strategy='mean') for column in missing_columns}

imputers

{'Glucose': SimpleImputer(),
 'BloodPressure': SimpleImputer(),
 'SkinThickness': SimpleImputer(),
 'Insulin': SimpleImputer(),
 'BMI': SimpleImputer()}

In [37]:
# Fit each imputer on its own column and write the filled values back.
for name in imputers:
    column_frame = df_strategy_A[[name]]  # 2-D selection, as scikit-learn expects
    df_strategy_A[[name]] = imputers[name].fit_transform(column_frame)

In [38]:
# Every count should now be 0.
df_strategy_A.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [41]:
# Three views of the Glucose mean. Mean imputation fills every gap with the
# column mean, so all three numbers are expected to agree.
glucose_learned_mean = imputers['Glucose'].statistics_[0]  # value the imputer fills in
glucose_mean_before = df['Glucose'].mean()                 # pandas skips NaN by default
glucose_mean_after = df_strategy_A['Glucose'].mean()

# Scalars are printed instead of one-column frames so the output is a single
# number per line, and each line has its own label.
print(f"mean learned by the Glucose imputer: {glucose_learned_mean}")
print(f"mean of Glucose before imputation: {glucose_mean_before}")
print(f"mean of Glucose after imputation: {glucose_mean_after}")

mean of Glucose before imputation: [121.68676278]
mean of Glucose before imputation: Glucose    121.686763
dtype: float64
mean of Glucose after imputation: Glucose    121.686763
dtype: float64


## Why the mean stays the same

Let a column have:

* `n` non-NaN values with mean **μ**
* `k` NaN values

### Before imputation

The mean is computed over non-NaN values only:
$$
\mu = \frac{\sum x_i}{n}
$$

### After `SimpleImputer(strategy='mean')`

* Each NaN is replaced with **μ**
* New total sum:
  $$
  \sum x_i + k\mu
  $$
* New number of values: `n + k`

New mean:
$$
\frac{\sum x_i + k\mu}{n+k}
= \frac{n\mu + k\mu}{n+k}
= \mu
$$


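Strategy B: fill missing values with the column mean of the row's `Outcome` group (this uses the target label, so it cannot be applied to rows whose outcome is unknown)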

In [55]:
# Fresh copy of the NaN-marked data for the class-conditional strategy.
df_strategy_B = df.copy()

# Two mean imputers per column, one for each Outcome group.
# Note: this rebinds `imputers`, replacing the Strategy A dictionary.
imputers = {
    column: {
        'Outcome_0': SimpleImputer(strategy='mean'),
        'Outcome_1': SimpleImputer(strategy='mean'),
    }
    for column in missing_columns
}

imputers

{'Glucose': {'Outcome_0': SimpleImputer(), 'Outcome_1': SimpleImputer()},
 'BloodPressure': {'Outcome_0': SimpleImputer(), 'Outcome_1': SimpleImputer()},
 'SkinThickness': {'Outcome_0': SimpleImputer(), 'Outcome_1': SimpleImputer()},
 'Insulin': {'Outcome_0': SimpleImputer(), 'Outcome_1': SimpleImputer()},
 'BMI': {'Outcome_0': SimpleImputer(), 'Outcome_1': SimpleImputer()}}

In [56]:
# Boolean row selectors for the two Outcome groups.
outcome_0_mask = df_strategy_B['Outcome'].eq(0)
outcome_1_mask = df_strategy_B['Outcome'].eq(1)

# Imputer key -> rows that imputer is responsible for.
group_masks = {'Outcome_0': outcome_0_mask, 'Outcome_1': outcome_1_mask}

for col in missing_columns:
    for group_key, group_mask in group_masks.items():
        # Fit on this group's rows only, then write the filled values back to
        # those same rows.
        group_values = df_strategy_B.loc[group_mask, [col]]
        df_strategy_B.loc[group_mask, col] = imputers[col][group_key].fit_transform(group_values)

In [60]:
# For each Outcome group, compare the mean its imputer learned (i.e. the mean
# before imputation) with the mean of the filled column. Each line names the
# group it refers to, and scalars are printed rather than one-column frames.
for outcome_label, outcome_key, outcome_mask in (
    (0, 'Outcome_0', outcome_0_mask),
    (1, 'Outcome_1', outcome_1_mask),
):
    learned_mean = imputers['Glucose'][outcome_key].statistics_[0]
    filled_mean = df_strategy_B.loc[outcome_mask, 'Glucose'].mean()
    print(f"mean of Glucose for Outcome {outcome_label} before imputation: {learned_mean}")
    print(f"mean of Glucose for Outcome {outcome_label} after imputation: {filled_mean}")

mean of Glucose for Outcome 0 before imputation: [110.64386318]
mean of Glucose for Outcome 1 before imputation: [142.31954887]
mean of Glucose after imputation: Glucose    110.643863
dtype: float64
mean of Glucose after imputation: Glucose    142.319549
dtype: float64


Strategy C: KNN imputation (fill each missing value from the most similar rows)

In [63]:
# Fresh copy of the NaN-marked data for the KNN strategy.
df_strategy_C = df.copy()

# One 5-neighbour KNN imputer per affected column, keyed by column name.
strategy_C_imputers = {column: KNNImputer(n_neighbors=5) for column in missing_columns}

strategy_C_imputers

{'Glucose': KNNImputer(),
 'BloodPressure': KNNImputer(),
 'SkinThickness': KNNImputer(),
 'Insulin': KNNImputer(),
 'BMI': KNNImputer()}

In [66]:
# KNN imputation needs *other* features to decide which rows are similar.
# Fitting on a single column leaves a row with a missing value nothing to
# measure distance on, and scikit-learn then falls back to the column mean,
# which would make this strategy identical to Strategy A. Each imputer is
# therefore fitted on all feature columns together.
#
# `Outcome` is left out so the target label does not influence the features.
knn_feature_columns = [name for name in df_strategy_C.columns if name != 'Outcome']

# Snapshot taken before any column is filled, so every imputer sees the same
# NaN pattern regardless of loop order.
knn_input = df_strategy_C[knn_feature_columns].copy()

# NOTE(review): distances are computed on raw units, so wide-range columns
# (e.g. Insulin) dominate the neighbour search; consider standardising the
# features before imputing.
for column, imputer in strategy_C_imputers.items():
    # One joint fit would give the same numbers; the per-column imputers are
    # kept so `strategy_C_imputers` still holds a fitted imputer per column.
    imputed_features = imputer.fit_transform(knn_input)
    df_strategy_C[column] = imputed_features[:, knn_feature_columns.index(column)]

In [67]:
# Every count should now be 0.
df_strategy_C.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

**KNN imputation** differs from mean/median imputation because it **uses other rows to estimate missing values**.

---

## 1️⃣ Core Idea

**KNN imputation** fills a missing value by looking at the **most similar rows** (neighbors) and averaging their values.

* "Similarity" = usually **Euclidean distance** (for numeric columns)
* Number of neighbors = **`n_neighbors` parameter** (default 5)

---

## 2️⃣ Step-by-Step Process

Let’s say a dataset:

| A   | B | C   |
| --- | - | --- |
| 1   | 5 | NaN |
| 2   | 6 | 3   |
| 1   | 4 | 2   |
| NaN | 5 | 1   |

We want to **impute the `NaN` in row 0, column C** (rows are numbered from 0).

### Step 1: Find neighbors

* Compute distance **only on columns without NaN in the target row**
* Example: row 0 (`[1,5,NaN]`) → compare using columns `A` and `B` only
* Compute distance to other rows:

  * Row 1: `[2,6]` → distance = √((2-1)² + (6-5)²) = √2
  * Row 2: `[1,4]` → distance = √((1-1)² + (4-5)²) = 1
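  * Row 3: `[NaN,5]` → A is missing, so only B can be compared. sklearn's default `nan_euclidean` metric would do exactly that (and rescale the result); this simplified walkthrough skips the row instead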

---

### Step 2: Select `k` nearest neighbors

* Suppose `n_neighbors = 2` → pick the 2 closest rows with non-missing C
* From above: rows 1 (`C=3`) and 2 (`C=2`)

---

### Step 3: Compute imputed value

* Default = **unweighted average of the neighbors' values** (`weights='uniform'`); with `weights='distance'` closer neighbors count more
  $$
  C_{\text{imputed}} = \frac{3 + 2}{2} = 2.5
  $$

---

### Step 4: Replace missing value

* Row 0, column C → 2.5

---

## 3️⃣ Notes on sklearn implementation

```python
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=3, weights='uniform')  # or 'distance'
X_imputed = imputer.fit_transform(X)
```

### Parameters:

| Parameter     | Meaning                                                               |
| ------------- | --------------------------------------------------------------------- |
| `n_neighbors` | Number of neighbors to consider                                       |
| `weights`     | `'uniform'` = simple mean, `'distance'` = closer neighbors count more |
| `metric`      | Distance metric, default `'nan_euclidean'`                            |

**`nan_euclidean`**:

* Computes distance ignoring columns where either row has NaN, and scales up the weight of the remaining columns to compensate
* Ensures missing values don’t break distance calculation

---

## 4️⃣ Advantages

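* Uses **relationships between columns** → often more accurate than mean/median, provided the imputer is fitted on several columns at once (with a single column there is nothing to measure similarity on)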
* Can handle **mixed missing patterns**
* Can be **weighted** by distance

---

## 5️⃣ Disadvantages

* **Slow** on large datasets (needs distance calculation for each missing row)
* Works best for **numeric data**; categorical requires encoding
* Can be sensitive to **outliers**

---

## 6️⃣ Quick visual intuition

```
Row with NaN:  [ ?  ,  5 ,  ? ]
Neighbors:      [2, 6, 3], [1, 4, 2], [3,5,1]

Compute distance using available features → pick k nearest → fill missing with neighbor average
```


