### Load and Inspect

In [1]:
import pandas as pd
import numpy as np

# Load the raw dataset from the working directory.
df = pd.read_csv("diabetes.csv")

# Report (rows, columns), then preview the first five records.
print(df.shape)
df.head(n=5)

(768, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Dataset summary:

*   Features (numeric): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age
*   Target: Outcome (0 = no diabetes, 1 = diabetes)

### Lab1.1 – Handle Missing Data

In [2]:
from sklearn.impute import SimpleImputer

# Columns in which a value of 0 is treated as a missing measurement.
cols_with_missing = ["Glucose","BloodPressure","SkinThickness","Insulin","BMI"]

# Step 1: turn the placeholder zeros into NaN so they are counted as missing.
df[cols_with_missing] = df[cols_with_missing].replace(0, np.nan)

# Missing-value count for every column, before imputation.
print(df.isna().sum())

# Step 2: fill each column's NaNs with that column's median.
imputer = SimpleImputer(strategy="median")
df[cols_with_missing] = imputer.fit_transform(df[cols_with_missing])

# Missing-value count for the imputed columns, after imputation.
print(df[cols_with_missing].isna().sum())

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64
Glucose          0
BloodPressure    0
SkinThickness    0
Insulin          0
BMI              0
dtype: int64


### Lab1.2 – Binning

In [3]:
# Equal-frequency binning: Age is cut at its quartiles into 4 labelled groups.
age_labels = ["Young","Adult","Middle","Senior"]
df["Age_bin"] = pd.qcut(df["Age"], q=4, labels=age_labels)

# Equal-width binning: the Glucose range is split into 4 intervals of equal width.
glucose_labels = ["Low","MedLow","MedHigh","High"]
df["Glucose_bin"] = pd.cut(df["Glucose"], bins=4, labels=glucose_labels)

# Show the source values next to the bins they were assigned to.
df[["Age","Age_bin","Glucose","Glucose_bin"]].head(10)

Unnamed: 0,Age,Age_bin,Glucose,Glucose_bin
0,50,Senior,148.0,MedHigh
1,31,Middle,85.0,MedLow
2,32,Middle,183.0,High
3,21,Young,89.0,MedLow
4,33,Middle,137.0,MedHigh
5,30,Middle,116.0,MedLow
6,26,Adult,78.0,Low
7,29,Adult,115.0,MedLow
8,53,Senior,197.0,High
9,54,Senior,125.0,MedHigh


### Lab1.3 – Remove Outliers (IQR method)

In [4]:
# IQR outlier removal function
def remove_outliers_iqr(data, column, factor=1.5):
    """Return the rows of ``data`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``data[column]``. Both fences are
    inclusive. Rows where the value is NaN fail the range check and are
    therefore not returned.

    Parameters
    ----------
    data : DataFrame to filter; it is not modified.
    column : label of the column the fences are computed from.
    factor : multiplier applied to the IQR (default 1.5).

    Returns
    -------
    The filtered rows of ``data``, keeping their original index labels.
    """
    values = data[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    margin = factor * (third_quartile - first_quartile)
    # Series.between is inclusive on both ends by default.
    within_fences = values.between(first_quartile - margin, third_quartile + margin)
    return data[within_fences]

# Columns filtered for outliers, in this order.
outlier_cols = ["Glucose","BloodPressure","SkinThickness","Insulin","BMI"]

# Row count before the next filter is applied.
before = len(df)

for col in outlier_cols:
    # Each pass filters the frame left by the previous column, so the
    # quartiles are computed on the already-reduced data.
    # NOTE(review): the recorded run drops 310 rows at Insulin; presumably the
    # earlier median imputation narrows that column's IQR — confirm this is intended.
    df = remove_outliers_iqr(df, col)
    after = len(df)
    print(f"Removed {before - after} rows from {col} (current shape = {df.shape})")
    before = after

print("\n Outlier removal complete.")
print(f"Final dataset shape: {df.shape}")

Removed 0 rows from Glucose (current shape = (768, 11))
Removed 14 rows from BloodPressure (current shape = (754, 11))
Removed 85 rows from SkinThickness (current shape = (669, 11))
Removed 310 rows from Insulin (current shape = (359, 11))
Removed 5 rows from BMI (current shape = (354, 11))

 Outlier removal complete.
Final dataset shape: (354, 11)


### Lab1.4 – Normalization / Scaling

In [5]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Everything except the target and the two bin columns gets scaled.
feature_df = df.drop(columns=["Outcome","Age_bin","Glucose_bin"])

# Min-max scale the features and name each result "<original>_scaled".
scaler = MinMaxScaler()
scaled = scaler.fit_transform(feature_df)
scaled_df = pd.DataFrame(scaled, columns=feature_df.columns).add_suffix("_scaled")

# scaled_df has a fresh 0..n-1 index, so df's index is reset first to make
# the column-wise concat line up row by row.
# Note: df is overwritten here, so running this cell a second time would also
# scale the *_scaled columns that the first run appended.
df = pd.concat([df.reset_index(drop=True), scaled_df], axis=1)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Age_bin,Glucose_bin,Pregnancies_scaled,Glucose_scaled,BloodPressure_scaled,SkinThickness_scaled,Insulin_scaled,BMI_scaled,DiabetesPedigreeFunction_scaled,Age_scaled
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50,1,Senior,MedHigh,0.428571,0.679739,0.428571,0.740741,0.0,0.518519,0.302479,0.568627
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,0,Middle,MedLow,0.071429,0.267974,0.321429,0.518519,0.0,0.282828,0.150413,0.196078
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32,1,Middle,High,0.571429,0.908497,0.285714,0.518519,0.0,0.171717,0.327273,0.215686
3,5,116.0,74.0,29.0,125.0,25.6,0.201,30,0,Middle,MedLow,0.357143,0.470588,0.464286,0.518519,0.0,0.249158,0.067769,0.176471
4,10,115.0,72.0,29.0,125.0,35.3,0.134,29,0,Adult,MedLow,0.714286,0.464052,0.428571,0.518519,0.0,0.575758,0.030854,0.156863


### Save Preprocessed Dataset

In [6]:
# Write the preprocessed frame to disk; index=False leaves the row index out of the CSV.
output_path = "diabetes_preprocessed.csv"
df.to_csv(output_path, index=False)

# google.colab can only be imported on Colab. Elsewhere the CSV has already
# been written to the working directory, so skip the browser download instead
# of failing the whole cell with an import error.
try:
    from google.colab import files
except ImportError:
    print(f"Saved {output_path} (not running on Colab, download skipped)")
else:
    files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>