In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler


# **Dataset of Diabetes**

In [None]:
# Read the raw diabetes records. The file name contains a space before the
# ".csv" extension, so it has to be kept exactly as written.
df_diabetes = pd.read_csv(filepath_or_buffer="Dataset of Diabetes .csv")

# Print the first five rows as a quick sanity check on the parsed columns.
print(df_diabetes.head(n=5))

    ID  No_Pation Gender  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL  \
0  502      17975      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
1  735      34221      M   26   4.5  62    4.9   3.7  1.4  1.1  2.1   0.6   
2  420      47975      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
3  680      87656      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
4  504      34223      M   33   7.1  46    4.9   4.9  1.0  0.8  2.0   0.4   

    BMI CLASS  
0  24.0     N  
1  23.0     N  
2  24.0     N  
3  24.0     N  
4  21.0     N  


In [None]:
# Count the missing (NaN/None) entries in each column of the loaded frame.
df_diabetes.isna().sum()

Unnamed: 0,0
ID,0
No_Pation,0
Gender,0
AGE,0
Urea,0
Cr,0
HbA1c,0
Chol,0
TG,0
HDL,0


# **Data Cleaning**

In [None]:
import numpy as np

# Split the column labels by dtype: every numeric dtype goes to numeric_cols,
# every object (string) dtype goes to categorical_cols.
# NOTE(review): the preview of this frame shows ID and No_Pation, which look
# like record identifiers; being numeric they land in numeric_cols and are
# cleaned/scaled like measurements further down -- confirm that is intended.
numeric_cols = df_diabetes.select_dtypes(include=[np.number]).columns
categorical_cols = df_diabetes.select_dtypes(include=["object"]).columns


Handling Missing Values (zeros in numeric columns are treated as missing and filled with the column median)

In [None]:
# Treat every literal 0 in the numeric columns as "missing" and impute each
# gap with that column's median. The median is taken after the zeros have
# been blanked out, so the zeros themselves do not influence it.
# NOTE(review): this applies to every numeric column, so a legitimate 0 would
# be overwritten as well -- confirm 0 never occurs as a real value here.
df_diabetes[numeric_cols] = (
    df_diabetes[numeric_cols]
    .replace(0, np.nan)
    .pipe(lambda blanked: blanked.fillna(blanked.median()))
)


Handling Categorical Data (Label Encoding)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode each categorical column as integer codes.
#
# A separate LabelEncoder is fitted per column and kept in
# `diabetes_label_encoders` (column name -> fitted encoder). Re-fitting one
# shared encoder, as before, discards the class mapping of every column except
# the last, so the codes could not be decoded with `inverse_transform` or
# applied consistently to new data.
diabetes_label_encoders = {}

# Keeps `encoder` defined even if there are no categorical columns; after the
# loop it refers to the encoder of the last column, as it did previously.
encoder = LabelEncoder()
for col in categorical_cols:
    encoder = LabelEncoder()
    df_diabetes[col] = encoder.fit_transform(df_diabetes[col])
    diabetes_label_encoders[col] = encoder


Handling Outliers (IQR Method)

In [None]:
# Quartiles and inter-quartile range of every numeric column.
Q1 = df_diabetes[numeric_cols].quantile(q=0.25)
Q3 = df_diabetes[numeric_cols].quantile(q=0.75)
IQR = Q3 - Q1

# Fences at 1.5 * IQR below Q1 and above Q3; a value outside them counts as
# an outlier.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Keep only the rows in which no numeric column falls outside its fences.
# The surviving rows keep their original index labels.
df_diabetes = df_diabetes.loc[
    ~(
        df_diabetes[numeric_cols].lt(lower_fence)
        | df_diabetes[numeric_cols].gt(upper_fence)
    ).any(axis=1)
]


Min-Max Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every numeric column to the [0, 1] range (MinMaxScaler default).
minmax = MinMaxScaler()

# fit_transform returns a bare NumPy array, so the row labels have to be
# re-attached explicitly. Without `index=` the result would get a fresh
# 0..n-1 index, while df_diabetes keeps its original (gapped) labels after the
# outlier rows were removed -- any later join/assignment between the two
# frames would then silently misalign rows.
df_diabetes_minmax = pd.DataFrame(
    minmax.fit_transform(df_diabetes[numeric_cols]),
    columns=numeric_cols,
    index=df_diabetes.index,
)


Standard Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every numeric column (StandardScaler: zero mean, unit variance).
standard = StandardScaler()

# fit_transform returns a bare NumPy array, so the row labels have to be
# re-attached explicitly. Without `index=` the result would get a fresh
# 0..n-1 index, while df_diabetes keeps its original (gapped) labels after the
# outlier rows were removed -- any later join/assignment between the two
# frames would then silently misalign rows.
df_diabetes_standard = pd.DataFrame(
    standard.fit_transform(df_diabetes[numeric_cols]),
    columns=numeric_cols,
    index=df_diabetes.index,
)


# **Import Libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler


# **Load Adult Income Dataset**

In [None]:
# Read the raw Adult Income records from the CSV next to the notebook.
df_adult = pd.read_csv(filepath_or_buffer="adult.csv")

# Print the first five rows, then the full list of column labels.
print(df_adult.head(n=5), df_adult.columns, sep="\n")

   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0             0   

   hours-per-week native-country incom

Separate Numeric & Categorical Columns

In [None]:
# Split the column labels by dtype: every numeric dtype goes to numeric_cols,
# every object (string) dtype goes to categorical_cols. From here on these two
# names refer to the Adult frame, replacing any earlier definition.
numeric_cols = df_adult.select_dtypes(include=[np.number]).columns
categorical_cols = df_adult.select_dtypes(include=["object"]).columns


Handling Missing Values

In [None]:
# Turn every cell that is exactly "?" into NaN so that the regular
# missing-value tooling (isna / fillna) recognises it.
df_adult = df_adult.replace(to_replace="?", value=np.nan)


In [None]:
# Numeric columns: fill each gap with that column's median.
numeric_medians = df_adult[numeric_cols].median()
df_adult[numeric_cols] = df_adult[numeric_cols].fillna(numeric_medians)

# Categorical columns: fill each gap with that column's most frequent value
# (the first entry returned by mode() when several values tie).
for col in categorical_cols:
    most_frequent = df_adult[col].mode()[0]
    df_adult[col] = df_adult[col].fillna(most_frequent)


Handling Categorical Data (Label Encoding)

In [None]:
# Encode each categorical column as integer codes.
#
# A separate LabelEncoder is fitted per column and kept in
# `adult_label_encoders` (column name -> fitted encoder). Re-fitting one
# shared encoder, as before, discards the class mapping of every column except
# the last, so the codes could not be decoded with `inverse_transform` or
# applied consistently to new data.
adult_label_encoders = {}

# Keeps `encoder` defined even if there are no categorical columns; after the
# loop it refers to the encoder of the last column, as it did previously.
encoder = LabelEncoder()

for col in categorical_cols:
    encoder = LabelEncoder()
    df_adult[col] = encoder.fit_transform(df_adult[col])
    adult_label_encoders[col] = encoder


Handling Outliers (IQR Method)

In [None]:
# Quartiles and inter-quartile range of every numeric column.
Q1 = df_adult[numeric_cols].quantile(0.25)
Q3 = df_adult[numeric_cols].quantile(0.75)
IQR = Q3 - Q1

# Only columns with a non-zero IQR take part in outlier detection. When
# Q1 == Q3 both fences collapse onto that single value, so every row holding
# anything else would be discarded and the column would end up constant.
# NOTE(review): capital-gain / capital-loss appear to be mostly 0 in the data
# preview, which would trigger exactly that -- confirm against the full data.
outlier_cols = IQR[IQR > 0].index

# Fences at 1.5 * IQR below Q1 and above Q3, restricted to those columns.
lower_fence = (Q1 - 1.5 * IQR)[outlier_cols]
upper_fence = (Q3 + 1.5 * IQR)[outlier_cols]

# Drop every row in which at least one checked column lies outside its fences.
# With no eligible columns the mask is all-False and every row is kept.
is_outlier_row = (
    (df_adult[outlier_cols] < lower_fence) |
    (df_adult[outlier_cols] > upper_fence)
).any(axis=1)

df_adult = df_adult[~is_outlier_row]


Min-Max Scaling (Normalization)

In [None]:
# Rescale every numeric column to the [0, 1] range (MinMaxScaler default).
minmax = MinMaxScaler()

# fit_transform returns a bare NumPy array, so the row labels have to be
# re-attached explicitly. Without `index=` the result would get a fresh
# 0..n-1 index, while df_adult keeps its original (gapped) labels after the
# outlier rows were removed -- any later join/assignment between the two
# frames would then silently misalign rows.
df_adult_minmax = pd.DataFrame(
    minmax.fit_transform(df_adult[numeric_cols]),
    columns=numeric_cols,
    index=df_adult.index,
)


Standard Scaling

In [None]:
# Standardize every numeric column (StandardScaler: zero mean, unit variance).
standard = StandardScaler()

# fit_transform returns a bare NumPy array, so the row labels have to be
# re-attached explicitly. Without `index=` the result would get a fresh
# 0..n-1 index, while df_adult keeps its original (gapped) labels after the
# outlier rows were removed -- any later join/assignment between the two
# frames would then silently misalign rows.
df_adult_standard = pd.DataFrame(
    standard.fit_transform(df_adult[numeric_cols]),
    columns=numeric_cols,
    index=df_adult.index,
)
