In [6]:
import io

from google.colab import files
import pandas as pd


def _read_uploaded_csv(uploaded_files):
    """Parse the first file of a ``files.upload()`` result as CSV.

    Reading the uploaded bytes directly (instead of re-opening a hard-coded
    path) guarantees the file that was just uploaded is the one parsed, even
    when Colab stores it under a de-duplicated name such as
    ``"diabetes (2).csv"`` because an older copy already exists.

    Parameters
    ----------
    uploaded_files : dict
        Mapping of file name to raw file content (bytes), as returned by
        ``google.colab.files.upload()``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content.

    Raises
    ------
    ValueError
        If nothing was uploaded (e.g. the upload dialog was cancelled).
    """
    if not uploaded_files:
        raise ValueError("No file was uploaded; cannot load the dataset.")
    # Only the first uploaded file is used; any additional files are ignored.
    _, content = next(iter(uploaded_files.items()))
    return pd.read_csv(io.BytesIO(content))


# First upload prompt: the diabetes CSV.
uploaded = files.upload()
diabetes = _read_uploaded_csv(uploaded)

# Second upload prompt: the adult CSV.
uploaded = files.upload()
adult = _read_uploaded_csv(uploaded)

print("Diabetes dataset shape:", diabetes.shape)
print("Adult dataset shape:", adult.shape)


Saving diabetes.csv to diabetes (2).csv


Saving adult.csv to adult.csv
Diabetes dataset shape: (1000, 14)
Adult dataset shape: (48842, 15)


In [7]:
import numpy as np
import pandas as pd

# --- Diabetes: mean-impute numeric gaps, then drop IQR outliers -------------
# NOTE: this cell rebinds `diabetes`; re-running it without reloading the data
# applies the outlier filter to the already-filtered frame.

# select_dtypes() only returns columns that are already numeric, so no extra
# pd.to_numeric() coercion is needed before imputing.
numeric_cols_diabetes = diabetes.select_dtypes(include=np.number).columns

# Replace missing numeric values with the mean of their own column.
diabetes[numeric_cols_diabetes] = diabetes[numeric_cols_diabetes].fillna(
    diabetes[numeric_cols_diabetes].mean()
)

df_numeric = diabetes[numeric_cols_diabetes]
df_non_numeric = diabetes.drop(columns=numeric_cols_diabetes, errors='ignore')

# Per-column fences at 1.5 * IQR below Q1 and above Q3.
Q1 = df_numeric.quantile(0.25)
Q3 = df_numeric.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is an outlier as soon as ANY numeric column falls outside its fences.
# NOTE(review): every numeric column takes part in this test, including
# identifier-like columns if the data has any — confirm that is intended.
outlier_mask = ((df_numeric < lower_bound) | (df_numeric > upper_bound)).any(axis=1)

diabetes_filtered_numeric = df_numeric[~outlier_mask]
diabetes_filtered_non_numeric = df_non_numeric[~outlier_mask]

# Re-assemble the frame; numeric columns come first, then the remaining ones.
diabetes = pd.concat([diabetes_filtered_numeric, diabetes_filtered_non_numeric], axis=1)


print("Diabetes dataset after cleaning:", diabetes.shape)

# --- Adult: label missing categories, then one-hot encode -------------------

# Only non-numeric columns receive the "Unknown" placeholder. Filling the whole
# frame would write a string into any numeric column that has a gap, turning
# it into an object column that get_dummies() would then one-hot encode.
# Missing values in numeric columns are left untouched here.
non_numeric_cols_adult = adult.select_dtypes(exclude=np.number).columns
if len(non_numeric_cols_adult) > 0:
    adult[non_numeric_cols_adult] = adult[non_numeric_cols_adult].fillna("Unknown")

# drop_first=True drops the first level of each encoded column.
adult = pd.get_dummies(adult, drop_first=True)

print("Adult dataset after cleaning:", adult.shape)

Diabetes dataset after cleaning: (629, 14)
Adult dataset after cleaning: (48842, 101)


In [8]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pandas as pd
import numpy as np

def _as_frame(values, like):
    """Wrap scaled ``values`` in a DataFrame labelled like ``like``.

    Parameters
    ----------
    values : array-like
        Scaled data with the same shape as ``like``.
    like : pandas.DataFrame
        Frame whose column labels and row index are reused, so scaled rows can
        still be aligned with the unscaled data.

    Returns
    -------
    pandas.DataFrame
    """
    return pd.DataFrame(values, columns=like.columns, index=like.index)


# One scaler per dataset and per method. fit_transform() re-fits the estimator
# on every call, so sharing a single instance across both datasets would leave
# only the last dataset's statistics available for later transform() /
# inverse_transform() calls.
diabetes_scaler_minmax = MinMaxScaler()
diabetes_scaler_standard = StandardScaler()
adult_scaler_minmax = MinMaxScaler()
adult_scaler_standard = StandardScaler()

# Backward-compatible names: previously these shared scalers ended the cell
# fitted on the adult data, so they keep pointing at the adult scalers.
scaler_minmax = adult_scaler_minmax
scaler_standard = adult_scaler_standard

# --- Diabetes ---------------------------------------------------------------
# NOTE(review): all numeric columns are scaled, including identifier-like
# columns if the data has any — confirm that is intended.
diabetes_numeric_cols = diabetes.select_dtypes(include=np.number)

diabetes_minmax_scaled = diabetes_scaler_minmax.fit_transform(diabetes_numeric_cols)
diabetes_standard_scaled = diabetes_scaler_standard.fit_transform(diabetes_numeric_cols)

diabetes_minmax_df = _as_frame(diabetes_minmax_scaled, diabetes_numeric_cols)
diabetes_standard_df = _as_frame(diabetes_standard_scaled, diabetes_numeric_cols)

# --- Adult ------------------------------------------------------------------
adult_numeric_cols = adult.select_dtypes(include=np.number)

adult_minmax_scaled = adult_scaler_minmax.fit_transform(adult_numeric_cols)
adult_standard_scaled = adult_scaler_standard.fit_transform(adult_numeric_cols)

adult_minmax_df = _as_frame(adult_minmax_scaled, adult_numeric_cols)
adult_standard_df = _as_frame(adult_standard_scaled, adult_numeric_cols)

print("Diabetes MinMax scaled:\n", diabetes_minmax_df.head())
print("Adult Standard scaled:\n", adult_standard_df.head())

Diabetes MinMax scaled:
           ID  No_Pation       AGE      Urea        Cr     HbA1c      Chol  \
5   0.791980   0.451603  0.193548  0.157895  0.023810  0.153846  0.152542   
6   0.901003   0.451616  0.354839  0.118421  0.333333  0.153846  0.271186   
7   0.525063   0.451643  0.290323  0.473684  0.297619  0.153846  0.152542   
13  0.162907   0.451736  0.193548  0.460526  0.380952  0.238462  0.372881   
14  0.501253   0.451749  0.354839  0.315789  0.202381  0.153846  0.338983   

          TG       HDL       LDL      VLDL   BMI  
5   0.148936  0.400000  0.183673  0.142857  0.10  
6   0.212766  0.333333  0.306122  0.238095  0.25  
7   0.106383  0.333333  0.204082  0.142857  0.25  
13  0.297872  0.533333  0.326531  0.333333  0.20  
14  0.255319  0.533333  0.326531  0.285714  0.25  
Adult Standard scaled:
         age    fnlwgt  educational-num  capital-gain  capital-loss  \
0 -0.995129  0.351675        -1.197259     -0.144804     -0.217127   
1 -0.046942 -0.945524        -0.419335    