In [2]:
import pandas as pd
# Read Diabetes.csv into a DataFrame; the bare name on the last line makes the
# notebook render the frame as this cell's output.
csv_path = 'Diabetes.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,502,17975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
1,735,34221,M,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N
2,420,47975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
3,680,87656,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
4,504,34223,M,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,200,454317,M,71,11.0,97,7.0,7.5,1.7,1.2,1.8,0.6,30.0,Y
996,671,876534,M,31,3.0,60,12.3,4.1,2.2,0.7,2.4,15.4,37.2,Y
997,669,87654,M,30,7.1,81,6.7,4.1,1.1,1.2,2.4,8.1,27.4,Y
998,99,24004,M,38,5.8,59,6.7,5.3,2.0,1.6,2.9,14.0,40.5,Y


In [12]:
# Per-column null tally, and the same tally expressed as a percentage of all rows.
null_mask = df.isnull()
missing_counts = null_mask.sum()
missing_percentages = (missing_counts / len(df)) * 100

# Summary table with one row per column of df.
missing_info = pd.DataFrame({
    'Missing Count': missing_counts,
    'Missing Percentage': missing_percentages
})

# Report only the columns that contain at least one missing value.
has_missing = missing_info['Missing Count'] > 0
print("Missing Values Information:")
print(missing_info[has_missing])

Missing Values Information:
Empty DataFrame
Columns: [Missing Count, Missing Percentage]
Index: []


In [4]:
# List the dtype of every column in df.
column_dtypes = df.dtypes
print(column_dtypes)

ID             int64
No_Pation      int64
Gender        object
AGE            int64
Urea         float64
Cr             int64
HbA1c        float64
Chol         float64
TG           float64
HDL          float64
LDL          float64
VLDL         float64
BMI          float64
CLASS         object
dtype: object


In [5]:
# Columns stored with the generic 'object' dtype are the categorical candidates.
categorical_cols = list(df.select_dtypes(include='object').columns)

# Print the cardinality and the distinct raw values of each candidate so they
# can be reviewed before encoding.
print("Categorical columns identified based on 'object' dtype:")
for col in categorical_cols:
    column_values = df[col]
    print(f"Column '{col}': {column_values.nunique()} unique values - {column_values.unique()}")

# Explicit, hand-written list of the categorical features kept after review.
categorical_features = ['Gender', 'CLASS']
print(f"\nFinal list of categorical features: {categorical_features}")

Categorical columns identified based on 'object' dtype:
Column 'Gender': 3 unique values - ['F' 'M' 'f']
Column 'CLASS': 5 unique values - ['N' 'N ' 'P' 'Y' 'Y ']

Final list of categorical features: ['Gender', 'CLASS']


In [6]:
# Normalise the two categorical columns so that spelling variants collapse into
# a single label each. Both columns get the same treatment (trim surrounding
# whitespace, then upper-case): fixing only the case of 'Gender' and only the
# whitespace of 'CLASS' would let values such as 'M ' or 'n' survive as
# separate categories.
for column_name in ('Gender', 'CLASS'):
    df[column_name] = df[column_name].str.strip().str.upper()

# Show the distinct labels that remain so the clean-up can be verified by eye.
print("Cleaned 'Gender' unique values:", df['Gender'].unique())
print("Cleaned 'CLASS' unique values:", df['CLASS'].unique())

Cleaned 'Gender' unique values: ['F' 'M']
Cleaned 'CLASS' unique values: ['N' 'P' 'Y']


In [7]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns that will be replaced by indicator columns
categorical_cols_to_encode = ['Gender', 'CLASS']

# Dense-output encoder; drop='first' omits the first category of each column
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Fit on the selected columns and encode them in a single step
encoded_features = encoder.fit_transform(df[categorical_cols_to_encode])

# Names of the generated indicator columns
encoded_feature_names = encoder.get_feature_names_out(categorical_cols_to_encode)

# Wrap the encoded array in a DataFrame that shares df's index so rows line up
encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names, index=df.index)

# Swap the original categorical columns for their encoded counterparts:
# remove the originals, then append the indicator columns on the right
df = pd.concat([df.drop(columns=categorical_cols_to_encode), encoded_df], axis=1)

print("DataFrame after one-hot encoding and dropping original categorical columns:")
print(df.head())

DataFrame after one-hot encoding and dropping original categorical columns:
    ID  No_Pation  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL   BMI  \
0  502      17975   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5  24.0   
1  735      34221   26   4.5  62    4.9   3.7  1.4  1.1  2.1   0.6  23.0   
2  420      47975   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5  24.0   
3  680      87656   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5  24.0   
4  504      34223   33   7.1  46    4.9   4.9  1.0  0.8  2.0   0.4  21.0   

   Gender_M  CLASS_P  CLASS_Y  
0       0.0      0.0      0.0  
1       1.0      0.0      0.0  
2       0.0      0.0      0.0  
3       0.0      0.0      0.0  
4       1.0      0.0      0.0  


In [8]:
# Every numeric column currently in df, in frame order
numeric_column_index = df.select_dtypes(include='number').columns
numerical_cols = numeric_column_index.tolist()

# Identifier columns and one-hot encoded columns are left out of outlier detection
excluded_cols = ['ID', 'No_Pation', 'Gender_M', 'CLASS_P', 'CLASS_Y']

# Keep the remaining numeric columns, preserving their original order
is_excluded = numeric_column_index.isin(excluded_cols)
final_numerical_cols = numeric_column_index[~is_excluded].tolist()

print("Numerical columns for outlier detection:")
print(final_numerical_cols)

Numerical columns for outlier detection:
['AGE', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI']


In [10]:
# Count, for each numerical column, the rows lying outside the IQR fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. Nothing in df is modified here.
outliers_summary = {}

for col in final_numerical_cols:
    column_values = df[col]

    # Quartiles and the fences derived from them
    Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # A row is an outlier when its value falls strictly below the lower fence
    # or strictly above the upper fence.
    below_fence = column_values < lower_bound
    above_fence = column_values > upper_bound
    outliers = df[below_fence | above_fence]
    outliers_summary[col] = len(outliers)

print("Number of outliers per numerical column (using IQR method):")
for name, n_outliers in outliers_summary.items():
    print(f"Column '{name}': {n_outliers} outliers")

Number of outliers per numerical column (using IQR method):
Column 'AGE': 98 outliers
Column 'Urea': 65 outliers
Column 'Cr': 52 outliers
Column 'HbA1c': 6 outliers
Column 'Chol': 27 outliers
Column 'TG': 55 outliers
Column 'HDL': 50 outliers
Column 'LDL': 11 outliers
Column 'VLDL': 74 outliers
Column 'BMI': 3 outliers


In [11]:
# Cap every numerical column at its IQR fences: values below Q1 - 1.5 * IQR are
# raised to that fence and values above Q3 + 1.5 * IQR are lowered to it.
# The columns of df are overwritten with the capped values.
for col in final_numerical_cols:
    column_values = df[col]

    Q1 = column_values.quantile(0.25)
    Q3 = column_values.quantile(0.75)
    IQR = Q3 - Q1

    # Distance from each quartile to its fence
    whisker = 1.5 * IQR
    lower_bound = Q1 - whisker
    upper_bound = Q3 + whisker

    df[col] = column_values.clip(lower=lower_bound, upper=upper_bound)

# Summary statistics of the capped columns, to confirm the new min/max values
print("Descriptive statistics for numerical columns after outlier capping:")
capped_stats = df[final_numerical_cols].describe()
print(capped_stats)

Descriptive statistics for numerical columns after outlier capping:
               AGE         Urea           Cr        HbA1c         Chol  \
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000   
mean     53.986000     4.826843    62.345000     8.280960     4.843420   
std       7.363968     1.714231    20.297906     2.532224     1.210029   
min      39.000000     0.700000    10.500000     0.950000     1.600000   
25%      51.000000     3.700000    48.000000     6.500000     4.000000   
50%      55.000000     4.600000    60.000000     8.000000     4.800000   
75%      59.000000     5.700000    73.000000    10.200000     5.600000   
max      71.000000     8.700000   110.500000    15.750000     8.000000   

                TG          HDL          LDL        VLDL          BMI  
count  1000.000000  1000.000000  1000.000000  1000.00000  1000.000000  
mean      2.280610     1.142250     2.591640     1.14040    29.566770  
std       1.150887     0.348675     1.039511     