***Data Preprocessing***

**Load the data**


In [1]:
import pandas as pd
import numpy as np

# Load the data
data = pd.read_csv('/content/hospital_data.csv')

#print head of the data set
print ("\nHead of data set : ",data.head())


Head of data set :       Patient_ID  Age  Gender  HbA1c_Level Readmitted
0  DMC 992/2019   36  Female          6.5        Yes
1  DMC 392/2016   58    Male          7.9        Yes
2  DMC 905/2019   47    Male          7.2        Yes
3  DMC 587/2019   44    Male          6.3         No
4  DMC 611/2011   32    Male          8.5        Yes


**Cleaning:** *Remove any inconsistencies or errors in the data.*

Remove incorrect/incomplete data

In [2]:
# Drop any rows with missing data
data_cleaned = data.dropna()
print(data_cleaned)


      Patient_ID  Age  Gender  HbA1c_Level Readmitted
0   DMC 992/2019   36  Female          6.5        Yes
1   DMC 392/2016   58    Male          7.9        Yes
2   DMC 905/2019   47    Male          7.2        Yes
3   DMC 587/2019   44    Male          6.3         No
4   DMC 611/2011   32    Male          8.5        Yes
..           ...  ...     ...          ...        ...
80   ENC/1810118   44    Male          8.9         No
81   ENC/1925018   61  Female          8.4        Yes
82   DMC/1303115   44    Male          7.1        Yes
83   ENC/2015220   32    Male          6.7         No
84   DMC/2050918   27  Female          7.4        Yes

[85 rows x 5 columns]


Binning Smoothing by mean/median/boundary

In [3]:
#Binning Smoothing by mean/median/boundary (on HbA1c_Level)

data_cleaned = data_cleaned.copy()

#categorized the HbA1c_level as "Normal","Pre-diabetes", "Diabets"
bins = [0, 5.6, 6.5, float('inf')]
labels = ['Normal', 'Pre-diabetes', 'Diabetes']
data_cleaned['HbA1c_Level_Binned'] = pd.cut(data_cleaned['HbA1c_Level'], bins=bins, labels=labels)

# Convert categorical to numerical values
label_mapping = { 'Normal': 0, 'Pre-diabetes': 1, 'Diabetes': 2}
data_cleaned['HbA1c_Level_Binned_Numeric'] = data_cleaned['HbA1c_Level_Binned'].map(label_mapping)

# Drop HbA1c_Level_Binned column
data_cleaned.drop(columns=['HbA1c_Level_Binned'], inplace=True)

print(data_cleaned)

      Patient_ID  Age  Gender  HbA1c_Level Readmitted  \
0   DMC 992/2019   36  Female          6.5        Yes   
1   DMC 392/2016   58    Male          7.9        Yes   
2   DMC 905/2019   47    Male          7.2        Yes   
3   DMC 587/2019   44    Male          6.3         No   
4   DMC 611/2011   32    Male          8.5        Yes   
..           ...  ...     ...          ...        ...   
80   ENC/1810118   44    Male          8.9         No   
81   ENC/1925018   61  Female          8.4        Yes   
82   DMC/1303115   44    Male          7.1        Yes   
83   ENC/2015220   32    Male          6.7         No   
84   DMC/2050918   27  Female          7.4        Yes   

   HbA1c_Level_Binned_Numeric  
0                           1  
1                           2  
2                           2  
3                           1  
4                           2  
..                        ...  
80                          2  
81                          2  
82                         

Regression

In [4]:
from sklearn.linear_model import LinearRegression

#predict HbA1c_Level based on Age and Gender
data_reg = data_cleaned[['Age', 'Gender', 'HbA1c_Level']].copy()
data_reg['Gender_numeric'] = data_reg['Gender'].apply(lambda x: 1 if x == 'Female' else 0)

X = data_reg[['Age', 'Gender_numeric']]
y = data_reg['HbA1c_Level']

#  regression model
model = LinearRegression().fit(X, y)

Clustering

In [5]:
from sklearn.cluster import KMeans

#clusters on Age and HbA1c_Level
kmeans = KMeans(n_clusters=2)
data_cleaned['Cluster'] = kmeans.fit_predict(data_cleaned[['Age', 'HbA1c_Level']])

**Handle missing values** (median, mean, and mode).

In [6]:
data['Age'] = data['Age'].fillna(data['Age'].median())
data['HbA1c_Level'] = data['HbA1c_Level'].fillna(data['HbA1c_Level'].median())

print(data)

      Patient_ID  Age  Gender  HbA1c_Level Readmitted
0   DMC 992/2019   36  Female          6.5        Yes
1   DMC 392/2016   58    Male          7.9        Yes
2   DMC 905/2019   47    Male          7.2        Yes
3   DMC 587/2019   44    Male          6.3         No
4   DMC 611/2011   32    Male          8.5        Yes
..           ...  ...     ...          ...        ...
80   ENC/1810118   44    Male          8.9         No
81   ENC/1925018   61  Female          8.4        Yes
82   DMC/1303115   44    Male          7.1        Yes
83   ENC/2015220   32    Male          6.7         No
84   DMC/2050918   27  Female          7.4        Yes

[85 rows x 5 columns]


**Reduction**

Dimensionality reduction

In [7]:
from sklearn.decomposition import PCA

#reducing the dimensions of 'Age', 'HbA1c_Level'
pca = PCA(n_components=1)
data_cleaned['PCA_Age_HbA1c'] = pca.fit_transform(data_cleaned[['Age', 'HbA1c_Level']])

Attribute subset selection

In [8]:
#Attribute Subset Selection some columns

data_reduced = data_cleaned[['Age', 'Gender', 'HbA1c_Level','HbA1c_Level_Binned_Numeric', 'Readmitted', 'Cluster']]

Numerosity Reduction - Sampling or Modeling

In [9]:
#Data reduction
data_Redu= data_reduced.sample(frac=0.5, random_state=42)

print("Reduced Data ")
print(data_Redu)

Reduced Data 
    Age  Gender  HbA1c_Level HbA1c_Level_Binned_Numeric Readmitted  Cluster
78   46  Female         10.7                          2         No        0
0    36  Female          6.5                          1        Yes        1
68   52  Female         10.2                          2         No        0
22   50    Male          8.3                          2         No        0
12   63  Female          8.1                          2        Yes        0
82   44    Male          7.1                          2        Yes        1
10   28  Female          7.2                          2        Yes        1
18   27  Female          5.8                          1        Yes        1
4    32    Male          8.5                          2        Yes        1
66   62  Female          7.9                          2        Yes        0
40   32    Male          5.2                          0         No        1
47   53  Female          9.2                          2        Yes        

**Transformation:** Convert data into a suitable format for analysis.

Normalization

In [10]:
# Normalize 'Age' and 'HbA1c_Level'
data_Redu['Age_Normalized'] = (data_Redu['Age'] - data_Redu['Age'].min()) / (data_Redu['Age'].max() - data_Redu['Age'].min())
data_Redu['HbA1c_Level'] = (data_Redu['HbA1c_Level'] - data_Redu['HbA1c_Level'].min()) / (data_Redu['HbA1c_Level'].max() - data_Redu['HbA1c_Level'].min())

Feature selection and Feature Engineering

In [11]:
# Convert Gender to numeric
data_Redu['Gender_numeric'] = data_Redu['Gender'].apply(lambda x: 1 if x == 'Female' else 0)

# Convert "Readmitted" column to numeric
data_Redu['Readmitted_numeric'] = data_Redu['Readmitted'].apply(lambda x: 1 if x == 'Yes' else 0)

data_Redu.drop(columns=['Gender', 'Readmitted'], inplace=True)

print(data_Redu)

    Age  HbA1c_Level HbA1c_Level_Binned_Numeric  Cluster  Age_Normalized  \
78   46     0.793893                          2        0        0.608696   
0    36     0.473282                          1        1        0.391304   
68   52     0.755725                          2        0        0.739130   
22   50     0.610687                          2        0        0.695652   
12   63     0.595420                          2        0        0.978261   
82   44     0.519084                          2        1        0.565217   
10   28     0.526718                          2        1        0.217391   
18   27     0.419847                          1        1        0.195652   
4    32     0.625954                          2        1        0.304348   
66   62     0.580153                          2        0        0.956522   
40   32     0.374046                          0        1        0.304348   
47   53     0.679389                          2        0        0.760870   
35   27     

Discretization

In [12]:
age_bins = [0, 30, 50, 100]
age_labels = ['Young', 'Middle-aged', 'Senior']
data_Redu['Age_Group'] = pd.cut(data_Redu['Age'], bins=age_bins, labels=age_labels)

# Age_Group to numeric values
age_group_mapping = {'Young': 0, 'Middle-aged': 1, 'Senior': 2}
data_Redu['Age_Group_Numeric'] = data_Redu['Age_Group'].map(age_group_mapping)

data_Redu.drop(columns=['Age_Group'], inplace=True)

print(data_Redu)

    Age  HbA1c_Level HbA1c_Level_Binned_Numeric  Cluster  Age_Normalized  \
78   46     0.793893                          2        0        0.608696   
0    36     0.473282                          1        1        0.391304   
68   52     0.755725                          2        0        0.739130   
22   50     0.610687                          2        0        0.695652   
12   63     0.595420                          2        0        0.978261   
82   44     0.519084                          2        1        0.565217   
10   28     0.526718                          2        1        0.217391   
18   27     0.419847                          1        1        0.195652   
4    32     0.625954                          2        1        0.304348   
66   62     0.580153                          2        0        0.956522   
40   32     0.374046                          0        1        0.304348   
47   53     0.679389                          2        0        0.760870   
35   27     

Create New csv file include preprocessed data

In [13]:
# Save the preprocessed data to a new CSV file
data_Redu.to_csv('preprocessed_hospital_data.csv', index=False)

print("Final Transformed Data saved to 'preprocessed_hospital_data.csv'.")
print(data_Redu.head())

Final Transformed Data saved to 'preprocessed_hospital_data.csv'.
    Age  HbA1c_Level HbA1c_Level_Binned_Numeric  Cluster  Age_Normalized  \
78   46     0.793893                          2        0        0.608696   
0    36     0.473282                          1        1        0.391304   
68   52     0.755725                          2        0        0.739130   
22   50     0.610687                          2        0        0.695652   
12   63     0.595420                          2        0        0.978261   

    Gender_numeric  Readmitted_numeric Age_Group_Numeric  
78               1                   0                 1  
0                1                   1                 1  
68               1                   0                 2  
22               0                   0                 1  
12               1                   1                 2  
