In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,MinMaxScaler

In [3]:
# Load the modified Pima Indian diabetes table; the path is relative to the
# notebook's working directory.
df = pd.read_csv("diabetes_modified.csv")

#### Data dictionary for the Pima Indian Diabetes dataset:

| Features            | Description                                                                                            |
|----------------------------------------------|--------------------------------------------------------------------------------------------------------|
| Pregnancies                | Number of pregnancies                                                                                   |
| Glucose                    | Plasma glucose concentration at 2 hours in an oral glucose tolerance test                               |
| BloodPressure              | Diastolic blood pressure (mm Hg)                                                                       |
| SkinThickness              | Triceps skin fold thickness (mm)                                                                       |
| Insulin                    | 2-Hour serum insulin (mu U/ml)                                                                         |
| BMI                        | Body mass index (weight in kg/(height in m)^2)                                 |
|DiabetesPedigreeFunction   |  Measure used to assess the hereditary risk of diabetes based on family history. |
| Age                        | Age (years)                                                                                            |
| Outcome                    | Class indicating whether the individual has diabetes or not                  |



# Overall picture


In [4]:
# Peek at the first rows to confirm the CSV parsed into the expected columns.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,Diabetic
1,1.0,85.0,66.0,29.0,0.0,26.6,,31.0,Non-Diabetic
2,8.0,183.0,64.0,0.0,0.0,23.3,,32.0,Diabetic
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,Non-Diabetic
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,Diabetic


In [5]:
# Look at the last rows as well, e.g. to spot trailing junk rows in the file.
df.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63.0,Non-Diabetic
764,2.0,122.0,70.0,27.0,0.0,36.8,0.34,27.0,Non-Diabetic
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30.0,Non-Diabetic
766,1.0,126.0,60.0,0.0,0.0,30.1,0.349,47.0,Diabetic
767,1.0,93.0,70.0,31.0,0.0,30.4,0.315,23.0,Non-Diabetic


In [6]:
# Dtypes and non-null counts per column: any count below the row total means
# that column has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               762 non-null    float64
 1   Glucose                   751 non-null    float64
 2   BloodPressure             751 non-null    float64
 3   SkinThickness             753 non-null    float64
 4   Insulin                   765 non-null    float64
 5   BMI                       739 non-null    float64
 6   DiabetesPedigreeFunction  739 non-null    float64
 7   Age                       766 non-null    float64
 8   Outcome                   768 non-null    object 
dtypes: float64(8), object(1)
memory usage: 54.1+ KB


In [7]:
# Summary statistics for the numeric columns.
# NOTE(review): the minimum of Glucose, BloodPressure and BMI is 0.0, which
# looks like a placeholder for "not measured" rather than a real reading —
# confirm before treating zeros as valid values.
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,762.0,751.0,751.0,753.0,765.0,739.0,739.0,766.0
mean,3.824147,120.719041,69.10253,20.540505,79.905882,32.032882,0.471766,33.227154
std,3.360596,31.958175,19.282846,15.912954,115.43134,7.901092,0.326533,11.755153
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.35,0.245,24.0
50%,3.0,117.0,72.0,23.0,29.0,32.3,0.375,29.0
75%,6.0,140.0,80.0,32.0,128.0,36.6,0.6215,41.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0


In [8]:
# Row labels: shows whether the frame still uses the default 0..n-1 RangeIndex.
df.index

RangeIndex(start=0, stop=768, step=1)

In [9]:
# Column labels as a pandas Index.
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [10]:
# (number of rows, number of columns)
df.shape

(768, 9)

In [11]:
# Per-column dtypes; at this point 'Outcome' is still a generic object column.
df.dtypes

Pregnancies                 float64
Glucose                     float64
BloodPressure               float64
SkinThickness               float64
Insulin                     float64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                         float64
Outcome                      object
dtype: object

In [12]:
# Store the 'Outcome' label as a pandas categorical, then re-check the dtypes
# to confirm the conversion took effect.
df = df.astype({'Outcome': 'category'})
df.dtypes

Pregnancies                  float64
Glucose                      float64
BloodPressure                float64
SkinThickness                float64
Insulin                      float64
BMI                          float64
DiabetesPedigreeFunction     float64
Age                          float64
Outcome                     category
dtype: object

In [13]:
# The same column labels, exposed as a plain NumPy array instead of an Index.
df.columns.values

array(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
       'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype=object)

In [14]:
# Show the frame with its columns sorted by label. The sorted frame is only
# displayed, not assigned, so df keeps its original column order.
df.sort_index(axis=1)

Unnamed: 0,Age,BMI,BloodPressure,DiabetesPedigreeFunction,Glucose,Insulin,Outcome,Pregnancies,SkinThickness
0,50.0,33.6,72.0,0.627,148.0,0.0,Diabetic,6.0,35.0
1,31.0,26.6,66.0,,85.0,0.0,Non-Diabetic,1.0,29.0
2,32.0,23.3,64.0,,183.0,0.0,Diabetic,8.0,0.0
3,21.0,28.1,66.0,0.167,89.0,94.0,Non-Diabetic,1.0,23.0
4,33.0,43.1,40.0,2.288,137.0,168.0,Diabetic,0.0,35.0
...,...,...,...,...,...,...,...,...,...
763,63.0,32.9,76.0,0.171,101.0,180.0,Non-Diabetic,10.0,48.0
764,27.0,36.8,70.0,0.340,122.0,0.0,Non-Diabetic,2.0,27.0
765,30.0,26.2,72.0,0.245,121.0,112.0,Non-Diabetic,5.0,23.0
766,47.0,30.1,60.0,0.349,126.0,0.0,Diabetic,1.0,0.0


In [15]:
# Positional slice: the first two rows, every column.
df.iloc[:2]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,Diabetic
1,1.0,85.0,66.0,29.0,0.0,26.6,,31.0,Non-Diabetic


In [16]:
# Positional slice: the first row, restricted to the columns at positions 1-2.
df.iloc[:1, 1:3]

Unnamed: 0,Glucose,BloodPressure
0,148.0,72.0


In [17]:
# Cell-by-cell mask of missing values (True where the value is missing).
df.isnull()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
763,False,False,False,False,False,False,False,False,False
764,False,False,False,False,False,False,False,False,False
765,False,False,False,False,False,False,False,False,False
766,False,False,False,False,False,False,False,False,False


In [18]:
# Collapse the mask per column: True if the column has at least one missing value.
df.isnull().any()

Pregnancies                  True
Glucose                      True
BloodPressure                True
SkinThickness                True
Insulin                      True
BMI                          True
DiabetesPedigreeFunction     True
Age                          True
Outcome                     False
dtype: bool

In [19]:
# First sum counts missing cells per column, second sum totals them for the
# whole frame.
df.isnull().sum().sum()

118

In [20]:
# Distinct Age values; the missing value shows up as its own `nan` entry.
df['Age'].unique()

array([50., 31., 32., 21., 33., 30., 26., 29., 53., 54., 34., 57., 59.,
       51., 27., 41., 43., 22., 38., 60., 28., 45., 35., 46., 56., 37.,
       48., 40., 25., 24., 58., 42., 44., 39., 36., 23., 61., 69., 62.,
       55., 65., 47., 52., 66., nan, 49., 63., 67., 72., 81., 64., 70.,
       68.])

### Categorical encoding (label, one-hot, dummy)

In [21]:
def label_encoding(df, column_name):
    """Add an integer-coded version of ``column_name`` to ``df`` in place.

    A fresh ``LabelEncoder`` is fitted on the column and the resulting codes
    are written to a new ``<column_name>_LabelEncoded`` column on the caller's
    frame. A two-row preview of the columns from position 8 onward is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``column_name``; it is modified in place.
    column_name : str
        Name of the column to encode.

    Returns
    -------
    None
    """
    codes = LabelEncoder().fit_transform(df[column_name])
    encoded_name = column_name + '_LabelEncoded'
    # Deliberately mutates the caller's frame so later cells see the new column.
    df[encoded_name] = codes

    print("DataFrame after label encoding:")
    preview = df.iloc[:2, 8:]
    print(preview)

def one_hot_encoding(df, column_name):
    """One-hot encode ``column_name`` with ``OneHotEncoder`` and preview the result.

    The caller's ``df`` is left unchanged: the indicator columns are attached
    to a new frame, which is previewed (two rows, columns from position 8
    onward) and then returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``column_name``.
    column_name : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus one ``<column_name>_<i>`` column per encoder output
        column, where ``i`` is that column's position in the encoded matrix.
    """
    one_hot_encoder = OneHotEncoder()
    # Densify the encoder output so it can be wrapped in a DataFrame.
    one_hot_encoded = one_hot_encoder.fit_transform(df[[column_name]]).toarray()
    indicator_names = [
        column_name + '_' + str(i) for i in range(one_hot_encoded.shape[1])
    ]
    # Reuse df's own index: concat(axis=1) aligns on row labels, so a fresh
    # 0..n-1 index would misalign rows (and inject NaNs) for any frame that
    # has been filtered, shuffled or otherwise re-indexed.
    enc_df = pd.DataFrame(one_hot_encoded, columns=indicator_names, index=df.index)
    encoded = pd.concat([df, enc_df], axis=1)
    print("\nDataFrame after one-hot encoding:")
    print(encoded.iloc[:2, 8:])
    # Hand the combined frame back so the encoding is usable by the caller.
    return encoded


def dummy_encoding(df, column_name):
    """Preview ``column_name`` expanded into indicator columns via ``pd.get_dummies``.

    The indicator columns (prefixed with ``column_name``) are appended to a
    new frame; the caller's ``df`` is not modified. Only a two-row preview of
    the columns from position 8 onward is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``column_name``.
    column_name : str
        Name of the categorical column to expand.

    Returns
    -------
    None
    """
    indicators = pd.get_dummies(df[column_name], prefix=column_name)
    widened = pd.concat([df, indicators], axis=1)
    # One print call, newline-separated: same text as header + frame printed
    # on consecutive lines.
    print("\nDataFrame after dummy encoding:", widened.iloc[:2, 8:], sep="\n")


# Run the three encoders on 'Outcome' in order: label, one-hot, then dummy.
# Each call prints its own preview of the result.
for encode in (label_encoding, one_hot_encoding, dummy_encoding):
    encode(df, 'Outcome')

DataFrame after label encoding:
        Outcome  Outcome_LabelEncoded
0      Diabetic                     0
1  Non-Diabetic                     1

DataFrame after one-hot encoding:
        Outcome  Outcome_LabelEncoded  Outcome_0  Outcome_1
0      Diabetic                     0        1.0        0.0
1  Non-Diabetic                     1        0.0        1.0

DataFrame after dummy encoding:
        Outcome  Outcome_LabelEncoded  Outcome_Diabetic  Outcome_Non-Diabetic
0      Diabetic                     0              True                 False
1  Non-Diabetic                     1             False                  True


### Data Normalization

In [26]:
# Min-max scale every remaining column. 'Outcome' is dropped first because it
# is a non-numeric label column; its integer-coded twin
# 'Outcome_LabelEncoded' (added to df by label_encoding) is kept.
df_features = df.drop(columns=['Outcome'])
scaler = MinMaxScaler()
# fit_transform hands back a bare array, so restore both the column labels and
# the original row index when wrapping it in a DataFrame. Missing values stay
# NaN in the scaled frame.
df_normalized = pd.DataFrame(
    scaler.fit_transform(df_features),
    columns=df_features.columns,
    index=df_features.index,
)
df_normalized.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome_LabelEncoded
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,0.483333,0.0
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,,0.166667,1.0
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,,0.183333,0.0
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.0,1.0
4,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.2,0.0
