# Diabetes Prediction with RandomForestClassifier

A machine learning model is requested to be developed that can predict whether individuals are diabetic based on the specified features. Prior to developing the model, the necessary steps of data analysis and feature engineering are expected to be carried out.

The dataset is a part of a large dataset maintained by the National Institute of Diabetes and Digestive and Kidney Diseases in the United States. It was used for a diabetes study conducted on Pima Indian women aged 21 and above, residing in Phoenix, the 5th largest city in the state of Arizona.

The target variable is defined as "outcome," where 1 indicates a positive result of the diabetes test and 0 indicates a negative result.

## Dataset Story

* 9 Variables 768 Observations 24 KB
* Pregnancies: Number of pregnancies
* Glucose: Plasma glucose concentration after a 2-hour oral glucose tolerance test
* Blood Pressure: Blood pressure (Diastolic pressure) (mm Hg)
* SkinThickness: Skin thickness
* Insulin: 2-hour serum insulin (mu U/ml)
* DiabetesPedigreeFunction: Function (Plasma glucose concentration after a 2-hour oral glucose tolerance test)
* BMI: Body mass index
* Age: Age (years)
* Outcome: Afflicted with the disease (1) or not (0)

In [1]:
# Libraries

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.pyplot import plot
from ydata_profiling import ProfileReport
#pip install ydata-profiling
#import pandas_profiling as pp
#!pip install missingno
#import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
import pickle
from sklearn.pipeline import Pipeline


# Some configurations
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.width', 500)

In [2]:
df_= pd.read_csv("diabetes.csv")

In [3]:
df= df_.copy()

# 1. EDA- Explolary Data Analysis

In [4]:
# lets check to big picture and get to know datas.
def describe_data(df):
    print("###################### Columns ######################")
    print(df.columns)
    print("######################### Index ########################")
    print(df.index)
    print("######################## Shape #########################")
    print(df.shape)
    print("###################### First 5 Lines ###################")
    print(df.head())
    print("###################### Last 5 Lines ###################")
    print(df.tail())
    print("###################### Types ############################")
    print(df.dtypes)
    print("######################### Info #########################")
    print(df.info())
    print("######################### N/A ##########################")
    print(df.isnull().sum())
    print("######################### Quantiles  ######################")
    print(df.describe().T)

In [5]:
describe_data(df)

###################### Columns ######################
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'], dtype='object')
######################### Index ########################
RangeIndex(start=0, stop=2000, step=1)
######################## Shape #########################
(2000, 9)
###################### First 5 Lines ###################
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin    BMI  DiabetesPedigreeFunction  Age  Outcome
0            2      138             62             35        0 33.600                     0.127   47        0
1            0       84             82             31      125 38.200                     0.233   23        0
2            0      145              0              0        0 44.200                     0.630   31        0
3            0      135             68             42      250 42.300                     0.365   24        0
4            1      139     

In [6]:
# lets check to categorical and numerical features
def grab_col_names(dataframe, cat_th=10, car_th=20):
    # cat_cols, cat_but_car
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]
    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and
                   dataframe[col].dtypes != "O"]
    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and
                   dataframe[col].dtypes == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # num_cols
    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f"cat_cols: {len(cat_cols)}")
    print(f"num_cols: {len(num_cols)}")
    print(f"cat_but_car: {len(cat_but_car)}")
    print(f"num_but_cat: {len(num_but_cat)}")

    return cat_cols, num_cols, cat_but_car

In [7]:
cat_cols, num_cols, cat_but_car= grab_col_names(df)

Observations: 2000
Variables: 9
cat_cols: 1
num_cols: 8
cat_but_car: 0
num_but_cat: 1


In [8]:
num_cols

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

In [9]:
cat_cols

['Outcome']

In [10]:
ProfileReport(df)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


[A

00%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 69.18it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [11]:
# lets check to numerical features
def num_summary(dataframe, plot=False):
    quantiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 1]
    for col in num_cols:  # num_cols = grab_col_names(dataframe)["num_cols"]
        print("########## Summary Statistics of " + col + " ############")
        print(dataframe[col].describe(quantiles))

        if plot:
            sns.histplot(data=dataframe, x=col)
            plt.xlabel(col)
            plt.title("The distribution of " + col)
            plt.grid(True)
            plt.show(block=True)
            plt.title("The boxplot of " + col)
            sns.boxplot(x=df[col])
            plt.show(block=True)

In [12]:
num_summary(df, plot=True)

########## Summary Statistics of Pregnancies ############
count   2000.000
mean       3.704
std        3.306
min        0.000
5%         0.000
10%        0.000
20%        1.000
30%        1.000
40%        2.000
50%        3.000
60%        4.000
70%        5.000
80%        7.000
90%        9.000
95%       10.000
100%      17.000
max       17.000
Name: Pregnancies, dtype: float64


  plt.show(block=True)
  plt.show(block=True)
  plt.show(block=True)


########## Summary Statistics of Glucose ############
count   2000.000
mean     121.183
std       32.069
min        0.000
5%        80.000
10%       85.000
20%       95.000
30%      102.000
40%      109.000
50%      117.000
60%      125.000
70%      136.000
80%      147.000
90%      168.000
95%      181.000
100%     199.000
max      199.000
Name: Glucose, dtype: float64
########## Summary Statistics of BloodPressure ############
count   2000.000
mean      69.145
std       19.188
min        0.000
5%        43.800
10%       54.000
20%       60.000
30%       64.000
40%       68.000
50%       72.000
60%       74.000
70%       78.000
80%       82.000
90%       88.000
95%       90.000
100%     122.000
max      122.000
Name: BloodPressure, dtype: float64


  plt.show(block=True)
  plt.show(block=True)
  plt.show(block=True)


########## Summary Statistics of SkinThickness ############
count   2000.000
mean      20.935
std       16.103
min        0.000
5%         0.000
10%        0.000
20%        0.000
30%       11.000
40%       18.000
50%       23.000
60%       28.000
70%       31.000
80%       35.000
90%       40.000
95%       44.050
100%     110.000
max      110.000
Name: SkinThickness, dtype: float64
########## Summary Statistics of Insulin ############
count   2000.000
mean      80.254
std      111.181
min        0.000
5%         0.000
10%        0.000
20%        0.000
30%        0.000
40%        0.000
50%       40.000
60%       75.400
70%      110.000
80%      155.000
90%      210.000
95%      293.000
100%     744.000
max      744.000
Name: Insulin, dtype: float64


  plt.show(block=True)
  plt.show(block=True)
  plt.show(block=True)
  plt.show(block=True)
  plt.show(block=True)
  plt.show(block=True)


########## Summary Statistics of BMI ############
count   2000.000
mean      32.193
std        8.150
min        0.000
5%        21.800
10%       23.700
20%       26.000
30%       28.300
40%       30.100
50%       32.300
60%       33.800
70%       35.500
80%       38.100
90%       42.100
95%       45.010
100%      80.600
max       80.600
Name: BMI, dtype: float64
########## Summary Statistics of DiabetesPedigreeFunction ############
count   2000.000
mean       0.471
std        0.324
min        0.078
5%         0.141
10%        0.164
20%        0.222
30%        0.260
40%        0.304
50%        0.376
60%        0.455
70%        0.560
80%        0.687
90%        0.878
95%        1.136
100%       2.420
max        2.420
Name: DiabetesPedigreeFunction, dtype: float64


  plt.show(block=True)
  plt.show(block=True)
  plt.show(block=True)


########## Summary Statistics of Age ############
count   2000.000
mean      33.090
std       11.786
min       21.000
5%        21.000
10%       22.000
20%       23.000
30%       25.000
40%       26.000
50%       29.000
60%       33.000
70%       38.000
80%       42.000
90%       50.000
95%       58.000
100%      81.000
max       81.000
Name: Age, dtype: float64


  plt.show(block=True)


In [13]:
# lets check to categarical features
def cat_summary(dataframe, plot=False):
    for col in cat_cols:  # cat_cols = grab_col_names(dataframe)["cat_cols"]
        print("############## Frequency of Categorical Data ########################")
        print("The unique number of " + col + ": " + str(dataframe[col].nunique()))
        print(pd.DataFrame({col: dataframe[col].value_counts(),
                            "Ratio": 100* dataframe[col].value_counts() / len(dataframe)}))
        if plot: # plot is True (Default)
            if dataframe[col].dtypes == "bool":  # plot function not working when data type is bool
                dataframe[col] == dataframe[col].astype(int)
                sns.countplot(x=dataframe[col], data=dataframe)
                plt.show(block=True)
            else:
                sns.countplot(x=dataframe[col], data=dataframe)
                plt.show(block=True)

In [14]:
cat_summary(df,plot=True)

############## Frequency of Categorical Data ########################
The unique number of Outcome: 2
         Outcome  Ratio
Outcome                
0           1316 65.800
1            684 34.200


  plt.show(block=True)


In [15]:
# lets analyze the target variable
df.groupby("Outcome").agg(["mean", "count"])

Unnamed: 0_level_0,Pregnancies,Pregnancies,Glucose,Glucose,BloodPressure,BloodPressure,SkinThickness,SkinThickness,Insulin,Insulin,BMI,BMI,DiabetesPedigreeFunction,DiabetesPedigreeFunction,Age,Age
Unnamed: 0_level_1,mean,count,mean,count,mean,count,mean,count,mean,count,mean,count,mean,count,mean,count
Outcome,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,3.726,1316,120.726,1316,69.17,1316,20.735,1316,79.024,1316,32.019,1316,0.468,1316,33.15,1316
1,3.659,684,122.06,684,69.098,684,21.32,684,82.62,684,32.528,684,0.476,684,32.975,684


In [16]:
# lets thereshold value check
def outlier_thresholds(dataframe,col_name, q1=0.10,q3=0.90):
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquartile = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquartile
    low_limit = quartile1 - 1.5 * interquartile
    return low_limit, up_limit

In [17]:
for col in num_cols:
    print("####### " + col + " ########")
    print(outlier_thresholds(df, col))

####### Pregnancies ########
(-13.5, 22.5)
####### Glucose ########
(-39.5, 292.5)
####### BloodPressure ########
(3.0, 139.0)
####### SkinThickness ########
(-60.0, 100.0)
####### Insulin ########
(-315.0, 525.0)
####### BMI ########
(-3.900000000000002, 69.7)
####### DiabetesPedigreeFunction ########
(-0.9073000000000003, 1.9495000000000007)
####### Age ########
(-20.0, 92.0)


In [18]:
# lets check outliers in data
def check_outliers(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    return dataframe[col_name].lt(low_limit).any() | dataframe[col_name].gt(up_limit).any()

In [19]:
for col in num_cols:
    print(col, check_outliers(df, col))

Pregnancies False
Glucose False
BloodPressure True
SkinThickness True
Insulin True
BMI True
DiabetesPedigreeFunction True
Age False


In [20]:
# lets check if there are any missing values
def missing_values_table(dataframe, na_name=False):
    na_columns = [col for col in dataframe.columns if dataframe[col].isnull().sum() > 0] # Holds columns with missing values in the na_columns variable.
    n_miss = dataframe[na_columns].isnull().sum().sort_values(ascending=False) # Sort to see columns with the most missing values first.
    ratio = (dataframe[na_columns].isnull().sum() / dataframe.shape[0] * 100).sort_values(ascending=False) # Calculate the ratio of missing values to total values.
    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio']) # Concatenate the number of missing values and their ratio.
    print(missing_df, end="\n")
    if na_name: # If na_name is true, return the na_columns variable.
        return na_columns

In [21]:
missing_values_table(df)

Empty DataFrame
Columns: [n_miss, ratio]
Index: []


In [22]:
# lets check the correlation between features.
corr_matrix = df.corr()

np.tril(np.ones(corr_matrix.shape)).astype(bool)  #It is used to mark the lower triangular part of the correlation matrix. It takes the lower triangular part and sets the rest to zero.

df_lt = corr_matrix.where(np.tril(np.ones(corr_matrix.shape)).astype(bool))# While preserving the values in the triangular section, it sets the other parts as NaN (Not a Number). Thus, only the correlation values within the lower triangular section become df_lt

sns.heatmap(df_lt, cmap="coolwarm",annot=True, fmt=".2f", center=0)
corr_matrix

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.12,0.15,-0.063,-0.077,0.019,-0.025,0.539,-0.01
Glucose,0.12,1.0,0.138,0.062,0.32,0.227,0.123,0.254,0.02
BloodPressure,0.15,0.138,1.0,0.199,0.087,0.282,0.051,0.238,-0.002
SkinThickness,-0.063,0.062,0.199,1.0,0.449,0.394,0.178,-0.111,0.017
Insulin,-0.077,0.32,0.087,0.449,1.0,0.223,0.193,-0.086,0.015
BMI,0.019,0.227,0.282,0.394,0.223,1.0,0.126,0.039,0.03
DiabetesPedigreeFunction,-0.025,0.123,0.051,0.178,0.193,0.126,1.0,0.027,0.012
Age,0.539,0.254,0.238,-0.111,-0.086,0.039,0.027,1.0,-0.007
Outcome,-0.01,0.02,-0.002,0.017,0.015,0.03,0.012,-0.007,1.0


When we check the graph, we observe that the highest correlation is between the variables of pregnancy-age , glucose-outcome and insulin-skin thickness.

In [23]:
diabetic = df[df.Outcome == 1]
healthy = df[df.Outcome == 0]

plt.scatter(healthy.Age, healthy.Insulin, color="green", label="Healthy", alpha = 0.4)
plt.scatter(diabetic.Age, diabetic.Insulin, color="red", label="Diabetic", alpha = 0.4)
plt.xlabel("Age")
plt.ylabel("Insulin")
plt.legend()
plt.show()

  plt.show()


In [24]:
df= df_.copy()

# 2. Feature Engineering

In [25]:
df.isnull().any()

Pregnancies                 False
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                         False
Outcome                     False
dtype: bool

In [26]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,2000.0,3.704,3.306,0.0,1.0,3.0,6.0,17.0
Glucose,2000.0,121.183,32.069,0.0,99.0,117.0,141.0,199.0
BloodPressure,2000.0,69.145,19.188,0.0,63.5,72.0,80.0,122.0
SkinThickness,2000.0,20.935,16.103,0.0,0.0,23.0,32.0,110.0
Insulin,2000.0,80.254,111.181,0.0,0.0,40.0,130.0,744.0
BMI,2000.0,32.193,8.15,0.0,27.375,32.3,36.8,80.6
DiabetesPedigreeFunction,2000.0,0.471,0.324,0.078,0.244,0.376,0.624,2.42
Age,2000.0,33.09,11.786,21.0,24.0,29.0,40.0,81.0
Outcome,2000.0,0.342,0.474,0.0,0.0,0.0,1.0,1.0


## 2.1 Missing Values

On these columns, a value of zero does not make sense and thus indicates missing value.

Following features or variables have an invalid zero value:

* Glucose
* BloodPressure
* SkinThickness
* Insulin
* BMI

It is better to replace zeros with nan since after that counting them would be easier and zeros need to be replaced with suitable values


In [27]:
df[["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]] = df[["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]].replace(0, np.nan)


In [28]:
df.isnull().sum()

Pregnancies                   0
Glucose                      13
BloodPressure                90
SkinThickness               573
Insulin                     956
BMI                          28
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [29]:
# To fill these Nan values the data distribution needs to be understood
p= df.hist(figsize = (20,10))

In [30]:
missing_values_table(df)

               n_miss  ratio
Insulin           956 47.800
SkinThickness     573 28.650
BloodPressure      90  4.500
BMI                28  1.400
Glucose            13  0.650


In [31]:
# K-Nearest Neighbors (KNN) is a machine learning algorithm that classifies or predicts a new data point based on the majority of its nearest neighbors.
#fill with KNN Imputer method

#from sklearn.impute import KNNImputer
#imputer = KNNImputer(n_neighbors=5)
#columns_to_impute = ["Insulin", "SkinThickness"]
#df[columns_to_impute] = imputer.fit_transform(df[columns_to_impute])
#df.head()

# note: model performance dropped to 77 when I using knn method

In [32]:
# Let's fill in the missing values with the mean.

df[["Insulin",
    "SkinThickness", "Glucose","BloodPressure","BMI"]]= df.groupby("Outcome")[["Insulin",
                                                                               "SkinThickness",
                                                                               "Glucose","BloodPressure","BMI"]].transform(lambda x: x.fillna(x.mean()))

In [33]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [34]:
p= df.hist(figsize = (20,10))

In [35]:
for col in df.columns:
    print(col, check_outliers(df, col))

Pregnancies False
Glucose False
BloodPressure False
SkinThickness True
Insulin True
BMI True
DiabetesPedigreeFunction True
Age False
Outcome False


In [36]:
# lets check outliers with boxplot
p= df.boxplot(figsize = (20,10))

**There seem to be outliers in the data. We need to handle them without significantly altering the data structure**

In [37]:
columns_without_target= [col for col in df.columns if col not in "Outcome"]
columns_without_target

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

In [38]:
#def replace_with_threshold(dataframe, variable):
    #low_limit, up_limit = outlier_thresholds(dataframe, variable, q1=0.5, q3=0.95)
     #dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit
     #dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit

In [39]:
 #for col in columns_without_target:
     #replace_with_threshold(df, col)

# when ı used replace_with_threshold accurasy is 0,86

In [40]:
# after replace_with_threshold function check to outlier
 #for col in df.columns:
     #print(col, check_outliers(df, col))

In [41]:
low, up = outlier_thresholds(df,['Pregnancies',
                                 'Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age'])

df[(df[['Pregnancies',
        'Glucose','BloodPressure','SkinThickness','Insulin',
        'BMI','DiabetesPedigreeFunction','Age']]> up) | (df[['Pregnancies','Glucose','BloodPressure','SkinThickness',
                                                             'Insulin','BMI','DiabetesPedigreeFunction','Age']]< low )].shape

clf= LocalOutlierFactor(n_neighbors=5)  # n_neighborsdefault value 5
clf.fit_predict(df) # fit to method

df_scores = clf.negative_outlier_factor_
df_scores= -df_scores
df_scores[0:5]

array([1.03211871, 1.08410265, 0.99740412, 1.10127473, 0.99996808])

In [42]:
np.sort(df_scores)[0:5]

scores = pd.DataFrame(np.sort(df_scores))
plt.figure(figsize=(10, 3))  # Adjust the figure size as needed
plt.plot(scores, marker=".", linestyle="-")
plt.title("Distribution of Scores")
plt.xlabel("Index")
plt.ylabel("Score")
plt.xlim([0, 10])
plt.tight_layout()  # Improve spacing between plot elements
plt.show()
#provides the initial and rigid fracture threshold in the distribution.

  plt.show()


In [43]:
th= np.sort(df_scores)[3]
df[df_scores < th]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
675,6,195.0,70.0,29.091,151.157,30.9,0.328,31,0
1132,6,195.0,70.0,29.091,151.157,30.9,0.328,31,0


In [44]:
df.describe([0.01, 0.05, 0.75, 0.90, 0.99]).T

Unnamed: 0,count,mean,std,min,1%,5%,50%,75%,90%,99%,max
Pregnancies,2000.0,3.704,3.306,0.0,0.0,0.0,3.0,6.0,9.0,13.0,17.0
Glucose,2000.0,121.975,30.533,44.0,67.99,80.0,118.0,141.0,168.0,195.0,199.0
BloodPressure,2000.0,72.404,11.95,24.0,44.0,53.9,72.0,80.0,88.0,106.0,122.0
SkinThickness,2000.0,29.341,9.126,7.0,10.0,14.0,29.091,32.0,40.0,52.0,110.0
Insulin,2000.0,153.751,80.415,14.0,23.0,50.0,151.157,158.742,210.0,495.0,744.0
BMI,2000.0,32.651,7.19,18.2,19.5,22.2,32.4,36.8,42.1,52.9,80.6
DiabetesPedigreeFunction,2000.0,0.471,0.324,0.078,0.096,0.141,0.376,0.624,0.878,1.601,2.42
Age,2000.0,33.09,11.786,21.0,21.0,21.0,29.0,40.0,50.0,67.0,81.0
Outcome,2000.0,0.342,0.474,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0


In [45]:
df[df_scores < th].index

df[df_scores < th].drop(axis=0, labels=df[df_scores < th].index)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138.0,62.0,35.0,151.157,33.6,0.127,47,0
1,0,84.0,82.0,31.0,125.0,38.2,0.233,23,0
2,0,145.0,72.359,29.091,151.157,44.2,0.63,31,0
3,0,135.0,68.0,42.0,250.0,42.3,0.365,24,0
4,1,139.0,62.0,41.0,480.0,40.7,0.536,21,0


In [46]:
for col in df.columns:
    print(col, check_outliers(df, col))

Pregnancies False
Glucose False
BloodPressure False
SkinThickness True
Insulin True
BMI True
DiabetesPedigreeFunction True
Age False
Outcome False


## 2.2 Create New Features

**What is the model's performance if we don't perform any preprocessing on the data, and let's identify the most important variables.**

In [47]:
dff = df_.copy()

dff.dropna(inplace=True)
#dff = pd.get_dummies(dff, columns=["Outcome"], drop_first=True)
y = dff["Outcome"]
X = dff.drop(["Outcome"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=17)

from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=46).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy_score(y_pred, y_test)

# result: 0,77

#Features importance
def plot_importance(model, features, num=len(X), save=False):
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    plt.figure(figsize=(15, 15))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value",
                                                                      ascending=False)[0:num])
    plt.title('Features')
    plt.tight_layout()
    plt.show()
    if save:
        plt.savefig('importances.png')


plot_importance(rf_model, X_train)

# "We can use these best feature to create new features.
# Glucose, BMI, DiabetesPedigreeFunction

  plt.show()


In [48]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'], dtype='object')

**I interested in 3 variables that affect the model when we do nothing to the model**
* Glucose
* BMI
* DiabetesPedigreeFunction

In [49]:

df["NEW_GLUCOSEvsBMI"]= df["Glucose"] * df["BMI"]

In [50]:
df["NEW_GLUCOSEvsDPF"] = df["DiabetesPedigreeFunction"] * df["Glucose"]

In [51]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,NEW_GLUCOSEvsBMI,NEW_GLUCOSEvsDPF
0,2,138.0,62.0,35.0,151.157,33.6,0.127,47,0,4636.8,17.526
1,0,84.0,82.0,31.0,125.0,38.2,0.233,23,0,3208.8,19.572
2,0,145.0,72.359,29.091,151.157,44.2,0.63,31,0,6409.0,91.35
3,0,135.0,68.0,42.0,250.0,42.3,0.365,24,0,5710.5,49.275
4,1,139.0,62.0,41.0,480.0,40.7,0.536,21,0,5657.3,74.504


**Let's classify numerical features**

In [52]:
df["Age"].describe()

count   2000.000
mean      33.090
std       11.786
min       21.000
25%       24.000
50%       29.000
75%       40.000
max       81.000
Name: Age, dtype: float64

In [53]:
df.groupby("Outcome").agg({"Age" : ["mean", "count"]})

Unnamed: 0_level_0,Age,Age
Unnamed: 0_level_1,mean,count
Outcome,Unnamed: 1_level_2,Unnamed: 2_level_2
0,33.15,1316
1,32.975,684


In [54]:
df.loc[(df["Age"]< 25),"NEW_AGE_CAT" ]= "young"
df.loc[((df["Age"]>= 25 ) & (df["Age"]< 55)),"NEW_AGE_CAT" ]= "mature"
df.loc[(df["Age"]>= 55),"NEW_AGE_CAT" ] = "senior"

In [55]:
df.groupby("NEW_AGE_CAT").agg({"Outcome" : ["mean", "count"]})

Unnamed: 0_level_0,Outcome,Outcome
Unnamed: 0_level_1,mean,count
NEW_AGE_CAT,Unnamed: 1_level_2,Unnamed: 2_level_2
mature,0.345,1276
senior,0.326,141
young,0.34,583


In [56]:
df.loc[(df["BMI"]< 18.5 ),"NEW_BMI_CAT" ]= "underweight"
df.loc[((df["BMI"]>= 18.5 ) & (df["BMI"]< 24.9)),"NEW_BMI_CAT" ]= "normal"
df.loc[((df["BMI"]>= 24.9 ) & (df["BMI"]< 29.9)),"NEW_BMI_CAT" ]= "Fat"
df.loc[((df["BMI"]>= 24.9 ) & (df["BMI"]< 34.9)),"NEW_BMI_CAT" ]= "1st_obesity"
df.loc[(df["BMI"]>= 34.9),"NEW_BMI_CAT" ] = "1st_obesity"

In [57]:
df.groupby("NEW_BMI_CAT").agg({"Outcome" : ["mean", "count"]})

Unnamed: 0_level_0,Outcome,Outcome
Unnamed: 0_level_1,mean,count
NEW_BMI_CAT,Unnamed: 1_level_2,Unnamed: 2_level_2
1st_obesity,0.344,1737
normal,0.328,253
underweight,0.4,10


In [58]:
df.groupby("Outcome").agg({"Glucose" : ["mean", "count"]})

Unnamed: 0_level_0,Glucose,Glucose
Unnamed: 0_level_1,mean,count
Outcome,Unnamed: 1_level_2,Unnamed: 2_level_2
0,121.558,1316
1,122.778,684


In [59]:
df["Glucose"].describe()

count   2000.000
mean     121.975
std       30.533
min       44.000
25%       99.000
50%      118.000
75%      141.000
max      199.000
Name: Glucose, dtype: float64

In [60]:
df.loc[(df["Glucose"]< 70),"NEW_GLUCOSE_CAT" ]= "low"
df.loc[((df["Glucose"]>= 70 ) & (df["Glucose"]< 140)),"NEW_GLUCOSE_CAT" ]= "normal"
df.loc[(df["Glucose"]>= 140),"NEW_GLUCOSE_CAT" ] = "up"

In [61]:
df.groupby("NEW_GLUCOSE_CAT").agg({"Outcome" : ["mean", "count"]})

Unnamed: 0_level_0,Outcome,Outcome
Unnamed: 0_level_1,mean,count
NEW_GLUCOSE_CAT,Unnamed: 1_level_2,Unnamed: 2_level_2
low,0.296,27
normal,0.336,1451
up,0.36,522


In [62]:
df.loc[(df["Pregnancies"]== 0),"HAVE_CHILDREN" ]= "NO"
df.loc[(df["Pregnancies"] > 0),"HAVE_CHILDREN" ]= "YES"

In [63]:
df.groupby("HAVE_CHILDREN").agg({"Outcome" : ["mean", "count"]})

Unnamed: 0_level_0,Outcome,Outcome
Unnamed: 0_level_1,mean,count
HAVE_CHILDREN,Unnamed: 1_level_2,Unnamed: 2_level_2
NO,0.346,301
YES,0.341,1699


In [64]:
df["Insulin"].describe()

count   2000.000
mean     153.751
std       80.415
min       14.000
25%      120.000
50%      151.157
75%      158.742
max      744.000
Name: Insulin, dtype: float64

In [65]:
df.loc[(df["Insulin"]< 140),"NEW_INSULIN_CAT" ]= "nondiebet"
df.loc[((df["Insulin"]>= 140 ) & (df["Insulin"]< 199)),"NEW_INSULIN_CAT" ]= "prediyabet"
df.loc[(df["Insulin"]>= 199),"NEW_INSULIN_CAT" ] = "diabet"

In [66]:
df.groupby("NEW_INSULIN_CAT").agg({"Outcome" : ["mean", "count"]})

Unnamed: 0_level_0,Outcome,Outcome
Unnamed: 0_level_1,mean,count
NEW_INSULIN_CAT,Unnamed: 1_level_2,Unnamed: 2_level_2
diabet,0.37,235
nondiebet,0.332,579
prediyabet,0.341,1186


In [67]:
df["BloodPressure"].describe()

count   2000.000
mean      72.404
std       11.950
min       24.000
25%       64.000
50%       72.000
75%       80.000
max      122.000
Name: BloodPressure, dtype: float64

In [68]:
df.loc[(df["BloodPressure"]< 75 ),"NEW_BLOODPRESSURE_CAT" ]= "low"
df.loc[((df["BloodPressure"]>= 75 ) & (df["BloodPressure"]<= 125)),"NEW_BLOODPRESSURE_CAT" ]= "normal"
df.loc[(df["BloodPressure"]> 125),"NEW_BLOODPRESSURE_CAT" ] = "high"

In [69]:
df.groupby("NEW_BLOODPRESSURE_CAT").agg({"Outcome" : ["mean", "count"]})

Unnamed: 0_level_0,Outcome,Outcome
Unnamed: 0_level_1,mean,count
NEW_BLOODPRESSURE_CAT,Unnamed: 1_level_2,Unnamed: 2_level_2
low,0.345,1236
normal,0.338,764


## 2.3 Encoding

In [70]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,NEW_GLUCOSEvsBMI,NEW_GLUCOSEvsDPF,NEW_AGE_CAT,NEW_BMI_CAT,NEW_GLUCOSE_CAT,HAVE_CHILDREN,NEW_INSULIN_CAT,NEW_BLOODPRESSURE_CAT
0,2,138.0,62.0,35.0,151.157,33.6,0.127,47,0,4636.8,17.526,mature,1st_obesity,normal,YES,prediyabet,low
1,0,84.0,82.0,31.0,125.0,38.2,0.233,23,0,3208.8,19.572,young,1st_obesity,normal,NO,nondiebet,normal
2,0,145.0,72.359,29.091,151.157,44.2,0.63,31,0,6409.0,91.35,mature,1st_obesity,up,NO,prediyabet,low
3,0,135.0,68.0,42.0,250.0,42.3,0.365,24,0,5710.5,49.275,young,1st_obesity,normal,NO,diabet,low
4,1,139.0,62.0,41.0,480.0,40.7,0.536,21,0,5657.3,74.504,young,1st_obesity,normal,YES,diabet,low


In [71]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

Observations: 2000
Variables: 17
cat_cols: 7
num_cols: 10
cat_but_car: 0
num_but_cat: 1


In [72]:
# # Label Encoder is a data transformation method that converts text-based or categorical features into numerical data by assigning different numerical labels such as 0, 1, 2 to each unique category.

# def label_encoder(dataframe, binary_col):
#     labelencoder= LabelEncoder()  # called label encoder
#     dataframe[binary_col]= labelencoder.fit_transform(dataframe[binary_col]) # fitting to label encoder
#     return dataframe

In [73]:
# for col in cat_cols:
#     label_encoder(df, col)

In [74]:
# # MinMax Scaler is a feature scaling method that rescales the data to a specific range (typically [0, 1]).

# def min_max_scaler(dataframe, num_cols):
#     for col in num_cols:
#         mms= MinMaxScaler()
#         df[col + "MIN_MAX_SCALER"]= mms.fit_transform(df[[col]])

# min_max_scaler(df,num_cols)


In [75]:
# # Robust Scaler is a feature scaling method that scales the data using the median and interquartile range, making it resistant to outliers.

# def robust_scaler(dataframe, num_col):
#     for col in num_cols:
#         rs= RobustScaler()
#         df[col]= rs.fit_transform(df[[col]])

# #robust_scaler(df, num_cols)
# # when ı used scae to Robust method model accuracy=0,87

In [76]:
# # Standart Scaler is a feature scaling method that scales the data to have a mean of 0 and a standard deviation of 1.

# def standrt_scaler(dataframe, num_cols):
#     for col in num_cols:
#         ss= StandardScaler()
#         df[col]= ss.fit_transform(df[[col]])

# #standrt_scaler(df, num_cols)
# # when ı used scae to standart scaler method model accuracy=0,87

**I tried 3 scaler methods, but nothing changed in the data**

(768 observation)

* 0.87--> Minmax scaler
* 0.87-->Robust scaler
* 0.87--> Standart scaler

When the same score is obtained for all different scaling methods (MinMax Scaler, Robust Scaler, and Standard Scaler), it is likely due to the similarity in data distribution and structural features. Such situations arise when the impact of the scaling process is limited or similar.

In [77]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,NEW_GLUCOSEvsBMI,NEW_GLUCOSEvsDPF,NEW_AGE_CAT,NEW_BMI_CAT,NEW_GLUCOSE_CAT,HAVE_CHILDREN,NEW_INSULIN_CAT,NEW_BLOODPRESSURE_CAT
0,2,138.0,62.0,35.0,151.157,33.6,0.127,47,0,4636.8,17.526,mature,1st_obesity,normal,YES,prediyabet,low
1,0,84.0,82.0,31.0,125.0,38.2,0.233,23,0,3208.8,19.572,young,1st_obesity,normal,NO,nondiebet,normal
2,0,145.0,72.359,29.091,151.157,44.2,0.63,31,0,6409.0,91.35,mature,1st_obesity,up,NO,prediyabet,low
3,0,135.0,68.0,42.0,250.0,42.3,0.365,24,0,5710.5,49.275,young,1st_obesity,normal,NO,diabet,low
4,1,139.0,62.0,41.0,480.0,40.7,0.536,21,0,5657.3,74.504,young,1st_obesity,normal,YES,diabet,low


In [78]:
df.shape

(2000, 17)

## 3. Modelling with RandomForestClassifier

In [79]:
# y = df["Outcome"]
# X = df.drop(["Outcome"], axis=1)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=17)


In [80]:
# pipeline = Pipeline([
#     ('scaler', MinMaxScaler()),  # Or any other scaler you prefer
#     ('classifier', RandomForestClassifier(random_state=17))
# ])

In [81]:
# pipeline.fit(X_train, y_train)

In [82]:
#y_pred = pipeline.predict(X_test)
#accuracy = accuracy_score(y_test, y_pred)
#print(f"Accuracy of the pipeline: {accuracy}")

In [83]:
# Save the pipeline to a pickle file
#filename = 'diabetes_prediction_pipeline.pkl'
#pickle.dump(pipeline, open(filename, 'wb'))

In [84]:
#import pandas as pd

# Load the trained pipeline
#filename = 'diabetes_prediction_pipeline.pkl'
#loaded_pipeline = pickle.load(open(filename, 'rb'))

In [None]:

df = df.apply(pd.to_numeric, errors="coerce")

# Exit early if still empty
if df.empty:
    raise ValueError(" Your dataset is empty after cleaning. Check your CSV content.")

# Separate features and target
y = df["Outcome"]
X = df.drop("Outcome", axis=1)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=17
)

# Preprocessing setup
num_cols = X.columns.tolist()
preprocessor = ColumnTransformer(
    transformers=[("scaler", RobustScaler(), num_cols)]
)

classifier = RandomForestClassifier(
    n_estimators=200, max_depth=6, random_state=17
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", classifier)
])

# Train and evaluate
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f" Final Accuracy (RobustScaler): {accuracy:.4f}")

# Save pipeline
with open("diabetes_prediction_pipeline.pkl", "wb") as f:
    pickle.dump(pipeline, f)


 Final Accuracy (RobustScaler): 0.6567
