# SetUp

Import the required libraries and load the input data.

In [None]:
import pandas as pd
import numpy as np
# Relative location of the raw CSV file of the diabetes prediction dataset.
diabetes_path = '../input/diabetes_prediction_dataset.csv'
# Read the whole file into a DataFrame.
diabetes_data = pd.read_csv(filepath_or_buffer=diabetes_path)

# Data cleaning

### Data information

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the dataset's numeric columns
diabetes_data.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [None]:
# Shape of the dataset as (number of rows, number of columns)
diabetes_data.shape

(100000, 9)

In [None]:
# Concise summary of the dataset: column names, non-null counts, dtypes and memory usage
diabetes_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [None]:
# First 5 rows of the data, to see what the raw values look like
diabetes_data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


### check for missing data

In [None]:
# Count the null entries in every column; the bare last expression displays the result.
missing_values_count = diabetes_data.isna().sum()
missing_values_count

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

### Check unique values in the columns that hold objects

In [None]:
# Columns stored with the generic `object` dtype.
object_columns = ['gender', 'smoking_history']

# Print the distinct values of each one to spot unexpected categories.
for column in object_columns:
    print(f"{column}: ", diabetes_data[column].unique())

gender:  ['Female' 'Male' 'Other']
smoking_history:  ['never' 'No Info' 'current' 'former' 'ever' 'not current']


For the `gender` column, drop any row with `Other` gender

In [None]:
# Keep only the rows whose 'gender' is 'Female' or 'Male'.
# `.copy()` makes the result an independent DataFrame instead of a selection of the
# original one, so the column assignments and in-place operations applied to
# `diabetes_data` in later cells do not raise pandas' SettingWithCopyWarning.
diabetes_data = diabetes_data[diabetes_data['gender'].isin(['Female', 'Male'])].copy()

# Show the new unique values
print(diabetes_data['gender'].unique())

# Show the new size of the dataset
print(diabetes_data.shape)

['Female' 'Male']
(99982, 9)


For the `smoking_history` column, replace any `No Info` with `NaN` (a missing value).

In [None]:
# Treat the 'No Info' placeholder as a genuinely missing value (NaN).
# `assign` returns a new frame, so nothing is written into a frame that was
# obtained by filtering another one (which would raise SettingWithCopyWarning).
diabetes_data = diabetes_data.assign(
    smoking_history=diabetes_data['smoking_history'].replace('No Info', np.nan)
)

# Show the new missing values, per column
missing_values_count = diabetes_data.isnull().sum()
print(missing_values_count, '\n')

# `info()` prints its report itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output.
diabetes_data.info()

gender                     0
age                        0
hypertension               0
heart_disease              0
smoking_history        35810
bmi                        0
HbA1c_level                0
blood_glucose_level        0
diabetes                   0
dtype: int64 

<class 'pandas.core.frame.DataFrame'>
Index: 99982 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               99982 non-null  object 
 1   age                  99982 non-null  float64
 2   hypertension         99982 non-null  int64  
 3   heart_disease        99982 non-null  int64  
 4   smoking_history      64172 non-null  object 
 5   bmi                  99982 non-null  float64
 6   HbA1c_level          99982 non-null  float64
 7   blood_glucose_level  99982 non-null  int64  
 8   diabetes             99982 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 7.6+ MB
None


Show the unique values of the `smoking_history` after the change 

In [None]:
diabetes_data['smoking_history'].unique()

array(['never', nan, 'current', 'former', 'ever', 'not current'],
      dtype=object)

Number and percentage of the rows with missing `smoking_history` value

In [None]:
# Percentage of missing values in each column, relative to the number of rows.
# `missing_values_count` is the per-column null count computed in the previous step.
missing_percentage = (missing_values_count / len(diabetes_data)) * 100

# Report the totals over all columns (fixes the misspelled "calues" label).
print("Missing values count: \n", missing_values_count.sum())
print("Missing values percentage: \n", missing_percentage.sum(), "%")

Missing calues count: 
 35810
Missing values percentage: 
 35.81644696045288 %


So these missing values are a problem.
* If we drop them, we will lose a lot of data, which will affect the prediction.
* Replacing each NaN with the value from the previous row could be a solution, but it is not the best one.

In [None]:
# Impute the missing smoking history: carry the previous row's value forward,
# then fall back to 'never' for anything still missing (e.g. a gap at the very top).
diabetes_data['smoking_history'] = (
    diabetes_data['smoking_history']
    .ffill()
    .fillna('never')
)

# Total number of null cells left in the whole frame
missing_values_count = diabetes_data.isna().sum().sum()
print("Missing values count: ", missing_values_count)

# Categories present in the column after imputation
print(diabetes_data['smoking_history'].unique())

Missing values count:  0
['never' 'current' 'former' 'ever' 'not current']


At this point, no missing values remain in the dataset.

In [None]:
# Re-inspect the frame: non-null counts and dtypes of every column
diabetes_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99982 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               99982 non-null  object 
 1   age                  99982 non-null  float64
 2   hypertension         99982 non-null  int64  
 3   heart_disease        99982 non-null  int64  
 4   smoking_history      99982 non-null  object 
 5   bmi                  99982 non-null  float64
 6   HbA1c_level          99982 non-null  float64
 7   blood_glucose_level  99982 non-null  int64  
 8   diabetes             99982 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 7.6+ MB


### Check for duplicate rows

In [None]:
# Boolean mask over the rows: True where a row repeats an earlier, fully identical row.
duplicates = diabetes_data.duplicated()

# Number of flagged rows and their share of the whole frame, in percent.
num_duplicates = duplicates.sum()
percentage_duplicates = (num_duplicates / len(diabetes_data)) * 100

# Report both figures.
print("Number of duplicates:", num_duplicates)
print("Percentage of duplicates:", percentage_duplicates, "%")

Number of duplicates: 3658
Percentage of duplicates: 3.6586585585405373 %


As noted in the discussion tab of the dataset, these duplicates occur because a person can be tested for diabetes more than once.
As they make up only a small percentage of the data, we choose to drop all the duplicates.

In [None]:
# Drop the exact duplicate rows.
# The name is rebound to the returned frame instead of using `inplace=True`:
# in-place mutation of a frame that was produced by filtering another frame can
# raise SettingWithCopyWarning, and it brings no performance benefit.
diabetes_data = diabetes_data.drop_duplicates()

In [None]:
# Re-run the duplicate check on the de-duplicated frame.
duplicates = diabetes_data.duplicated()
num_duplicates = duplicates.sum()

# Report how many duplicated rows are left.
print("Number of duplicates:", num_duplicates, '\n')


Number of duplicates: 0 



### Summary

At this step, the data should be consistent. The next cells show the dataset info after the data cleaning step.

In [None]:
# Summary of the cleaned frame: row count, non-null counts and dtypes
diabetes_data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 96324 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               96324 non-null  object 
 1   age                  96324 non-null  float64
 2   hypertension         96324 non-null  int64  
 3   heart_disease        96324 non-null  int64  
 4   smoking_history      96324 non-null  object 
 5   bmi                  96324 non-null  float64
 6   HbA1c_level          96324 non-null  float64
 7   blood_glucose_level  96324 non-null  int64  
 8   diabetes             96324 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 7.3+ MB


In [None]:
# Final sanity check: per-column null counts, then the number of duplicated rows.
print("missing values")
print(diabetes_data.isna().sum())
print("\nduplicates number:", diabetes_data.duplicated().sum())

missing values
gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

duplicates number: 0


# Data analysis (prediction)

### SetUp
imports

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score

### Predict `diabetes`

#### Define Target and Features

Mapping Objects columns into numeric values:

In [None]:
# One-hot encode the two categorical columns; all other columns are passed through unchanged.
dummy_cols = pd.get_dummies(data=diabetes_data, columns=['gender', 'smoking_history'])
# Display the resulting column names.
dummy_cols.columns

Index(['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level',
       'blood_glucose_level', 'diabetes', 'gender_Female', 'gender_Male',
       'smoking_history_current', 'smoking_history_ever',
       'smoking_history_former', 'smoking_history_never',
       'smoking_history_not current'],
      dtype='object')

In [None]:
# Model inputs: every column of the encoded frame except the target `diabetes`.
# Each column is listed exactly once. Repeating a name (as 'blood_glucose_level'
# was before) selects that column twice and feeds the model a redundant copy of it.
features = ['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level',
       'blood_glucose_level', 'gender_Female', 'gender_Male',
       'smoking_history_current', 'smoking_history_ever',
       'smoking_history_former', 'smoking_history_never',
       'smoking_history_not current']

# Feature matrix and target vector
X = dummy_cols[features]
y = dummy_cols.diabetes

# `info()` prints its own report and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line after each report.
X.info()
y.info()

<class 'pandas.core.frame.DataFrame'>
Index: 96324 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   age                          96324 non-null  float64
 1   hypertension                 96324 non-null  int64  
 2   heart_disease                96324 non-null  int64  
 3   bmi                          96324 non-null  float64
 4   HbA1c_level                  96324 non-null  float64
 5   blood_glucose_level          96324 non-null  int64  
 6   blood_glucose_level          96324 non-null  int64  
 7   gender_Female                96324 non-null  bool   
 8   gender_Male                  96324 non-null  bool   
 9   smoking_history_current      96324 non-null  bool   
 10  smoking_history_ever         96324 non-null  bool   
 11  smoking_history_former       96324 non-null  bool   
 12  smoking_history_never        96324 non-null  bool   
 13  smoking_history_not c

Split data

In [None]:
# Hold out part of the data for validation (default split proportions).
# - random_state fixes the shuffle, so the split and every accuracy computed
#   from it are the same on each run of the notebook.
# - stratify=y keeps the proportion of diabetic / non-diabetic rows the same in the
#   training and validation parts, which matters because the target is imbalanced.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0, stratify=y)

#### Using Decision tree

Accuracy evaluation function

In [None]:
def get_accuracy(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Train a decision tree and return its accuracy on the validation data.

    Parameters:
        max_leaf_nodes: value passed to DecisionTreeClassifier's `max_leaf_nodes`.
        train_X, train_y: features and labels the tree is fitted on.
        val_X, val_y: features and labels the fitted tree is scored on.

    Returns:
        The accuracy_score of the tree's predictions for `val_X` against `val_y`.
    """
    # random_state=0 keeps the fitted tree the same from call to call
    tree = DecisionTreeClassifier(random_state=0, max_leaf_nodes=max_leaf_nodes)

    # fit() returns the estimator itself, so prediction can be chained onto it
    predictions = tree.fit(train_X, train_y).predict(val_X)

    # Both label arrays are cast to int before they are compared
    return accuracy_score(val_y.astype(int), predictions.astype(int))

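Find the `max_leaf_nodes` value that gives the best validation accuracy by looping over a range of candidate values. (The variable names in the next cell say `max_depth`, but the values are passed to the tree as `max_leaf_nodes`.)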

In [None]:
# Candidate values for the tree's `max_leaf_nodes` parameter.
# NOTE: despite the "max_depth" in the variable names, these are leaf-node limits,
# not depths: get_accuracy passes them to `max_leaf_nodes`. The names are kept
# because a later cell reads `best_max_depth`.
max_depth_values = range(2, 300, 5)

# Validation accuracy obtained for each candidate value
accuracy_scores = {}

# Score one tree per candidate using the get_accuracy function
for max_leaf_nodes in max_depth_values:
    accuracy_scores[max_leaf_nodes] = get_accuracy(max_leaf_nodes, train_X, val_X, train_y, val_y)

# Candidate with the highest accuracy (on ties, the first one encountered wins)
best_max_depth = max(accuracy_scores, key=accuracy_scores.get)
best_accuracy = accuracy_scores[best_max_depth]

# The label names the parameter that was actually tuned
print(f"Best max_leaf_nodes: {best_max_depth}, Accuracy: {best_accuracy*100}%")

Best max_depth: 7, Acuracy: 97.19696025912545%


Use `best_max_depth` (the best `max_leaf_nodes` value found above) to train the final tree, then calculate its accuracy.

In [None]:
 # Create decision tree model with specified max_leaf_nodes
model = DecisionTreeClassifier(
    max_leaf_nodes=best_max_depth,
    random_state=0,  # same seed as get_accuracy, so this is the tree that was scored during the search
)

# Fit the model on training data
model.fit(train_X, train_y)

# Make predictions on validation data
preds_val = model.predict(val_X)

# Calculate accuracy (labels cast to int on both sides, as in get_accuracy)
accuracy_value = accuracy_score(val_y.astype(int), preds_val.astype(int))

print(f"accuracy: {accuracy_value*100}%")

accuracy: 97.19696025912545%


#### Using Random forest

In [None]:
# Random forest with default hyper-parameters.
# random_state pins the forest's internal randomness so the reported accuracy
# is the same on every run.
model = RandomForestClassifier(random_state=0)

# Fit on the training part, predict the validation part
model.fit(train_X, train_y)
preds_val = model.predict(val_X)

# Validation accuracy in percent
print("Accuracy:", accuracy_score(val_y.astype(int), preds_val.astype(int))*100, "%")

Accuracy: 96.81491632407293 %


#### Using Neural network

# Data visualization