## importing the libraries

In [None]:
# import 'Pandas' 
import pandas as pd 

# import 'Numpy' 
import numpy as np

# import subpackage of Matplotlib
import matplotlib.pyplot as plt

# import 'Seaborn' 
import seaborn as sns

# to suppress warnings 
from warnings import filterwarnings
filterwarnings('ignore')

# display all columns of the dataframe
pd.options.display.max_columns = None
import plotly.express as px

# display all rows of the dataframe
pd.options.display.max_rows = None
 
# to display the float values upto 6 decimal places     
pd.options.display.float_format = '{:.6f}'.format

# import train-test split 
from sklearn.model_selection import train_test_split

# import StandardScaler to perform scaling

from sklearn.preprocessing import StandardScaler 

# import various functions from sklearn
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV

# import the functions for visualizing the decision tree
import pydotplus
from IPython.display import Image  

In [None]:
pip install pydotplus

In [None]:
# read the diabetes dataset (csv file) into a dataframe
data_path = '../input/pima-indians-diabetes-database/diabetes.csv'
df = pd.read_csv(data_path)

In [None]:
# preview the first few rows of the dataframe
df.head()

In [None]:
# print a concise summary of the dataframe (columns, non-null counts and dtypes)
df.info()

In [None]:
# store the column labels of the dataframe in 'z' and print them
z = df.columns
print(z)

### Let's analyse the first column: Pregnancies

In [None]:
# plot the kernel density estimate of 'Pregnancies'
# 'distplot' is deprecated in seaborn; 'kdeplot' is its replacement for the
# density-only case that 'distplot(..., hist=False)' used to draw
sns.kdeplot(x=df['Pregnancies'])
plt.show()

In [None]:
# skewness of the 'Pregnancies' column
df['Pregnancies'].skew()  # it is moderately skewed to the right

In [None]:
# kurtosis of the 'Pregnancies' column
df['Pregnancies'].kurt()  # it has a leptokurtic distribution


In [None]:
# draw a box plot of 'Pregnancies' to look for outliers
plt.boxplot(df['Pregnancies'])
plt.show()

#### From the box plot, the outliers lie above the upper whisker!

In [None]:
# select the rows whose 'Pregnancies' value is greater than 13
df[df['Pregnancies']>13]   # the outliers in the 'Pregnancies' column

In [None]:
# plot the number of observations for each value of 'Pregnancies'
# pass the column through the 'x' keyword: newer seaborn versions treat the first
# positional argument as 'data', so a positional series no longer means 'x'
sns.countplot(x=df['Pregnancies'])
plt.show()

#### From here we can see that the pregnancy value 2 has the highest frequency, whereas 14, 15 and 17 have the lowest

# 2 Glucose

In [None]:
# plot the distribution of 'Glucose' as a density histogram with a KDE curve
# 'distplot' is deprecated in seaborn; 'histplot' with 'kde=True' and
# 'stat="density"' is its replacement for the histogram + KDE case
sns.histplot(x=df['Glucose'], kde=True, stat='density', kde_kws=dict(cut=3))
plt.show()



In [None]:
# skewness of the 'Glucose' column
df['Glucose'].skew()   # skewness within [-0.5, 0.5] --> approximately symmetrical

In [None]:
# kurtosis of the 'Glucose' column
df['Glucose'].kurt()  # it has got a leptokurtic distribution

In [None]:
# draw a box plot of 'Glucose' to look for outliers
plt.boxplot(df['Glucose'])
plt.show()


### The box plot shows an outlier below the lower whisker, i.e. zero.

# 3 .Blood Pressure!

#### normal blood pressure is at approx. 127/79 mmHg in men, and 122/77 mmHg in women

In [None]:
# plot the distribution of 'BloodPressure' as a density histogram with a KDE curve
# 'distplot' is deprecated in seaborn; 'histplot' with 'kde=True' and
# 'stat="density"' is its replacement for the histogram + KDE case
sns.histplot(x=df['BloodPressure'], kde=True, stat='density', kde_kws=dict(cut=3))

# display the plot, like the other plotting cells do
plt.show()

In [None]:
# skewness of the 'BloodPressure' column
df['BloodPressure'].skew()  # blood pressure is a little left skewed

In [None]:
# select the rows where the recorded blood pressure is zero
df[df['BloodPressure']==0]

#### There are 35 entries with blood pressure = 0
#### If the blood pressure then falls completely to zero this means the heart is no longer beating or is in fibrillation. The patient is then clinically dead and permanent brain damage will occur within minutes unless appropriate steps are taken to restore the blood pressure. If this does not occur then biological death will soon follow.

#### what we can do?

1. Either we drop the rows with zero blood pressure, or we replace the zero values with the mean or the median.
2. The most robust approach would be to apply a machine learning algorithm to predict the continuous values of blood pressure!
3. Regression would do the job!



#### Plotting the distribution of the remaining columns all together!

In [None]:
# plot the distribution of every column from the 4th one up to (but not including)
# the last column, and print the skewness and kurtosis of each of them
for col in df.columns[3:-1]:
    # open a new figure so that every column gets its own plot
    plt.subplots()

    # 'distplot' is deprecated in seaborn; 'histplot' with 'kde=True' and
    # 'stat="density"' is its replacement for the histogram + KDE case
    sns.histplot(x=df[col], kde=True, stat='density', kde_kws=dict(cut=3))
    print('\n')
    print(f"the skewness of {col} is {df[col].skew()}")
    print('\n')
    print(f" the kurtosis of {col} is {df[col].kurt()}")
    

### The last column

In [None]:
# pie chart of the share of each class of the target variable 'Outcome'
# plotly builds its own figure, so no matplotlib figure is created here:
# a 'plt.figure()' call would only produce an additional, empty figure
fig = px.pie(df, names='Outcome')
fig.show()

# The dataset has 65.10% non-diabetic and 34.90% diabetic observations

### Understanding the relationship between two variables

1. Bivariate analysis
2. Multivariate analysis

#### Let's understand the trend between two variables!

In [None]:
# line plot of 'Age' against 'BloodPressure', with one line per 'Outcome' class
plt.figure(figsize = (16, 6))
sns.lineplot(data = df, x = 'BloodPressure', y = 'Age', hue = 'Outcome')
plt.show()


In [None]:
# compute the pairwise correlation between the columns of the dataframe
corr_matrix = df.corr()

# plot the correlation matrix as a heatmap and write each value in its cell
plt.figure(figsize = (12, 10))
sns.heatmap(corr_matrix, annot = True)
plt.show()


#### It shows a highly positive correlation between Age and Pregnancies

In [None]:
# line plot of 'Age' against 'Pregnancies', with one line per 'Outcome' class
plt.figure(figsize = (16, 6))
sns.lineplot(data = df, x = 'Pregnancies', y = 'Age', hue = 'Outcome')

In [None]:
# swarm plot of 'Age' for each class of 'Outcome' (one point per observation)
sns.swarmplot(data = df, x = 'Outcome', y = 'Age')

#### With a swarm plot we can answer questions such as how many people aged 50 have diabetes

## Multivariate Analysis

In [None]:
# scatter plot of 'Insulin' against 'Age', coloured by the 'Outcome' class
plt.figure(figsize = (9, 7))
sns.scatterplot(data = df, x = 'Age', y = 'Insulin', hue = 'Outcome')

#### The data shows something very interesting: people with zero insulin also have diabetes!


In [None]:
# scatter plot of 'Insulin' against 'BloodPressure', coloured by the 'Outcome' class
plt.figure(figsize = (9, 7))
sns.scatterplot(data = df, x = 'BloodPressure', y = 'Insulin', hue = 'Outcome')

In [None]:
# diabetic observations (Outcome = 1) with both blood pressure and insulin recorded as zero
df.query("BloodPressure == 0 and Insulin == 0 and Outcome == 1")

In [None]:
# number of diabetic observations with both blood pressure and insulin recorded as zero
len(df.query("BloodPressure == 0 and Insulin == 0 and Outcome == 1"))

#  splitting the data

In [None]:
# display the first row of the dataframe
df.head(1)

In [None]:
# the target variable is the 'Outcome' column
y = df['Outcome']

# the features are all the remaining columns
X = df.drop(columns = 'Outcome')

In [None]:
# split the data into train and test sets, keeping 30% of the observations for the test set
# pass the 'random_state' to obtain the same split each time you run the code
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 1,
)

In [None]:
# generalized function that builds the classification report of a model on the test set
def get_test_report(model):
    """Return the classification report of 'model' on the test set.

    The model predicts the target for the global 'X_test'; the predictions
    are compared with the actual target values in the global 'y_test'.

    model: a fitted estimator that provides predict()
    """
    # compare the actual test labels with the labels predicted from the test features
    return classification_report(y_test, model.predict(X_test))



# generalized function that builds the classification report of a model on the train set
def get_train_report(model):
    """Return the classification report of 'model' on the train set.

    The model predicts the target for the global 'X_train'; the predictions
    are compared with the actual target values in the global 'y_train'.

    model: a fitted estimator that provides predict()
    """
    # compare the actual train labels with the labels predicted from the train features
    return classification_report(y_train, model.predict(X_train))

## Decision Tree for Classification

Decision Tree is a non-parametric supervised learning method. It builds a model in the form of a tree structure. It breaks down a dataset into smaller and smaller subsets, which is called splitting. A decision node is a node on which a decision of split is to be made. A node that can not be split further is known as the terminal/leaf node. A leaf node represents the decision. A decision tree can work with both numerical and categorical variables.

A decision tree for classification is built using criteria like the Gini index and entropy.

# Gini Index
Gini index measures the probability of the sample being wrongly classified. The value of the Gini index varies between 0 and 1. We choose the variable with a low Gini index. The Gini index of the variable is calculated as:

$$Gini = 1 - \sum_{i=1}^{n} p_i^{2}$$

Where,
$p_i$: Probability of occurrence of the class 'i'

# Entropy
Entropy is one of the criteria used to build the decision tree. It calculates the heterogeneity of the sample. The entropy is zero if the sample is completely homogeneous, and it is equal to 1 if the sample is equally divided. Entropy of the variable 'X' is calculated as:

$$E(X) = -\sum_{i=1}^{c} p_i \log_2 p_i$$

Where,
$p_i$: Probability of occurrence of the class 'i'

And the conditional entropy of the variable is given as:

$$E(T, X) = \sum_{c \in X} P(c)\,E(c)$$

Where,
$P(c)$: Probability of occurrence of the class 'c'
$E(c)$: Entropy of the class 'c'

The information gain is the difference between the entropy of the target variable and the entropy of the target variable given an independent variable. We split on the variable that corresponds to the highest information gain.

Build a full decision tree model on a train dataset using 'entropy'.

In [None]:
# instantiate the 'DecisionTreeClassifier' object using the 'entropy' criterion
# the criterion has to be passed explicitly, since the default criterion is 'gini'
# pass the 'random_state' to obtain the same samples for each time you run the code
decision_tree_classification = DecisionTreeClassifier(criterion = 'entropy', random_state = 10)

# fit the model using fit() on train data
decision_tree = decision_tree_classification.fit(X_train, y_train)

In [None]:
# save the column names in 'labels'
labels = X_train.columns

# export the fitted decision tree in DOT format
# pass the fitted model 'decision_tree' to export it to Graphviz
# ('decision_tree_classification' is rebound to a new, unfitted classifier in the
# grid search cell, so exporting it would fail when this cell is run again afterwards)
# pass the column names to 'feature_names'
# pass the required class labels to 'class_names'
dot_data = tree.export_graphviz(decision_tree, feature_names = labels, class_names = ["0","1"],
                                filled = True, rounded = True, special_characters = True)

# plot the decision tree using DOT format in 'dot_data'
graph = pydotplus.graph_from_dot_data(dot_data)

# display the decision tree
Image(graph.create_png())

# double-click on the image below to get an expanded view

### Pretty messy tree, right?

1. An advantage of a decision tree is that it explains everything in layman's terms!

In [None]:
# print the performance measures of the full decision tree on the train set
print(get_train_report(decision_tree))

# print the performance measures on the test set as well
# overfitting shows up as a gap between the train scores and the test scores,
# so it can not be judged from the train report alone
print(get_test_report(decision_tree))

##  Clear overfit!!!

#### find the best parameters

## 3.1 Tune the Hyperparameters using GridSearchCV (Decision Tree)

Hyperparameters are the parameters in the model that are preset by the user. GridSearch considers all the combinations of hyperparameters and returns the best hyperparameter values. We pass some of the hyperparameters in the decision tree to the GridSearchCV() and build the tree using the optimal values obtained using GridSearch method.

In [None]:
# create a dictionary with hyperparameters and its values
# pass the criteria 'entropy' and 'gini' to the parameter, 'criterion'
# pass the range of values to 'max_depth' that assigns maximum depth of the tree
# 'max_features' assigns maximum number of features to consider for the best split. We pass the string 'sqrt' and 'log2'
# 'sqrt' considers maximum number of features equal to the square root of total features
# 'log2' considers maximum number of features equal to the log of total features with base 2
# pass the range of values to 'min_samples_split' that assigns minimum number of samples to split an internal node
# pass the range of values to 'min_samples_leaf' that assigns minimum number of samples required at the terminal/leaf node
# pass the range of values to 'max_leaf_nodes' that assigns maximum number of leaf nodes in the tree
tuned_paramaters = [{'criterion': ['entropy', 'gini'],
                     'max_depth': range(2, 10),
                     'max_features': ["sqrt", "log2"],
                     'min_samples_split': range(2,10),
                     'min_samples_leaf': range(1,10),
                     'max_leaf_nodes': range(2, 10)}]

# instantiate the 'DecisionTreeClassifier'
# pass the 'random_state' to obtain the same samples for each time you run the code
decision_tree_classification = DecisionTreeClassifier(random_state = 10)

# use GridSearchCV() to find the optimal value of the hyperparameters
# estimator: pass the decision tree classifier model
# param_grid: pass the list 'tuned_paramaters'
# cv: number of folds in k-fold i.e. here cv = 5
# n_jobs: -1 uses all the processors; the grid has 2*8*2*8*9*8 = 18432 combinations,
#         i.e. 92160 fits with 5 folds, and the fits are independent of each other
tree_grid = GridSearchCV(estimator = decision_tree_classification,
                         param_grid = tuned_paramaters,
                         cv = 5,
                         n_jobs = -1)

# fit the model on X_train and y_train using fit()
tree_grid_model = tree_grid.fit(X_train, y_train)

# get the best parameters
print('Best parameters for decision tree classifier: ', tree_grid_model.best_params_, '\n')

In [None]:
# build the 'DecisionTreeClassifier' with the best hyperparameters of the grid search
# 'best_params_' is the dictionary of the tuned parameter names and their best values
# '**' unpacks this dictionary into the keyword arguments of the classifier
# pass the 'random_state' to obtain the same samples for each time you run the code
dt_model = DecisionTreeClassifier(random_state = 10, **tree_grid_model.best_params_)

# fit the tuned model on the train set
dt_model = dt_model.fit(X_train, y_train)

In [None]:
# print the performance measures for train set for the model with best parameters
# call the function 'get_train_report'
# pass the decision tree using GridSearch to the function
print('Classification Report for train set: \n', get_train_report(dt_model))

# print the performance measures for test set as well, so that the tuned tree
# can be judged on unseen data and compared with its train set performance
print('Classification Report for test set: \n', get_test_report(dt_model))

In [None]:
# column names of the train set, used to label the features in the tree
labels = X_train.columns

# export the tuned decision tree 'dt_model' to Graphviz in DOT format
# 'feature_names' takes the column names and 'class_names' the class labels
dot_data = tree.export_graphviz(
    dt_model,
    feature_names = labels,
    class_names = ["0", "1"],
)

# build the graph from the DOT data
graph = pydotplus.graph_from_dot_data(dot_data)

# render the graph as a PNG image and display it
Image(graph.create_png())

# double-click on the image below to get an expanded view

##  Tune the Hyperparameters using GridSearchCV (Random Forest)

In [None]:
# create a dictionary with hyperparameters and its values
# pass the criteria 'entropy' and 'gini' to the parameter, 'criterion'
# pass a list of values to 'n_estimators' to build the different number of trees in the random forest
# pass a list of values to 'max_depth' that assigns maximum depth of the tree
# 'max_features' assigns maximum number of features to consider for the best split. We pass the string 'sqrt' and 'log2'
# 'sqrt' considers maximum number of features equal to the square root of total features
# 'log2' considers maximum number of features equal to the log of total features with base 2
# pass a list of values to 'min_samples_split' that assigns minimum number of samples to split an internal node
# pass a list of values to 'min_samples_leaf' that assigns minimum number of samples required at the terminal/leaf node
# pass a list of values to 'max_leaf_nodes' that assigns maximum number of leaf nodes in the tree
# NOTE(review): a tree with at most 11 leaf nodes can not be deeper than 10 levels, so the
# three 'max_depth' values never restrict the trees here; the grid is kept unchanged
tuned_paramaters = [{'criterion': ['entropy', 'gini'],
                     'n_estimators': [10, 30, 50, 70, 90],
                     'max_depth': [10, 15, 20],
                     'max_features': ['sqrt', 'log2'],
                     'min_samples_split': [2, 5, 8, 11],
                     'min_samples_leaf': [1, 5, 9],
                     'max_leaf_nodes': [2, 5, 8, 11]}]

# instantiate the 'RandomForestClassifier'
# pass the 'random_state' to obtain the same samples for each time you run the code
random_forest_classification = RandomForestClassifier(random_state = 10)

# use GridSearchCV() to find the optimal value of the hyperparameters
# estimator: pass the random forest classifier model
# param_grid: pass the list 'tuned_paramaters'
# cv: number of folds in k-fold i.e. here cv = 5
# n_jobs: -1 uses all the processors; the grid has 2*5*3*2*4*3*4 = 2880 combinations,
#         i.e. 14400 forests with 5 folds, and the fits are independent of each other
rf_grid = GridSearchCV(estimator = random_forest_classification,
                       param_grid = tuned_paramaters,
                       cv = 5,
                       n_jobs = -1)

# use fit() to fit the model on the train set
rf_grid_model = rf_grid.fit(X_train, y_train)

# get the best parameters
print('Best parameters for random forest classifier: ', rf_grid_model.best_params_, '\n')

### Build the model using the tuned hyperparameters.

In [None]:
# build the 'RandomForestClassifier' with the best hyperparameters of the grid search
# 'best_params_' is the dictionary of the tuned parameter names and their best values
# '**' unpacks this dictionary into the keyword arguments of the classifier
# pass the 'random_state' to obtain the same samples for each time you run the code
# fit() is chained to fit the model on the train set
rf_model = RandomForestClassifier(random_state = 10, **rf_grid_model.best_params_).fit(X_train, y_train)

# performance measures of the tuned random forest on the test set
print('Classification Report for test set:\n', get_test_report(rf_model))

## Identify the Important Features
Let us create a barplot to identify the important feature in the dataset.

The attribute `feature_importances_` returns the value corresponding to each feature, which is defined as the ratio of the total decrease in Gini impurity across every tree in the forest where the feature is used to the total count of trees in the forest. This is also called the Gini importance.

There is another, accuracy-based method. It calculates the average decrease in the accuracy calculated on the out-of-bag samples, with and without shuffling the variable, across all the trees in the random forest. The out-of-bag samples are the samples in the training dataset which are not considered while building a tree.

In [None]:
# pair every feature name with its gini importance ('feature_importances_') and
# order the features from the most important to the least important one
important_features = (
    pd.DataFrame({'Features': X_train.columns,
                  'Importance': rf_model.feature_importances_})
    .sort_values(by = 'Importance', ascending = False)
)

# horizontal barplot of the features according to their importance
ax = sns.barplot(data = important_features, x = 'Importance', y = 'Features')

# title and axes labels, with the text size set using 'fontsize'
ax.set_title('Feature Importance', fontsize = 15)
ax.set_xlabel('Importance', fontsize = 15)
ax.set_ylabel('Features', fontsize = 15)

# display the plot
plt.show()