# Experiment - 2.3
- Build a classification model by using different machine learning algorithms.

## Installing Libraries

In [1]:
import pycaret
print(pycaret.__version__)

3.4.0


## Ingesting the Required Dataset

In [4]:
from pycaret.datasets import get_data
diabetesDataSet = get_data('diabetes')    # Loads the "diabetes" dataset, which is a binary classification problem
                                          # The target column, "Class variable," has two classes (binary values)

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
diabetesDataSet.describe()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


## Setup Classification Environment

In [8]:
from pycaret.classification import *  
# To perform classification using Pycaret

In [12]:
s = setup(data=diabetesDataSet, target='Class variable')
# Initializes the PyCaret classification environment
# Specifies the dataset and the target column to be used for training

Unnamed: 0,Description,Value
0,Session id,7083
1,Target,Class variable
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(537, 9)"
6,Transformed test set shape,"(231, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


## Building and Comparing Models

In [13]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.7597,0.8012,0.55,0.7115,0.6108,0.4429,0.4562,0.076
et,Extra Trees Classifier,0.7559,0.7974,0.5327,0.7043,0.5983,0.4297,0.4429,0.061
lr,Logistic Regression,0.7558,0.7966,0.5222,0.7138,0.5982,0.4287,0.443,0.47
ridge,Ridge Classifier,0.7558,0.7987,0.5117,0.7217,0.5937,0.426,0.4426,0.024
lda,Linear Discriminant Analysis,0.7539,0.799,0.517,0.7105,0.5939,0.4235,0.4378,0.015
nb,Naive Bayes,0.7522,0.7852,0.5719,0.6929,0.6187,0.4379,0.4482,0.011
gbc,Gradient Boosting Classifier,0.741,0.7733,0.5816,0.6515,0.6108,0.4178,0.4223,0.048
ada,Ada Boost Classifier,0.7337,0.7482,0.5447,0.6597,0.5901,0.396,0.4046,0.034
qda,Quadratic Discriminant Analysis,0.7298,0.7949,0.5193,0.655,0.5703,0.3789,0.3898,0.011
lightgbm,Light Gradient Boosting Machine,0.7168,0.7617,0.5503,0.6124,0.5752,0.3645,0.369,0.075


## Data Normalization

In [19]:
# Commonly used techniques: clipping, log scaling, z - score, minmax, maxabs, robust
model_setup = setup(data=diabetesDataSet, target='Class variable', normalize='True', normalize_method='zscore')
best_model = compare_models()

Unnamed: 0,Description,Value
0,Session id,3256
1,Target,Class variable
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(537, 9)"
6,Transformed test set shape,"(231, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7673,0.8327,0.562,0.7126,0.6247,0.4604,0.4696,0.017
rf,Random Forest Classifier,0.7672,0.8291,0.6202,0.6924,0.6501,0.4769,0.4818,0.072
ridge,Ridge Classifier,0.7654,0.8347,0.5512,0.7109,0.6169,0.4531,0.463,0.017
gbc,Gradient Boosting Classifier,0.7653,0.8294,0.636,0.683,0.6528,0.4767,0.4817,0.057
et,Extra Trees Classifier,0.7616,0.8372,0.5942,0.6904,0.6329,0.4589,0.466,0.061
lda,Linear Discriminant Analysis,0.7598,0.8347,0.5512,0.6972,0.6119,0.4426,0.4512,0.016
nb,Naive Bayes,0.7503,0.8064,0.605,0.6634,0.627,0.4414,0.447,0.014
ada,Ada Boost Classifier,0.743,0.8115,0.5827,0.6544,0.6123,0.4218,0.4264,0.048
qda,Quadratic Discriminant Analysis,0.7429,0.8188,0.5725,0.6511,0.6064,0.4176,0.4215,0.017
svm,SVM - Linear Kernel,0.7412,0.7891,0.6202,0.6339,0.6201,0.4254,0.4304,0.015


## Outlier Removal

In [22]:
## outliers_threshold = 0.05 is the default value.
model_setup= setup(data =diabetesDataSet, target ='Class variable', remove_outliers=True, outliers_threshold=0.05)

Unnamed: 0,Description,Value
0,Session id,1755
1,Target,Class variable
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(741, 9)"
5,Transformed train set shape,"(510, 9)"
6,Transformed test set shape,"(231, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple
