# Task
## Perform the following steps:
- Load ANY dataset from pycaret library.
- Train the model using different algorithms, and compare the metrics.
- Apply Normalization and Outlier Removal and report the performance change.

## Importing Pycaret

In [1]:
import pycaret

## Ingesting the DataSet

In [5]:
from pycaret.datasets import get_data
titanicDataSet = get_data('titanic')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
titanicDataSet.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Setting Up the Classification Environment

In [8]:
from pycaret.classification import *
s = setup(data =titanicDataSet, target ='Survived')

Unnamed: 0,Description,Value
0,Session id,6214
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 12)"
4,Transformed data shape,"(891, 14)"
5,Transformed train set shape,"(623, 14)"
6,Transformed test set shape,"(268, 14)"
7,Numeric features,6
8,Categorical features,5
9,Rows with missing values,79.5%


## Building and Comparing Models

In [11]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8073,0.8452,0.6855,0.7948,0.7319,0.583,0.5906,0.497
ridge,Ridge Classifier,0.7302,0.8327,0.4016,0.813,0.5198,0.3678,0.4156,0.039
et,Extra Trees Classifier,0.6788,0.71,0.238,0.7688,0.3509,0.2171,0.2799,0.08
nb,Naive Bayes,0.6676,0.7679,0.1755,0.8217,0.2841,0.1747,0.2628,0.036
lda,Linear Discriminant Analysis,0.6196,0.5109,0.0304,0.0583,0.04,0.0195,0.0215,0.036
rf,Random Forest Classifier,0.6195,0.7802,0.0125,0.15,0.0231,0.012,0.0273,0.096
dt,Decision Tree Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.032
qda,Quadratic Discriminant Analysis,0.6164,0.4936,0.0,0.0,0.0,0.0,0.0,0.036
ada,Ada Boost Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.04
gbc,Gradient Boosting Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.056


## Applying Normalization and Outlier Removal

### Normalization

In [16]:
setup_model = setup(data = titanicDataSet, target = 'Survived', normalize = 'True', normalize_method = 'zscore')
best_model = compare_models()

Unnamed: 0,Description,Value
0,Session id,6673
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 12)"
4,Transformed data shape,"(891, 14)"
5,Transformed train set shape,"(623, 14)"
6,Transformed test set shape,"(268, 14)"
7,Numeric features,6
8,Categorical features,5
9,Rows with missing values,79.5%


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.7752,0.8054,0.5192,0.8349,0.6343,0.4869,0.5184,0.053
svm,SVM - Linear Kernel,0.7189,0.7528,0.4346,0.7214,0.5177,0.3513,0.379,0.036
lr,Logistic Regression,0.703,0.8477,0.3179,0.7741,0.4411,0.2911,0.3448,0.04
et,Extra Trees Classifier,0.6884,0.7903,0.2424,0.7496,0.3551,0.2367,0.3039,0.085
nb,Naive Bayes,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.045
dt,Decision Tree Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.046
ridge,Ridge Classifier,0.6164,0.8181,0.0,0.0,0.0,0.0,0.0,0.037
rf,Random Forest Classifier,0.6164,0.8278,0.0,0.0,0.0,0.0,0.0,0.09
ada,Ada Boost Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.034
gbc,Gradient Boosting Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.064


### Outlier Removal

In [19]:
## outliers_threshold = 0.05 is the default value.
model_setup = setup(data = titanicDataSet, target = 'Survived', remove_outliers= True, outliers_threshold= 0.05)
best_model = compare_models()

Unnamed: 0,Description,Value
0,Session id,2271
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 12)"
4,Transformed data shape,"(859, 14)"
5,Transformed train set shape,"(591, 14)"
6,Transformed test set shape,"(268, 14)"
7,Numeric features,6
8,Categorical features,5
9,Rows with missing values,79.5%


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8137,0.8555,0.6864,0.7997,0.7373,0.5946,0.5998,0.121
ridge,Ridge Classifier,0.7528,0.8447,0.448,0.8415,0.5795,0.4275,0.4727,0.078
et,Extra Trees Classifier,0.7079,0.7791,0.2935,0.8633,0.432,0.2955,0.3757,0.117
nb,Naive Bayes,0.6628,0.7678,0.1764,0.7547,0.2802,0.1657,0.2382,0.072
knn,K Neighbors Classifier,0.6452,0.6073,0.3768,0.5732,0.4505,0.2036,0.217,0.106
rf,Random Forest Classifier,0.6276,0.8126,0.0462,0.55,0.0847,0.0433,0.1009,0.122
dt,Decision Tree Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.069
qda,Quadratic Discriminant Analysis,0.6164,0.4532,0.0,0.0,0.0,0.0,0.0,0.067
ada,Ada Boost Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.079
gbc,Gradient Boosting Classifier,0.6164,0.4987,0.0,0.0,0.0,0.0,0.0,0.081


### Applying both Normalization and Outlier Removal

In [22]:
model_setup = setup(data = titanicDataSet, target= 'Survived', normalize = True, normalize_method= 'zscore',
                           remove_outliers= True, outliers_threshold= 0.05)
best_model = compare_models()

Unnamed: 0,Description,Value
0,Session id,7918
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 12)"
4,Transformed data shape,"(859, 14)"
5,Transformed train set shape,"(591, 14)"
6,Transformed test set shape,"(268, 14)"
7,Numeric features,6
8,Categorical features,5
9,Rows with missing values,79.5%


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.7512,0.79,0.4764,0.8003,0.5935,0.4305,0.4629,0.097
svm,SVM - Linear Kernel,0.7014,0.724,0.338,0.7462,0.449,0.2913,0.3378,0.08
et,Extra Trees Classifier,0.6903,0.795,0.2301,0.8798,0.3594,0.2392,0.3349,0.111
lr,Logistic Regression,0.6887,0.8205,0.3051,0.7295,0.4237,0.2593,0.3076,0.076
rf,Random Forest Classifier,0.6405,0.8269,0.0668,0.75,0.121,0.0771,0.1675,0.12
nb,Naive Bayes,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.074
dt,Decision Tree Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.084
ridge,Ridge Classifier,0.6164,0.7903,0.0,0.0,0.0,0.0,0.0,0.079
ada,Ada Boost Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.079
gbc,Gradient Boosting Classifier,0.6164,0.4902,0.0,0.0,0.0,0.0,0.0,0.101
