# Exercise

1. Load heart disease dataset in pandas dataframe
2. Remove outliers using Z score. Usual guideline is to remove anything that has Z score > 3 formula or Z score < -3
3. Convert text columns to numbers using label encoding and one hot encoding
4. Apply scaling
5. Build a classification model using support vector machine. Use standalone model as well as Bagging model and check if you see any  difference in the performance.
6. Now use decision tree classifier. Use standalone model as well as Bagging and check if you notice any difference in performance
7. Comparing performance of svm and decision tree classifier figure out where it makes most sense to use bagging and why. Use internet to figure out in what conditions bagging works the best.

### Import library

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

### Read data

In [2]:
df = pd.read_csv("./heart.csv")

df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


### Remove outlier

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [4]:
df[df.RestingBP > df.RestingBP.mean() + 3 * df.RestingBP.std()]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
109,39,M,ATA,190,241,0,Normal,106,N,0.0,Up,0
241,54,M,ASY,200,198,0,Normal,142,Y,2.0,Flat,1
365,64,F,ASY,200,0,0,Normal,140,Y,1.0,Flat,1
399,61,M,NAP,200,0,1,ST,70,N,0.0,Flat,1
592,61,M,ASY,190,287,1,LVH,150,Y,2.0,Down,1
732,56,F,ASY,200,288,1,LVH,133,Y,4.0,Down,1
759,54,M,ATA,192,283,0,LVH,195,N,0.0,Up,1


In [5]:
df[df.Cholesterol > df.Cholesterol.mean() + 3 * df.Cholesterol.std()]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
76,32,M,ASY,118,529,0,Normal,130,N,0.0,Flat,1
149,54,M,ASY,130,603,1,Normal,125,Y,1.0,Flat,1
616,67,F,NAP,115,564,0,LVH,160,N,1.6,Flat,0


In [6]:
df[df.FastingBS > df.FastingBS.mean() + 3 * df.FastingBS.std()]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease


In [7]:
df[df.MaxHR > df.MaxHR.mean() + 3 * df.MaxHR.std()]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease


In [8]:
df[df.Oldpeak > df.Oldpeak.mean() + 3 * df.Oldpeak.std()]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
166,50,M,ASY,140,231,0,ST,140,Y,5.0,Flat,1
702,59,M,TA,178,270,0,LVH,145,N,4.2,Down,0
771,55,M,ASY,140,217,0,Normal,111,Y,5.6,Down,1
791,51,M,ASY,140,298,0,Normal,122,Y,4.2,Flat,1
850,62,F,ASY,160,164,0,LVH,145,N,6.2,Down,1
900,58,M,ASY,114,318,0,ST,140,N,4.4,Down,1


In [9]:
df = df[df.RestingBP <= df.RestingBP.mean() + 3 * df.RestingBP.std()]
df = df[df.Cholesterol <= df.Cholesterol.mean() + 3 * df.Cholesterol.std()]
df = df[df.Oldpeak <= df.Oldpeak.mean() + 3 * df.Oldpeak.std()]

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 902 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             902 non-null    int64  
 1   Sex             902 non-null    object 
 2   ChestPainType   902 non-null    object 
 3   RestingBP       902 non-null    int64  
 4   Cholesterol     902 non-null    int64  
 5   FastingBS       902 non-null    int64  
 6   RestingECG      902 non-null    object 
 7   MaxHR           902 non-null    int64  
 8   ExerciseAngina  902 non-null    object 
 9   Oldpeak         902 non-null    float64
 10  ST_Slope        902 non-null    object 
 11  HeartDisease    902 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 91.6+ KB


### Convert categorical data

In [11]:
df.ChestPainType.unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [12]:
df.RestingECG.unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [13]:
df.ST_Slope.unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [14]:
df.ExerciseAngina.replace({'N': 0, 'Y': 1}, inplace=True)

df.ST_Slope.replace({'Down': 1, 'Flat': 2, 'Up': 3}, inplace=True)

df.RestingECG.replace({'Normal': 1, 'ST': 2, 'LVH': 3}, inplace=True)

In [15]:
df = pd.get_dummies(df, drop_first=True)

df

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,140,289,0,1,172,0,0.0,3,0,1,1,0,0
1,49,160,180,0,1,156,0,1.0,2,1,0,0,1,0
2,37,130,283,0,2,98,0,0.0,3,0,1,1,0,0
3,48,138,214,0,1,108,1,1.5,2,1,0,0,0,0
4,54,150,195,0,1,122,0,0.0,3,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,1,132,0,1.2,2,1,1,0,0,1
914,68,144,193,1,1,141,0,3.4,2,1,1,0,0,0
915,57,130,131,0,1,115,1,1.2,2,1,1,0,0,0
916,57,130,236,0,3,174,0,0.0,2,1,0,1,0,0


In [16]:
X = df.drop('HeartDisease', axis='columns')
y = df.HeartDisease

### Scaling

In [17]:
scaler = StandardScaler()

scaled_X = scaler.fit_transform(X)

### Modeling - SVC

In [18]:
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.2)

In [19]:
data = {
    'svm':{
        'model': SVC(),
        'params':{
            "C": range(1, 100, 10),
            "kernel":['linear', 'poly', 'rbf', 'sigmoid'],
            "gamma":['auto', 'scale']
        }
    }
}

In [20]:
scores = []

for model_name, mp in data.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

score_df = pd.DataFrame(scores, columns=['model','best_score','best_params'])
score_df

Unnamed: 0,model,best_score,best_params
0,svm,0.857184,"{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}"


### Bagging - SVC

In [23]:
clf = BaggingClassifier(estimator=SVC(gamma='auto'),
                        n_estimators=100, 
                        max_samples=0.8,
                        oob_score=True)

clf.fit(X_train, y_train)
clf.oob_score_

0.8585298196948682

### Modeling - Decision Tree

In [28]:
data = {
    'decision_tree':{
        'model': DecisionTreeClassifier(),
        'params':{
            'criterion':['gini', 'entropy', 'log_loss'],
            'max_features':['sqrt', 'log2', None],
        }
    }
}

In [29]:
scores = []

for model_name, mp in data.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

score_df = pd.DataFrame(scores, columns=['model','best_score','best_params'])
score_df

Unnamed: 0,model,best_score,best_params
0,decision_tree,0.790556,"{'criterion': 'entropy', 'max_features': 'sqrt'}"


In [31]:
clf = BaggingClassifier(estimator=DecisionTreeClassifier(criterion='entropy', max_features='sqrt'),
                        n_estimators=100, 
                        max_samples=0.8,
                        oob_score=True)

clf.fit(X_train, y_train)
clf.oob_score_

0.8779472954230236