Download heart disease dataset heart.csv in [Exercise](https://github.com/codebasics/py/tree/master/ML/18_PCA/Exercise) folder and do following, (credits of dataset:  https://www.kaggle.com/fedesoriano/heart-failure-prediction)

1. Load heart disease dataset in pandas dataframe
1. Remove outliers using Z score. Usual guideline is to remove anything that has Z score > 3 formula or Z score < -3
1. Convert text columns to numbers using label encoding and one hot encoding
1. Apply scaling
1. Build a classification model using various methods (SVM, logistic regression, random forest) and check which model gives you the best accuracy
1. Now use PCA to reduce dimensions, retrain your model and see what impact it has on your model in terms of accuracy. Keep in mind that many times doing PCA reduces the accuracy but computation is much lighter and that's the trade off you need to consider while building models in real life


[Solution Link](https://github.com/codebasics/py/blob/master/ML/18_PCA/Exercise/PCA_heart_disease_prediction_exercise_solution.ipynb)



 

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

In [3]:
df=pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [5]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [6]:
# According to the above describe outlier seems like at column:
# RestingBP(see the min)
# Cholesterol (see the min and max)
# Oldpeak  (see the min and max)

## Outlier can treat by 3 method
1. IQR
2. Z-score (column use)
3. formula using mean and std

### For Cholesterol  ---  using IQR

In [7]:
Q1=df.Cholesterol.quantile(0.25)
Q3=df.Cholesterol.quantile(0.75)
IQR=Q3-Q1

In [8]:
lower_limit = Q1-1.5*IQR
upper_limit = Q3+1.5*IQR

In [9]:
df=df[(df.Cholesterol>lower_limit) & (df.Cholesterol<upper_limit)]
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


### For RestingBP --- using "formula using mean and std"
#### lower limit= mean - 3*std
#### upper limit= mean + 3*std


In [10]:
# in industries usually '3' standard deviations are used. so '3' is multiplied

lower_limit1= df.RestingBP.mean() - (3*df.RestingBP.std())
upper_limit1= df.RestingBP.mean() + (3*df.RestingBP.std())

In [11]:
df=df[(df.RestingBP>lower_limit1) & (df.RestingBP<upper_limit1)]
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


### For Oldpeak --- using "Z-Score"

### Z = (X - Mean)/Standard_Deviation

In [12]:
Z_scores = (df.Oldpeak - df.Oldpeak.mean())/df.Oldpeak.std()
df['Z']=Z_scores
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Z
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0,-0.839668
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1,0.092871
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0,-0.839668
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,0.55914
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0,-0.839668


In [13]:
df=df[(df.Z>-3) & (df.Z<3)]
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Z
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0,-0.839668
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1,0.092871
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0,-0.839668
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,0.55914
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0,-0.839668


In [14]:
from sklearn.preprocessing import LabelEncoder , OneHotEncoder
from sklearn.compose import ColumnTransformer


In [15]:
le=LabelEncoder()
df.Sex=le.fit_transform(df.Sex)

le1=LabelEncoder()
df.ChestPainType=le.fit_transform(df.ChestPainType)

le2=LabelEncoder()
df.RestingECG=le.fit_transform(df.RestingECG)

le3=LabelEncoder()
df.ExerciseAngina=le.fit_transform(df.ExerciseAngina)

le4=LabelEncoder()
df.ST_Slope=le.fit_transform(df.ST_Slope)

In [16]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Z
0,40,1,1,140,289,0,1,172,0,0.0,2,0,-0.839668
1,49,0,2,160,180,0,1,156,0,1.0,1,1,0.092871
2,37,1,1,130,283,0,2,98,0,0.0,2,0,-0.839668
3,48,0,0,138,214,0,1,108,1,1.5,1,1,0.55914
4,54,1,2,150,195,0,1,122,0,0.0,2,0,-0.839668


In [17]:
ct=ColumnTransformer(transformers=[('ChestPainType',OneHotEncoder(),['ChestPainType']),('RestingECG',OneHotEncoder(),['RestingECG'])],remainder='passthrough')

In [18]:
df1 =ct.fit_transform(df)

In [19]:
df1=pd.DataFrame(df1)
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,40.0,1.0,140.0,289.0,0.0,172.0,0.0,0.0,2.0,0.0,-0.839668
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,49.0,0.0,160.0,180.0,0.0,156.0,0.0,1.0,1.0,1.0,0.092871
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,37.0,1.0,130.0,283.0,0.0,98.0,0.0,0.0,2.0,0.0,-0.839668
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,48.0,0.0,138.0,214.0,0.0,108.0,1.0,1.5,1.0,1.0,0.55914
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,54.0,1.0,150.0,195.0,0.0,122.0,0.0,0.0,2.0,0.0,-0.839668


In [20]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [21]:
svc= SVC()
# svc.fit(df.drop(df1.columns[16],axis=1),df1[16])

if len(df1.columns) > 16:  # Check if column at index 16 exists
    column_to_drop = df1.columns[16]
    X = df1.drop(column_to_drop, axis=1)
    y = df1[column_to_drop]
    svc.fit(X, y)
else:
    print("Column at index 16 does not exist in the DataFrame.")

In [22]:
svc.score(X,y)

0.6933701657458563

In [23]:
lr=LogisticRegression()

if len(df1.columns) > 16:  # Check if column at index 16 exists
    column_to_drop = df1.columns[16]
    X1 = df1.drop(column_to_drop, axis=1)
    y1 = df1[column_to_drop]
    lr.fit(X1, y1)
else:
    print("Column at index 16 does not exist in the DataFrame.")

In [24]:
lr.score(X1,y1)

0.8577348066298343

In [25]:
rfc=RandomForestClassifier()

In [26]:
if len(df1.columns) > 16:  # Check if column at index 16 exists
    column_to_drop = df1.columns[16]
    X2 = df1.drop(column_to_drop, axis=1)
    y2 = df1[column_to_drop]
    rfc.fit(X2, y2)
else:
    print("Column at index 16 does not exist in the DataFrame.")

In [27]:
rfc.score(X2,y2)

1.0

### StandardScaler + PCA

In [28]:
from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
x_scale=ss.fit_transform(X2)

In [29]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(x_scale,y2,test_size=0.2)
rfc.fit(X_train, y_train)
rfc.score(X_test,y_test)

0.8551724137931035

In [30]:
rfc.score(X_train, y_train)

1.0

In [33]:
from sklearn.decomposition import PCA
pca=PCA()
x_pca=pca.fit_transform(x_scale)

In [34]:
from sklearn.model_selection import train_test_split
X_train_pca,X_test_pca,y_train,y_test=train_test_split(x_pca,y2,test_size=0.2)
rfc.fit(X_train_pca, y_train)
rfc.score(X_test_pca,y_test)

0.8758620689655172

In [35]:
rfc.score(X_train_pca, y_train)

1.0

### MinMaxScaler + PCA

In [36]:
from sklearn.preprocessing import MinMaxScaler
mms=MinMaxScaler()
x_scale1=mms.fit_transform(X2)

In [37]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(x_scale1,y2,test_size=0.2)
rfc.fit(X_train, y_train)
rfc.score(X_test,y_test)

0.8413793103448276

In [40]:
pca1=PCA()
x_pca1=pca1.fit_transform(x_scale1)

In [41]:
X_train_pca1,X_test_pca1,y_train,y_test=train_test_split(x_pca1,y2,test_size=0.2)
rfc.fit(X_train_pca1, y_train)
rfc.score(X_test_pca1,y_test)

0.8206896551724138

In [42]:
rfc.score(X_train_pca1, y_train)

1.0