In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import mlflow
import dvc

from sklearn.preprocessing import  (StandardScaler,
                                    MinMaxScaler,
                                    OneHotEncoder,
                                    LabelEncoder,
                                    )
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import (Pipeline,
                              make_pipeline
                              )
from sklearn.metrics import   (accuracy_score,
                               precision_score,
                               r2_score,
                               confusion_matrix)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [14]:
df = pd.read_csv(r'C:/Users/Sande/Desktop/project_2/notebook/heart.csv')
df.head(4)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1


In [15]:
df.tail()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1
917,38,M,NAP,138,175,0,Normal,173,N,0.0,Up,0


In [16]:
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [17]:
df.duplicated().sum()

np.int64(0)

In [18]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,918.0,53.510893,9.432617,28.0,47.0,54.0,60.0,77.0
RestingBP,918.0,132.396514,18.514154,0.0,120.0,130.0,140.0,200.0
Cholesterol,918.0,198.799564,109.384145,0.0,173.25,223.0,267.0,603.0
FastingBS,918.0,0.233115,0.423046,0.0,0.0,0.0,0.0,1.0
MaxHR,918.0,136.809368,25.460334,60.0,120.0,138.0,156.0,202.0
Oldpeak,918.0,0.887364,1.06657,-2.6,0.0,0.6,1.5,6.2
HeartDisease,918.0,0.553377,0.497414,0.0,0.0,1.0,1.0,1.0


In [19]:
df['Sex'].value_counts()

Sex
M    725
F    193
Name: count, dtype: int64

In [20]:
df['ChestPainType'].value_counts()

ChestPainType
ASY    496
NAP    203
ATA    173
TA      46
Name: count, dtype: int64

In [21]:
df['ChestPainType'] = df['ChestPainType'].replace({
    "ASY": 4,
    "NAP":3,
    "ATA":2,
    "TA":1
    })
df['ChestPainType'] = df['ChestPainType'].astype(int)


  df['ChestPainType'] = df['ChestPainType'].replace({


In [22]:
df['RestingECG'].value_counts()

RestingECG
Normal    552
LVH       188
ST        178
Name: count, dtype: int64

In [23]:
df['RestingECG'] = df['RestingECG'].replace({
    "Normal": 1,
    "ST": 2,
    "LVH":3
    })
df['RestingECG'] = df['RestingECG'].astype(int)

  df['RestingECG'] = df['RestingECG'].replace({


In [24]:
df['ExerciseAngina'].value_counts()

ExerciseAngina
N    547
Y    371
Name: count, dtype: int64

In [25]:
df['ST_Slope'].value_counts()

ST_Slope
Flat    460
Up      395
Down     63
Name: count, dtype: int64

In [26]:
df['ST_Slope'] = df['ST_Slope'].replace({
    "Flat": 1,
    "Up": 2,
    "Down": 3
    })
df['ST_Slope'] = df['ST_Slope'].astype(int)

  df['ST_Slope'] = df['ST_Slope'].replace({


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    int64  
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    int64  
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    int64  
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(9), object(2)
memory usage: 86.2+ KB


In [30]:
process = ColumnTransformer(transformers=[
    ('one', OneHotEncoder(), ['ExerciseAngina', 'Sex']),
    ('std', StandardScaler(), ['MaxHR', 'Age', 'RestingBP', 'Cholesterol'])
],
    remainder='passthrough'
)

In [31]:
Pipeline = Pipeline(steps=[
    ('proprocess', process),
    ('classi', RandomForestClassifier())
])

In [32]:
train_data, test_data = train_test_split(df, random_state=42, test_size=0.21)

In [34]:
X_train = train_data.drop(columns=['HeartDisease'])
y_train = train_data['HeartDisease']

X_test = test_data.drop(columns=['HeartDisease'])
y_test = test_data['HeartDisease']

In [35]:
Pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [36]:
y_pred = Pipeline.predict(X_test)

In [37]:
acc = accuracy_score(y_test, y_pred)
print(acc)

0.9015544041450777
