In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
import pandas as pd
import category_encoders as ce

In [30]:
df = pd.read_csv('ML datasets/heart_classify.csv')

In [31]:
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [6]:
df['ST_Slope'].unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [32]:
df['ST_Slope'][df['ST_Slope']=='Up'] = 1
df['ST_Slope'][df['ST_Slope']=='Flat'] = 0
df['ST_Slope'][df['ST_Slope']=='Down'] = -1
df['ST_Slope'] = df['ST_Slope'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ST_Slope'][df['ST_Slope']=='Up'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ST_Slope'][df['ST_Slope']=='Flat'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ST_Slope'][df['ST_Slope']=='Down'] = -1


In [33]:
df['ExerciseAngina'].unique()

array(['N', 'Y'], dtype=object)

In [34]:
df['ExerciseAngina'][df['ExerciseAngina']=='Y'] = 1
df['ExerciseAngina'][df['ExerciseAngina']=='N'] = 0
df['ExerciseAngina'] = df['ExerciseAngina'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ExerciseAngina'][df['ExerciseAngina']=='Y'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ExerciseAngina'][df['ExerciseAngina']=='N'] = 0


In [35]:
df.nunique()

Age                50
Sex                 2
ChestPainType       4
RestingBP          67
Cholesterol       222
FastingBS           2
RestingECG          3
MaxHR             119
ExerciseAngina      2
Oldpeak            53
ST_Slope            3
HeartDisease        2
dtype: int64

In [36]:
df.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina      int32
Oldpeak           float64
ST_Slope            int32
HeartDisease        int64
dtype: object

In [37]:
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [38]:
df.loc[df['Sex']=='M','Sex'] = 1
df.loc[df['Sex']=='F','Sex'] = 0
df.Sex = df.Sex.astype('int')

In [39]:
df['Sex']

0      1
1      0
2      1
3      0
4      1
      ..
913    1
914    1
915    1
916    0
917    1
Name: Sex, Length: 918, dtype: int32

In [43]:
df.drop(columns=['ChestPainType'], inplace=True)

##### removing chest pain column to keep the data scientific

In [45]:
df = pd.get_dummies(df)

In [48]:
df.dtypes

Age                    int64
Sex                    int32
RestingBP              int64
Cholesterol            int64
FastingBS              int64
MaxHR                  int64
ExerciseAngina         int32
Oldpeak              float64
ST_Slope               int32
HeartDisease           int64
RestingECG_LVH         uint8
RestingECG_Normal      uint8
RestingECG_ST          uint8
dtype: object

In [49]:
df['HeartDisease'].value_counts()

1    508
0    410
Name: HeartDisease, dtype: int64

In [50]:
x = df.drop(columns=['HeartDisease'])
y = df['HeartDisease']

In [51]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,shuffle=True)

In [52]:
x_test

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,RestingECG_LVH,RestingECG_Normal,RestingECG_ST
406,57,1,140,0,1,100,1,0.0,0,0,1,0
835,54,1,110,239,0,126,1,2.8,0,0,1,0
796,56,0,134,409,0,150,1,1.9,0,1,0,0
83,52,1,160,196,0,165,0,0.0,1,0,1,0
594,58,1,160,256,1,113,1,1.0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
583,69,1,142,271,0,126,0,0.3,1,1,0,0
786,69,1,140,254,0,146,0,2.0,0,1,0,0
736,54,1,122,286,0,116,1,3.2,0,1,0,0
447,77,1,124,171,0,110,1,2.0,1,0,0,1


In [54]:
scale = StandardScaler()
scale_data = scale.fit_transform(x_train)

In [56]:
dec_tree = DecisionTreeClassifier()
dec_tree.fit(x_train,y_train)

In [59]:
y_predictions_dt = dec_tree.predict(x_test)

In [60]:
metrices = {'Accuracy': [accuracy_score(y_test, y_predictions_dt)]
            ,'Precision':[precision_score(y_test, y_predictions_dt)]
            ,'Recall':[recall_score(y_test, y_predictions_dt)]
            ,'F1_score':[f1_score(y_test, y_predictions_dt)]}
dt_scores = pd.DataFrame(data=metrices)
dt_scores

Unnamed: 0,Accuracy,Precision,Recall,F1_score
0,0.791304,0.818182,0.792,0.804878
