### Import Depedencies

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


### Data Collection dan Processing

In [3]:
# load data
data = pd.read_csv('Dataset/HeartDisease.csv')
data.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholestoral,fasting_blood_sugar,rest_ecg,Max_heart_rate,exercise_induced_angina,oldpeak,slope,vessels_colored_by_flourosopy,thalassemia,target
0,52,Male,Typical angina,125,212,Lower than 120 mg/ml,ST-T wave abnormality,168,No,1.0,Downsloping,Two,Reversable Defect,0
1,53,Male,Typical angina,140,203,Greater than 120 mg/ml,Normal,155,Yes,3.1,Upsloping,Zero,Reversable Defect,0
2,70,Male,Typical angina,145,174,Lower than 120 mg/ml,ST-T wave abnormality,125,Yes,2.6,Upsloping,Zero,Reversable Defect,0
3,61,Male,Typical angina,148,203,Lower than 120 mg/ml,ST-T wave abnormality,161,No,0.0,Downsloping,One,Reversable Defect,0
4,62,Female,Typical angina,138,294,Greater than 120 mg/ml,ST-T wave abnormality,106,No,1.9,Flat,Three,Fixed Defect,0


In [4]:
data.tail()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholestoral,fasting_blood_sugar,rest_ecg,Max_heart_rate,exercise_induced_angina,oldpeak,slope,vessels_colored_by_flourosopy,thalassemia,target
1020,59,Male,Atypical angina,140,221,Lower than 120 mg/ml,ST-T wave abnormality,164,Yes,0.0,Downsloping,Zero,Fixed Defect,1
1021,60,Male,Typical angina,125,258,Lower than 120 mg/ml,Normal,141,Yes,2.8,Flat,One,Reversable Defect,0
1022,47,Male,Typical angina,110,275,Lower than 120 mg/ml,Normal,118,Yes,1.0,Flat,One,Fixed Defect,0
1023,50,Female,Typical angina,110,254,Lower than 120 mg/ml,Normal,159,No,0.0,Downsloping,Zero,Fixed Defect,1
1024,54,Male,Typical angina,120,188,Lower than 120 mg/ml,ST-T wave abnormality,113,No,1.4,Flat,One,Reversable Defect,0


In [5]:
# cek data kategoric

data['slope'].value_counts()

Flat           482
Downsloping    469
Upsloping       74
Name: slope, dtype: int64

In [6]:
# mengubah data yang sebelumnya kategorical menjadi numerik
data['sex'] = data['sex'].replace({"Male": 1, "Female": 0})
data['chest_pain_type'] = data['chest_pain_type'].replace({"Typical angina": 0, "Non-anginal pain": 1, "Atypical angina": 2, "Asymptomatic": 3 })
data['fasting_blood_sugar'] = data['fasting_blood_sugar'].replace({"Lower than 120 mg/ml": 1, "Greater than 120 mg/ml": 0})
data['rest_ecg'] = data['rest_ecg'].replace({"Left ventricular hypertrophy": 2,"ST-T wave abnormality": 1, "Normal": 0})
data['exercise_induced_angina'] = data['exercise_induced_angina'].replace({"Yes": 1, "No": 0})
data['vessels_colored_by_flourosopy'] = data['vessels_colored_by_flourosopy'].replace({"Zero": 0, "One": 1, "Two": 2,"Three": 3,"Four": 4})
data['thalassemia'] = data['thalassemia'].replace({"No": 0, "Normal": 1, "Reversable Defect": 2,"Fixed Defect": 3})
data['slope'] = data['slope'].replace({"Upsloping": 2,"Flat": 1, "Downsloping": 0})


In [7]:
data.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholestoral,fasting_blood_sugar,rest_ecg,Max_heart_rate,exercise_induced_angina,oldpeak,slope,vessels_colored_by_flourosopy,thalassemia,target
0,52,1,0,125,212,1,1,168,0,1.0,0,2,2,0
1,53,1,0,140,203,0,0,155,1,3.1,2,0,2,0
2,70,1,0,145,174,1,1,125,1,2.6,2,0,2,0
3,61,1,0,148,203,1,1,161,0,0.0,0,1,2,0
4,62,0,0,138,294,0,1,106,0,1.9,1,3,3,0


In [8]:
data.shape

(1025, 14)

In [9]:
# getting some info about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            1025 non-null   int64  
 1   sex                            1025 non-null   int64  
 2   chest_pain_type                1025 non-null   int64  
 3   resting_blood_pressure         1025 non-null   int64  
 4   cholestoral                    1025 non-null   int64  
 5   fasting_blood_sugar            1025 non-null   int64  
 6   rest_ecg                       1025 non-null   int64  
 7   Max_heart_rate                 1025 non-null   int64  
 8   exercise_induced_angina        1025 non-null   int64  
 9   oldpeak                        1025 non-null   float64
 10  slope                          1025 non-null   int64  
 11  vessels_colored_by_flourosopy  1025 non-null   int64  
 12  thalassemia                    1025 non-null   i

In [10]:
# checking for missing values
data.isnull().sum()

age                              0
sex                              0
chest_pain_type                  0
resting_blood_pressure           0
cholestoral                      0
fasting_blood_sugar              0
rest_ecg                         0
Max_heart_rate                   0
exercise_induced_angina          0
oldpeak                          0
slope                            0
vessels_colored_by_flourosopy    0
thalassemia                      0
target                           0
dtype: int64

In [11]:
# statistical measure about the data
data.describe()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholestoral,fasting_blood_sugar,rest_ecg,Max_heart_rate,exercise_induced_angina,oldpeak,slope,vessels_colored_by_flourosopy,thalassemia,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.828293,131.611707,246.0,0.850732,0.529756,149.114146,0.336585,1.071512,0.614634,0.754146,2.454634,0.513171
std,9.07229,0.460373,0.959013,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.643583,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,1.0,0.0,132.0,0.0,0.0,0.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,1.0,1.0,152.0,0.0,0.8,1.0,0.0,3.0,1.0
75%,61.0,1.0,1.0,140.0,275.0,1.0,1.0,166.0,1.0,1.8,1.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [12]:
# checking the distribution of target variable
data['target'].value_counts()

1    526
0    499
Name: target, dtype: int64

1--> Defective Heart<br>
0--> Healthy Heart

### Splitting Data

In [13]:
x = data.drop(columns='target', axis=1)
y = data['target']

In [14]:
print(x)

      age  sex  chest_pain_type  resting_blood_pressure  cholestoral  \
0      52    1                0                     125          212   
1      53    1                0                     140          203   
2      70    1                0                     145          174   
3      61    1                0                     148          203   
4      62    0                0                     138          294   
...   ...  ...              ...                     ...          ...   
1020   59    1                2                     140          221   
1021   60    1                0                     125          258   
1022   47    1                0                     110          275   
1023   50    0                0                     110          254   
1024   54    1                0                     120          188   

      fasting_blood_sugar  rest_ecg  Max_heart_rate  exercise_induced_angina  \
0                       1         1             168    

In [15]:
print(y)

0       0
1       0
2       0
3       0
4       0
       ..
1020    1
1021    0
1022    0
1023    1
1024    0
Name: target, Length: 1025, dtype: int64


In [16]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.2,stratify=y, random_state=2)

In [17]:
print(x.shape, x_train.shape, x_test.shape )
print(y.shape, y_train.shape, y_test.shape )

(1025, 13) (820, 13) (205, 13)
(1025,) (820,) (205,)


### Training Model
### LogisticRegression

In [18]:
model = LogisticRegression()
model.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Model Evaluation

In [19]:
# accuracy train data
x_train_pred = model.predict(x_train)
train_data_acc = accuracy_score(x_train_pred, y_train)

In [20]:
print('Accuracy : ', train_data_acc)

Accuracy :  0.8378048780487805


In [21]:
# accuracy test data
x_test_pred = model.predict(x_test)
test_data_acc = accuracy_score(x_test_pred, y_test)

In [22]:
print('Accuracy : ', test_data_acc)

Accuracy :  0.8195121951219512


### Predict

In [23]:
data.to_csv('data.csv', index=False)

In [24]:
# input_data = (58,0,0,100,248,1,0,122,0,1.0,1,0,3) #1
input_data = (62,0,0,138,294,0,1,106,0,1.9,1,3,3) #0

# change the input data to a numpy array
input_data_as_numpy = np.array(input_data)

# reshape numpy array as we are predicting for only on instance
input_data_reshape = input_data_as_numpy.reshape(1,-1)

prediksi = model.predict(input_data_reshape)
print(prediksi)

if(prediksi[0] == 0):
    print('Pasien tidak indikasi penyakit jantung')
else:
    print('Pasien terindikasi penyakit jantung')

[0]
Pasien tidak indikasi penyakit jantung




In [25]:
from jcopml.utils import save_model, load_model

In [27]:
save_model(model, "jantung_model.pkl")

Model is pickled as model/jantung_model.pkl


In [29]:
load_model_jantung = load_model('model/jantung_model.pkl')

In [31]:
input_data = (58,0,0,100,248,1,0,122,0,1.0,1,0,3) #1
# input_data = (62,0,0,138,294,0,1,106,0,1.9,1,3,3) #0

# change the input data to a numpy array
input_data_as_numpy = np.array(input_data)

# reshape numpy array as we are predicting for only on instance
input_data_reshape = input_data_as_numpy.reshape(1,-1)

prediksi = load_model_jantung.predict(input_data_reshape)
print(prediksi)

if(prediksi[0] == 0):
    print('Pasien tidak indikasi penyakit jantung')
else:
    print('Pasien terindikasi penyakit jantung')

[1]
Pasien terindikasi penyakit jantung


