## library yang akan digunakan

In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


## load dataset

In [2]:
jantung_data = pd.read_csv('framingham.csv')

In [4]:
jantung_data.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [5]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

In [6]:
numeric_features = jantung_data.drop(columns='TenYearCHD')

In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])

In [8]:
columns_to_drop = ['education', 'cigsPerDay', 'BPMeds', 'sysBP', 'diaBP', 'BMI']
data_baru = jantung_data.drop(columns=columns_to_drop)

print("\nDataframe setelah menghapus kolom:\n", data_baru)


Dataframe setelah menghapus kolom:
       male  age  currentSmoker  prevalentStroke  prevalentHyp  diabetes  \
0        1   39              0                0             0         0   
1        0   46              0                0             0         0   
2        1   48              1                0             0         0   
3        0   61              1                0             1         0   
4        0   46              1                0             0         0   
...    ...  ...            ...              ...           ...       ...   
4233     1   50              1                0             1         0   
4234     1   51              1                0             0         0   
4235     0   48              1                0             0         0   
4236     0   44              1                0             0         0   
4237     0   52              0                0             0         0   

      totChol  heartRate  glucose  TenYearCHD  
0       195.0 

##mengatasi nilai NaN

## memisahkan data atribut dengan label

In [9]:
X = data_baru.drop(columns='TenYearCHD', axis=1)
Y = data_baru["TenYearCHD"]

In [10]:
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

In [11]:
print(X)

[[  1.          39.           0.         ... 195.          80.
   77.        ]
 [  0.          46.           0.         ... 250.          95.
   76.        ]
 [  1.          48.           1.         ... 245.          75.
   70.        ]
 ...
 [  0.          48.           1.         ... 248.          84.
   86.        ]
 [  0.          44.           1.         ... 210.          86.
   81.96675325]
 [  0.          52.           0.         ... 269.          80.
  107.        ]]


In [12]:
print (Y)

0       0
1       0
2       0
3       1
4       0
       ..
4233    1
4234    0
4235    0
4236    0
4237    0
Name: TenYearCHD, Length: 4238, dtype: int64


## model data training dan data testing

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, stratify=Y, random_state=2)

In [14]:
print(X.shape, X_train.shape, X_test.shape)

(4238, 9) (3814, 9) (424, 9)


## membuat model training 

In [15]:
model = LogisticRegression()

In [16]:
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## evaluasi model

In [17]:
X_train_prediction = model.predict(X_train)
training_data_accurary = accuracy_score(X_train_prediction, Y_train)

In [18]:
print ('akurasi data training :', training_data_accurary)

akurasi data training : 0.8458311484006292


In [19]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [20]:
print('akurasi data testing :', test_data_accuracy)

akurasi data testing : 0.8325471698113207


## model prediksi (tes prediksi)

In [21]:
input_data =(1, 39, 0, 0, 0, 0, 195, 80, 77)

input_data_as_numpy_array = np.asarray(input_data)

input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)

print(prediction)
if prediction[0]==0:
    print('pasien tidak terkena penyakit jantung')
else:
    print('pasien terkena penyakit jantung')

[0]
pasien tidak terkena penyakit jantung


## simpan model

In [22]:
import pickle

In [23]:
filename = 'penyakit_jantung.sav'
pickle.dump(model, open(filename, 'wb'))