In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,accuracy_score, precision_score, recall_score, f1_score


In [2]:
# Load the diabetes dataset from a CSV file located in the working directory
diabetes_dataset = pd.read_csv('diabetes_2.csv')

In [3]:
# Preview the first rows of the dataset to check that it loaded as expected
diabetes_dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
1,0,84,82,31,125,38.2,0.233,23,0
2,0,145,0,0,0,44.2,0.63,31,1
3,0,135,68,42,250,42.3,0.365,24,1
4,1,139,62,41,480,40.7,0.536,21,0


In [4]:
# Number of rows and attributes (columns) in the dataset
diabetes_dataset.shape

(1980, 9)

In [5]:
# Count how many records are labelled as diabetic vs. not diabetic in the dataset
diabetes_dataset['Outcome'].value_counts()


0    1306
1     674
Name: Outcome, dtype: int64

In [6]:
# Separate the dependent variable (target) Y from the independent variables (predictors) X
Y = diabetes_dataset["Outcome"]
X = diabetes_dataset.drop(columns=["Outcome"])

In [7]:
# Display the predictor matrix X (every column except Outcome)
print(X)

      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0               2      138             62             35        0  33.6   
1               0       84             82             31      125  38.2   
2               0      145              0              0        0  44.2   
3               0      135             68             42      250  42.3   
4               1      139             62             41      480  40.7   
...           ...      ...            ...            ...      ...   ...   
1975            1      109             56             21      135  25.2   
1976            2       88             74             19       53  29.0   
1977            4      151             90             38        0  29.7   
1978            7      102             74             40      105  37.2   
1979            0      114             80             34      285  44.2   

      DiabetesPedigreeFunction  Age  
0                        0.127   47  
1                      

In [8]:
# Display the target vector Y (the Outcome column)
print(Y)

0       1
1       0
2       1
3       1
4       0
       ..
1975    0
1976    0
1977    0
1978    0
1979    0
Name: Outcome, Length: 1980, dtype: int64


In [9]:
# Split the data into training and test sets (20% held out for testing);
# random_state is fixed so the split is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [10]:
# Compare the shapes of the full, training and test feature sets
print(X.shape, X_train.shape, X_test.shape)

(1980, 8) (1584, 8) (396, 8)


In [11]:
# Standardize the feature scales: fit the scaler on the training data only,
# then apply that same fitted transform to the test data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:

# Build DataFrames from the normalized data, labelled with the original column names
df_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
df_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [13]:
# Show the first rows of the normalized training data
print("Data Training yang sudah dinormalisasi:")
print(df_train_scaled.head())

Data Training yang sudah dinormalisasi:
   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0    -0.509056  0.240284       0.767970      -1.282357 -0.711106 -0.503549   
1     0.427103 -1.344035      -1.071443      -1.282357 -0.711106  0.185463   
2    -1.133161 -0.598473       0.308117       0.136935 -0.711106 -3.888171   
3    -0.197003 -0.318888      -0.356115      -1.282357 -0.711106 -1.156298   
4     1.363261 -0.039302       0.870160      -1.282357 -0.711106 -0.455198   

   DiabetesPedigreeFunction       Age  
0                 -0.567118 -0.506684  
1                  0.541591 -0.678433  
2                  0.295877 -1.021933  
3                 -0.992623 -1.021933  
4                 -0.642031 -0.936058  


In [14]:
# Show the first rows of the normalized test data
print("Data Testing yang sudah dinormalisasi:")
print(df_test_scaled.head())

Data Testing yang sudah dinormalisasi:
   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0     2.611472  0.923715       0.052643       1.185977  1.659196  1.164585   
1     1.051208 -0.598473       0.257022       1.185977  0.207277  0.608540   
2    -0.509056  0.116023      -0.458305      -0.048190  0.513404  0.197551   
3     0.427103 -0.163562       0.257022       0.507185 -0.711106  0.016232   
4     1.363261 -0.940189       0.665781      -1.282357 -0.711106  0.415133   

   DiabetesPedigreeFunction       Age  
0                  0.805284  0.437940  
1                 -0.806839  1.039065  
2                 -1.154435 -0.163184  
3                  0.559570  0.180316  
4                  0.340825  3.014188  


In [15]:
# Initialize the base SVM classifier with an RBF kernel
svm_model = SVC(kernel='rbf')

In [16]:
# Hyperparameter grid to be searched by GridSearchCV: candidate values for the
# SVM regularization parameter C and the RBF kernel coefficient gamma
parameters = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
}


In [17]:
# Initialize GridSearchCV with the SVM model and the parameter grid defined
# earlier, evaluating each combination with 5 cross-validation folds
grid_search = GridSearchCV(svm_model, parameters, cv=5)

In [18]:

# Train the model through GridSearchCV on the scaled training data
grid_search.fit(X_train_scaled, Y_train)

In [19]:
# Print the best parameter combination found by GridSearchCV
print("Parameter terbaik:", grid_search.best_params_)

Parameter terbaik: {'C': 10, 'gamma': 1}


In [20]:
# Train the final SVM on the full training set with the best parameters found
# by GridSearchCV. The values are read from grid_search.best_params_ rather
# than hard-coded, so this cell stays correct if the data or the parameter
# grid changes and the search selects a different combination.
best_svm_model = SVC(kernel='rbf', **grid_search.best_params_)
best_svm_model.fit(X_train_scaled, Y_train)

In [21]:
# Run predictions on the (scaled) test data
Y_pred = best_svm_model.predict(X_test_scaled)

In [22]:
# Measure model performance on the test set: confusion matrix, accuracy,
# precision, recall and F1-score
conf_mat = confusion_matrix(Y_test, Y_pred)
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)


In [23]:
# Print the model performance metrics computed on the test set
print("Confussion Matrix:\n", conf_mat)
print("Akurasi:", accuracy)
print("Presisi:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Confussion Matrix:
 [[268   8]
 [  2 118]]
Akurasi: 0.9747474747474747
Presisi: 0.9365079365079365
Recall: 0.9833333333333333
F1-score: 0.9593495934959351


In [24]:
# Compute the accuracy on the test data
y_pred_test = best_svm_model.predict(X_test_scaled)
accuracy_test = accuracy_score(Y_test, y_pred_test)
print("Akurasi data testing:", accuracy_test)

Akurasi data testing: 0.9747474747474747


In [25]:
# Compute the accuracy on the training data (compare with the test accuracy
# to gauge how well the model generalizes)
y_pred_train = best_svm_model.predict(X_train_scaled)
accuracy_train = accuracy_score(Y_train, y_pred_train)
print("Akurasi data training:", accuracy_train)

Akurasi data training: 1.0


In [26]:
# Diabetes prediction function
def predict_diabetes(data):
    """Predict whether a single patient has diabetes.

    Parameters
    ----------
    data : sequence of numbers
        Feature values for one patient, given in the same column order as the
        training data the module-level ``scaler`` was fitted on.

    Returns
    -------
    str
        "Pasien tidak terkena diabetes" when the model predicts class 0,
        otherwise "Pasien terkena diabetes".

    Raises
    ------
    ValueError
        If the number of supplied values differs from the number of features
        the scaler was fitted on.
    """
    # Arrange the input as a single-row 2-D sample
    sample = np.asarray(data).reshape(1, -1)

    # Fail early with a clear message when the feature count is wrong
    expected_count = getattr(scaler, "n_features_in_", None)
    if expected_count is not None and sample.shape[1] != expected_count:
        raise ValueError(
            f"Expected {expected_count} feature values, got {sample.shape[1]}"
        )

    # The scaler was fitted on a DataFrame, so it records the column names.
    # Passing a bare array would make scikit-learn warn about missing feature
    # names; re-attach the fitted names so the input matches the fitted schema.
    fitted_names = getattr(scaler, "feature_names_in_", None)
    if fitted_names is not None:
        sample = pd.DataFrame(sample, columns=fitted_names)

    # Scale the input with the statistics learned from the training data
    data_scaled = scaler.transform(sample)

    # Run the prediction
    prediction = best_svm_model.predict(data_scaled)

    # Map the predicted class to the user-facing message
    if prediction[0] == 0:
        return "Pasien tidak terkena diabetes"
    return "Pasien terkena diabetes"

In [27]:
# Example: test the prediction with a single record.
# Values follow the training column order: Pregnancies, Glucose, BloodPressure,
# SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age
new_data = [2,100,64,23,0,29.7,0.368,21]
result = predict_diabetes(new_data)
# Print the prediction result
print('Hasil prediksi adalah:')
print(result)

Hasil prediksi adalah:
Pasien tidak terkena diabetes




In [28]:
import pickle

In [29]:
# Save the trained SVM model to disk. The context manager ensures the file
# handle is flushed and closed even if pickling raises.
filename = "diabetes_model.sav"
with open(filename, 'wb') as model_file:
    pickle.dump(best_svm_model, model_file)

In [30]:
# Save the fitted scaler object to scaler.pkl so the same scaling can be
# applied at inference time. The context manager ensures the file handle is
# flushed and closed even if pickling raises.
scaler_file = "scaler.pkl"
with open(scaler_file, 'wb') as scaler_out:
    pickle.dump(scaler, scaler_out)