Memuat Library

In [None]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler  # Import StandardScaler for data scaling
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt

Mencetak Sample

In [21]:
# Baca data
Data = pd.read_csv("diabetes.csv")

# Hapus baris dengan nilai yang hilang
Data = Data.dropna()

# Buat salinan data
M_Data = Data.copy()  # Menggunakan `.copy()` untuk membuat salinan

# Ekstrak variabel outcome
Outcome = M_Data['Outcome']

# Hapus variabel outcome dari data
M_Data.drop('Outcome', axis=1, inplace=True)

# Pisahkan data menjadi sampel positif dan negatif
Positives = Data[Data['Outcome'] == 1]
Negatives = Data[Data['Outcome'] == 0]

# Cetak sampel data
Data.sample(frac=0.1).head(n=5)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
483,0,84,82,31,125,38.2,0.233,23,0
113,4,76,62,0,0,34.0,0.391,25,0
210,2,81,60,22,0,27.7,0.29,25,0
212,7,179,95,31,0,34.2,0.164,60,0
240,1,91,64,24,0,29.2,0.192,21,0


In [20]:
# Deskripsikan data
Data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
# Buat gambar dengan tiga subplot
fig, ax = plt.subplots(1, 3)

# Plot distribusi fitur Pregnancies untuk sampel positif
sns.distplot(Positives['Pregnancies'], rug=True, kde=False, color='r', fit=stats.gamma, ax=ax[0])

# Plot distribusi fitur BloodPressure untuk sampel positif
sns.distplot(Positives['BloodPressure'], rug=True, kde=False, color='r', fit=stats.gamma, ax=ax[1])

# Plot distribusi fitur Age untuk sampel positif
sns.distplot(Positives['Age'], rug=True, kde=False, color='r', fit=stats.gamma, ax=ax[2])

# Tampilkan gambar
plt.show()

In [None]:
# Buat gambar dengan tiga subplot
fig, ax = plt.subplots(1, 3)

# Plot distribusi fitur Pregnancies untuk sampel negatif
sns.distplot(Negatives['Pregnancies'], rug=True, kde=False, color='g', fit=stats.gamma, ax=ax[0])

# Plot distribusi fitur BloodPressure untuk sampel negatif
sns.distplot(Negatives['BloodPressure'], rug=True, kde=False, color='g', fit=stats.gamma, ax=ax[1])

# Plot distribusi fitur Age untuk sampel negatif
sns.distplot(Negatives['Age'], rug=True, kde=False, color='g', fit=stats.gamma, ax=ax[2])

# Tampilkan gambar
plt.show()

In [None]:
# Hitung matriks korelasi data
Corr = Data.corr()

# Buat heatmap dari matriks korelasi
sns.heatmap(Corr, annot=True)

In [None]:
# Hitung jumlah sampel positif dan negatif
print(Data['Outcome'].value_counts())

# Hitung persentase sampel positif
percent_positive = (Data['Outcome'].sum() / len(Data)) * 100
print("Persentase sampel positif: {:.2f}%".format(percent_positive))

0    500
1    268
Name: Outcome, dtype: int64
Persentase sampel positif: 34.90%


In [None]:
# Buat klasifikasi Support Vector Machine linier
clf = SVC(kernel='linear')

# Cetak akurasi klasifikasi menggunakan cross-validation 5-fold
scores = cross_val_score(clf, M_Data, Outcome, cv=5)
print("Akurasi: {:.2f}%".format(scores.mean() * 100))


Akurasi: 76.57%


In [16]:
# Standarisasi data dengan StandardScaler
scaler = StandardScaler()
M_Data_scaled = scaler.fit_transform(M_Data)

# Latih model SVM dengan data yang telah diskalakan
clf.fit(M_Data_scaled, Outcome)

# Data yang akan diuji (sesuaikan dengan data yang ingin Anda prediksi)
pregnancies = float(input("Jumlah kehamilan: "))
glucose = float(input("Kadar glukosa: "))
blood_pressure = float(input("Tekanan darah: "))
skin_thickness = float(input("Ketebalan kulit: "))
insulin = float(input("Kadar insulin: "))
bmi = float(input("Indeks massa tubuh (BMI): "))
dpf = float(input("Fungsi Silsilah Diabetes: "))
age = float(input("Usia: "))

# Buat DataFrame dari input data
input_data = {
    'Pregnancies': [pregnancies],
    'Glucose': [glucose],
    'BloodPressure': [blood_pressure],
    'SkinThickness': [skin_thickness],
    'Insulin': [insulin],
    'BMI': [bmi],
    'DiabetesPedigreeFunction': [dpf],
    'Age': [age]
}

# Buat DataFrame dari input data
input_df = pd.DataFrame(input_data)

# Standarisasi data dengan StandardScaler
std_data = scaler.transform(input_df)

# Prediksi dengan model SVM
prediction = clf.predict(std_data)

if prediction[0] == 0:
    print('Pasien tidak terkena diabetes')
else:
    print('Pasien terkena diabetes')

Jumlah kehamilan: 5
Kadar glukosa: 158
Tekanan darah: 70
Ketebalan kulit: 0
Kadar insulin: 0
Indeks massa tubuh (BMI): 298
Fungsi Silsilah Diabetes: 207
Usia: 63
Pasien terkena diabetes
