Medical Diagnosis with Support Vector Machines


In [2]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
import pandas as pd
import numpy as np


Get Data


In [3]:
# Column labels applied to the CSV, in file order.
column_names = ["pregnancies", "glucose", "bpressure", "skinfold", "insulin", "bmi", "pedigree", "age", "class"]

# Relative path, resolved against the kernel's working directory, so the notebook
# is not tied to one machine's absolute directory layout.
DATA_PATH = "data.csv"

# Because `names` is supplied, every line of the file is read as a data row;
# `header=None` states that explicitly.
df = pd.read_csv(DATA_PATH, names=column_names, header=None)
print(df.shape)
df.head()

(768, 9)


Unnamed: 0,pregnancies,glucose,bpressure,skinfold,insulin,bmi,pedigree,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Extract Features

In [4]:
# Feature matrix: the first eight columns of the frame, selected by label.
X = df.loc[:, df.columns[:8]]
X.head()

Unnamed: 0,pregnancies,glucose,bpressure,skinfold,insulin,bmi,pedigree,age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


Extract Class Labels

In [5]:
# Target vector: the "class" column of the frame.
y = df.loc[:, "class"]
y.head()

0    1
1    0
2    1
3    0
4    1
Name: class, dtype: int64

Split Dataset

In [6]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Print the shape of each piece, one per line, in the order they were returned.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

X_test.head()

(576, 8)
(192, 8)
(576,)
(192,)


Unnamed: 0,pregnancies,glucose,bpressure,skinfold,insulin,bmi,pedigree,age
661,1,199,76,43,0,42.9,1.394,22
122,2,107,74,30,100,33.6,0.404,23
113,4,76,62,0,0,34.0,0.391,25
14,5,166,72,19,175,25.8,0.587,51
529,0,111,65,0,0,24.6,0.66,31


Normalize Features

In [7]:
# Fit the scaler on the training features, then replace X_train with its scaled version.
# NOTE: X_train is overwritten here, so running this cell a second time would fit the
# scaler on values that have already been scaled.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_train[:5]

array([[ 1.50755225, -1.01521454, -0.40451932, -1.31921491, -0.71823753,
        -1.22070104, -0.98325882, -0.04863985],
       [-0.82986389, -0.09964691, -0.61509602,  0.9287299 ,  0.08374747,
         0.13719053, -0.62493647, -0.88246592],
       [-1.12204091, -0.95207195,  0.54307587, -1.31921491, -0.71823753,
         0.0240329 ,  0.39884168, -0.5489355 ],
       [ 2.38408331,  0.59492164,  0.64836422,  1.36583027,  2.05458297,
         0.87900167,  0.17903049,  2.03592532],
       [ 1.50755225,  0.75277813,  0.54307587,  1.55315901,  0.39089067,
         0.71555175,  0.50724171,  0.53503839]])

Training a Support Vector Machine

In [8]:
# Fit a support vector classifier with a sigmoid kernel on the training data;
# the fitted estimator is the cell's displayed value.
clf = svm.SVC(kernel="sigmoid").fit(X_train, y_train)
clf


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='sigmoid',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Predictions and Accuracy on the Training Set

In [9]:
# Predict labels for the same rows the classifier was fitted on and score them.
# This is accuracy on the training set, not on held-out data.
y_pred = clf.predict(X_train)
print(y_pred)
train_accuracy = accuracy_score(y_train, y_pred)
print(train_accuracy)

[0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0
 0 1 1 0 0 1 0 0 0 0 1 1 1 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0
 1 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0
 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 1 1 1 1 0 0 0 0 0 0 0
 1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 1 1 1 0 0 1 1 0 0 0 0 1 1
 0 0 1 1 0 0 1 1 1 1 0 1 0 0 0 1 0 1 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 1 0 1
 0 0 0 1 1 0 1 0 1 1 0 0 0 1 0 0 0 1 0 1 1 0 1 0 0 1 0 1 1 0 0 1 0 0 0 1 1
 0 0 1 0 0 1 1 1 1 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 1 0 0 1 1 1 0 0 1 1 1
 0 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0
 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0
 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 0 0 0 1 1 1 0 1 0 1 0 1 0 0 0 1 1 1 0 1 0 1 0 0 0 1 1 0 0 1 0 1 1 0 0
 0 0 1 0 0 0 0 0 0 0 1 0 1 1 1 1 0 0 1 1 1 1 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0
 1 0 1 0 1 0 1 0 0 1 0 0 

SVM Kernels

In [10]:
# Fit one classifier per kernel and print the kernel name followed by its accuracy.
# The accuracy is measured on the training rows, i.e. the same data used for fitting.
for kernel in ["linear", "poly", "rbf", "sigmoid"]:
    clf = svm.SVC(kernel=kernel).fit(X_train, y_train)
    y_pred = clf.predict(X_train)
    print(kernel, accuracy_score(y_train, y_pred), sep="\n")

linear
0.7638888888888888
poly
0.7934027777777778
rbf
0.8246527777777778
sigmoid
0.6510416666666666


Instantiating the Best Model

In [12]:
# Fit a classifier with the RBF kernel on the training data;
# the fitted estimator is the cell's displayed value.
clf = svm.SVC(kernel="rbf").fit(X_train, y_train)
clf

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Making a Single Prediction


In [13]:
# Input feature order:
# "pregnancies", "glucose", "bpressure",
# "skinfold", "insulin", "bmi",
# "pedigree", "age"
# ("class" is the label being predicted, so it is not part of the input.)

# Build the patient as a one-row DataFrame with the feature names. The scaler was
# fitted on a DataFrame, and recent scikit-learn releases warn when such an
# estimator is later given an unnamed array.
patient = pd.DataFrame(
    [[1., 200., 75., 100., 0., 45., 1.5, 20.]],
    columns=column_names[:8],
)
patient = scaler.transform(patient)
pred = clf.predict(patient)

# predict() returns one label per input row; there is a single row here.
# The labels are 0 and 1, so anything other than 1 means "no diabetes".
if pred[0] == 1:
    print('Patient has diabetes')
else:
    print('Patient does not have diabetes')

Patient has diabetes


Testing Set Prediction


In [14]:
# Take test row 8 as a one-row DataFrame: indexing with a list ([[8]]) keeps the
# 2-D shape and the column names, so the scaler receives the same kind of input
# it was fitted on.
# This cell needs X_test to still be the unscaled DataFrame produced by the split.
patient = X_test.iloc[[8]]
patient = scaler.transform(patient)

# Predicted label (printed as a one-element array) followed by the true label.
print(clf.predict(patient))
print(y_test.iloc[8])

[1]
0


Accuracy on Testing Set

In [15]:
# Scale the held-out features with the scaler fitted on the training data.
# The result is stored under a new name so X_test keeps its raw values: re-running
# this cell can then never scale already-scaled data, and cells that index X_test
# with .iloc keep working afterwards.
X_test_scaled = scaler.transform(X_test)
y_pred = clf.predict(X_test_scaled)
print(accuracy_score(y_test, y_pred))

0.7760416666666666


Comparison to All-Zero Prediction

In [16]:
# Baseline: score a prediction vector that is zero for every test row.
y_zero = np.zeros(len(y_test))
print(accuracy_score(y_test, y_zero))

0.6770833333333334


Precision and Recall


In [17]:
# Text summary of precision, recall and F1 for y_pred against the true test labels.
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.80      0.90      0.84       130
           1       0.71      0.52      0.60        62

    accuracy                           0.78       192
   macro avg       0.75      0.71      0.72       192
weighted avg       0.77      0.78      0.77       192



Demo


In [18]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# End-to-end demo: load the data, split it, fit the scaler and classifier,
# classify one hand-written patient, then report accuracy on the test split.
column_names = ["pregnancies", "glucose", "bpressure", "skinfold", "insulin", "bmi", "pedigree", "age", "class"]
df = pd.read_csv("data.csv", names=column_names)
X = df.iloc[:,:8]
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)

# One-row DataFrame with the same column names the scaler was fitted on; recent
# scikit-learn releases warn when a scaler fitted on named columns is given an
# unnamed array.
patient = pd.DataFrame(
    [[1., 200., 75., 40., 0., 45., 1.5, 20.]],
    columns=X.columns,
)
patient = scaler.transform(patient)
pred = clf.predict(patient)

# predict() returns one label per input row; there is a single row here.
# The labels are 0 and 1, so anything other than 1 means "no diabetes".
if pred[0] == 1:
    print("Patient has diabetes")
else:
    print("Patient does not have diabetes")

# X_test is rebuilt by the split above on every run of this cell, so overwriting
# it here with the scaled values is safe to repeat.
X_test = scaler.transform(X_test)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

Patient has diabetes
0.7760416666666666


Predicting a Patient's Condition

In [19]:
# Classify one patient with the already-fitted scaler and classifier.
# The patient is a one-row DataFrame with the eight feature names, in the same
# order as the training columns; recent scikit-learn releases warn when a scaler
# fitted on named columns is given an unnamed array.
patient = pd.DataFrame(
    [[1., 200., 75., 40., 0., 45., 1.5, 20.]],
    columns=column_names[:8],
)
patient = scaler.transform(patient)
pred = clf.predict(patient)

# predict() returns one label per input row; there is a single row here.
# The labels are 0 and 1, so anything other than 1 means "no diabetes".
if pred[0] == 1:
    print("Patient has diabetes")
else:
    print("Patient does not have diabetes")

Patient has diabetes
