In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [None]:
# Read the diabetes CSV into a pandas DataFrame.
# NOTE(review): the absolute /content/ path looks like a Colab session path — confirm the
# file is uploaded there before running.
dataset_path = '/content/diabetes_latest3.csv'
diabetes_dataset = pd.read_csv(dataset_path)

In [None]:
# Preview the top five records to check the column names and value formats
diabetes_dataset.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,25.19,6.6,140,0
1,0,54.0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,27.32,5.7,158,0
3,0,36.0,0,0,23.45,5.0,155,0
4,1,76.0,1,1,20.14,4.8,155,0


In [None]:
# (rows, columns) of the loaded dataset; the column count includes the 'diabetes' label
diabetes_dataset.shape

(100000, 8)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column
diabetes_dataset.describe()

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,0.41466,41.885916,0.07486,0.03943,27.324074,5.52775,138.06348,0.085
std,0.493031,22.516853,0.263167,0.194617,6.642411,1.071017,40.715847,0.278883
min,0.0,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,0.0,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,0.0,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,1.0,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,2.0,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [None]:
# Count the rows in each label class to check class balance.
# The recorded output shows 91,500 zeros against 8,500 ones, i.e. a heavily imbalanced label.
diabetes_dataset['diabetes'].value_counts()

0    91500
1     8500
Name: diabetes, dtype: int64

In [None]:
# Per-class mean of every feature, to compare the diabetes == 0 and diabetes == 1 groups
diabetes_dataset.groupby('diabetes').mean()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level
diabetes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.409038,40.115187,0.058984,0.029235,26.887163,5.396761,132.85247
1,0.475176,60.947294,0.245765,0.149176,32.027286,6.937812,194.158471


In [None]:
# Split the frame into the feature matrix X and the label vector Y.
# `columns=` already identifies the axis, so the redundant `axis=1` is not passed.
X = diabetes_dataset.drop(columns='diabetes')
Y = diabetes_dataset['diabetes']

In [None]:
# Print the feature matrix to confirm the 'diabetes' label column was removed
print(X)

       gender   age  hypertension  heart_disease    bmi  HbA1c_level  \
0           0  80.0             0              1  25.19          6.6   
1           0  54.0             0              0  27.32          6.6   
2           1  28.0             0              0  27.32          5.7   
3           0  36.0             0              0  23.45          5.0   
4           1  76.0             1              1  20.14          4.8   
...       ...   ...           ...            ...    ...          ...   
99995       0  80.0             0              0  27.32          6.2   
99996       0   2.0             0              0  17.37          6.5   
99997       1  66.0             0              0  27.83          5.7   
99998       0  24.0             0              0  35.42          4.0   
99999       0  57.0             0              0  22.43          6.6   

       blood_glucose_level  
0                      140  
1                       80  
2                      158  
3                  

In [None]:
# Print the label Series that is used as the prediction target
print(Y)

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: diabetes, Length: 100000, dtype: int64


In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class ratio the same
# in both splits, and the fixed random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Sanity check: shapes of the full, training and test feature matrices, in that order
print(*(features.shape for features in (X, X_train, X_test)))

(100000, 7) (80000, 7) (20000, 7)


Training the model

In [None]:
# Support-vector classifier with a linear kernel; every other SVC setting stays at its default.
# NOTE(review): the features reach the model unscaled and the training split has 80,000 rows —
# consider standardizing the features and/or LinearSVC if fit time is a problem; confirm first.
classifier = svm.SVC(kernel='linear')

In [None]:
# Train the support vector machine classifier on the training split.
# As the last expression in the cell, the value returned by fit() is also displayed.
classifier.fit(X_train, Y_train)

Model evaluation

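Accuracy Score

Note: 91,500 of the 100,000 rows are non-diabetic, so a model that always predicts "not diabetic" would already reach 0.915 accuracy; read the scores below against that baseline.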

In [None]:
# Accuracy score on the training data.
# accuracy_score expects (y_true, y_pred), so the ground-truth labels are passed first.
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)


In [None]:
# Report the fraction of training rows the classifier labels correctly
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9599625


In [None]:
# Accuracy score on the held-out test data.
# accuracy_score expects (y_true, y_pred), so the ground-truth labels are passed first.
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the fraction of held-out test rows the classifier labels correctly
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.96075


Making a Predictive System

In [None]:
# Feature values for one person, in training-column order:
# gender, age, hypertension, heart_disease, bmi, HbA1c_level, blood_glucose_level
input_data = (1,50,1,0,27.32,5.7,260)

# Wrap the sample in a one-row DataFrame that carries the training column names.
# The classifier was fitted on a DataFrame, so passing a bare NumPy array makes
# scikit-learn warn that the input has no valid feature names.
input_frame = pd.DataFrame([input_data], columns=X.columns)

prediction = classifier.predict(input_frame)
print(prediction)
# Label 0 means non-diabetic; any other label is reported as diabetic
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

[1]
The person is diabetic




In [None]:
# Feature values for one person, in training-column order:
# gender, age, hypertension, heart_disease, bmi, HbA1c_level, blood_glucose_level
input_data = (0,43,0,0,47.32,7.6,130)

# Wrap the sample in a one-row DataFrame that carries the training column names.
# The classifier was fitted on a DataFrame, so passing a bare NumPy array makes
# scikit-learn warn that the input has no valid feature names.
input_frame = pd.DataFrame([input_data], columns=X.columns)

prediction = classifier.predict(input_frame)
print(prediction)
# Label 0 means non-diabetic; any other label is reported as diabetic
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

[1]
The person is diabetic




In [None]:
# Feature values for one person, in training-column order:
# gender, age, hypertension, heart_disease, bmi, HbA1c_level, blood_glucose_level
input_data = (1,52,1,0,27.32,8.8,200)

# Wrap the sample in a one-row DataFrame that carries the training column names.
# The classifier was fitted on a DataFrame, so passing a bare NumPy array makes
# scikit-learn warn that the input has no valid feature names.
input_frame = pd.DataFrame([input_data], columns=X.columns)

prediction = classifier.predict(input_frame)
print(prediction)
# Label 0 means non-diabetic; any other label is reported as diabetic
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

[1]
The person is diabetic


