In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [3]:
# Load the diabetes dataset into a pandas DataFrame.
# The Windows path is written as a raw string so its backslashes are taken
# literally; without the r-prefix, '\M' and '\d' are invalid escape sequences
# that only work by accident and that newer Python versions warn about.
# NOTE(review): this is an absolute path on one machine - point it at your own
# copy of the CSV (ideally relative to the notebook) before running elsewhere.
diabetes_dataset = pd.read_csv(r'E:\Multiple disease prediction system\dataset-20230622T140539Z-001\diabetes_latest4.csv')

In [4]:
# printing the first 5 rows of the dataset
# (a notebook cell only auto-renders its LAST expression, so a bare head()
# placed before info() is silently discarded - print it explicitly)
print(diabetes_dataset.head())

# column names, non-null counts and dtypes; info() prints its own report
diabetes_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  int64  
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   bmi                  100000 non-null  float64
 5   HbA1c_level          100000 non-null  float64
 6   blood_glucose_level  100000 non-null  int64  
 7   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 6.1 MB


In [5]:
# number of rows and columns in this dataset, as a (rows, columns) tuple
diabetes_dataset.shape

(100000, 8)

In [6]:
# summary statistics of every column: count, mean, std, min, quartiles and max
diabetes_dataset.describe()

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,0.41466,41.885916,0.07486,0.03943,27.324748,5.529545,138.07883,0.085
std,0.493031,22.516853,0.263167,0.194617,6.643409,1.073411,40.729252,0.278883
min,0.0,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,0.0,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,0.0,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,1.0,60.0,0.0,0.0,29.5825,6.2,159.0,0.0
max,2.0,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [7]:
# number of rows per label in the 'diabetes' target column
# NOTE(review): the recorded output shows 91500 rows with label 0 and 8500
# with label 1, so the classes are imbalanced - keep this in mind when
# reading the accuracy scores further down.
diabetes_dataset['diabetes'].value_counts()

0    91500
1     8500
Name: diabetes, dtype: int64

In [8]:
# mean of every feature column, computed separately for each 'diabetes' label
diabetes_dataset.groupby('diabetes').mean()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level
diabetes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.409038,40.115187,0.058984,0.029235,26.887163,5.396761,132.85247
1,0.475176,60.947294,0.245765,0.149176,32.035214,6.958929,194.339059


In [9]:
# split the frame into the label vector (Y) and the feature matrix (X):
# Y is the 'diabetes' column, X is every remaining column
Y = diabetes_dataset['diabetes']
X = diabetes_dataset.drop(columns='diabetes')

In [10]:
# show the feature matrix X (every column except the 'diabetes' label)
print(X)

       gender   age  hypertension  heart_disease    bmi  HbA1c_level  \
0           0  50.0             0              0  19.31          6.5   
1           1  67.0             0              1  27.32          6.5   
2           1  50.0             1              0  27.32          5.7   
3           1  73.0             0              0  25.91          9.0   
4           0  53.0             0              0  27.32          9.0   
...       ...   ...           ...            ...    ...          ...   
99995       0  80.0             0              0  27.32          6.2   
99996       0   2.0             0              0  17.37          6.5   
99997       1  66.0             0              0  27.83          5.7   
99998       0  24.0             0              0  35.42          4.0   
99999       0  57.0             0              0  22.43          6.6   

       blood_glucose_level  
0                      270  
1                      280  
2                      260  
3                  

In [11]:
# show the label vector Y (the 'diabetes' column)
print(Y)

0        1
1        1
2        1
3        1
4        1
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: diabetes, Length: 100000, dtype: int64


In [12]:
# hold out 20% of the rows as a test set; stratify on Y so both splits keep
# the same label proportions, and fix the seed so the split is repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [13]:
# shapes of the full feature matrix, the training split and the test split
print(*(features.shape for features in (X, X_train, X_test)))

(100000, 7) (80000, 7) (20000, 7)


Training the model

In [14]:
# support vector classifier with a linear kernel; every other SVC setting is
# left at its library default
# NOTE(review): nothing in this notebook scales the features before they reach
# the model - confirm whether standardizing them first is wanted.
classifier = svm.SVC(kernel='linear')

In [15]:
# train the support vector machine classifier on the training split
classifier.fit(X_train, Y_train)

Model evaluation

Accuracy Score

In [16]:
# accuracy score on the training data
# accuracy_score expects (y_true, y_pred), so the true labels go first and the
# model's predictions second.
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)


In [17]:
# report the fraction of training rows the classifier labelled correctly
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9616625


In [18]:
# accuracy score on the test data
# accuracy_score expects (y_true, y_pred), so the true labels go first and the
# model's predictions second.
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [19]:
# report the fraction of held-out test rows the classifier labelled correctly
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9617


Making a Predictive System

In [20]:
# feature values for one person, in the same order as the columns of X:
# (gender, age, hypertension, heart_disease, bmi, HbA1c_level, blood_glucose_level)
input_data = (1,50,1,0,27.32,5.7,260)

# wrap the single sample in a one-row DataFrame that reuses the training
# column names: the classifier was fitted on a DataFrame, and handing it a
# bare numpy array makes newer scikit-learn versions warn that the input
# "does not have valid feature names"
input_frame = pd.DataFrame([input_data], columns=X.columns)

# predict() returns an array with one label per row; we passed a single row
prediction = classifier.predict(input_frame)
print(prediction)
if prediction[0] == 0:
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[0]
The person is not diabetic




In [21]:
# feature values for one person, in the same order as the columns of X:
# (gender, age, hypertension, heart_disease, bmi, HbA1c_level, blood_glucose_level)
input_data = (0,43,0,0,47.32,7.6,130)

# wrap the single sample in a one-row DataFrame that reuses the training
# column names: the classifier was fitted on a DataFrame, and handing it a
# bare numpy array makes newer scikit-learn versions warn that the input
# "does not have valid feature names"
input_frame = pd.DataFrame([input_data], columns=X.columns)

# predict() returns an array with one label per row; we passed a single row
prediction = classifier.predict(input_frame)
print(prediction)
if prediction[0] == 0:
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[1]
The person is diabetic




In [22]:
# feature values for one person, in the same order as the columns of X:
# (gender, age, hypertension, heart_disease, bmi, HbA1c_level, blood_glucose_level)
input_data = (0,59,0,0,45,7,140)

# wrap the single sample in a one-row DataFrame that reuses the training
# column names: the classifier was fitted on a DataFrame, and handing it a
# bare numpy array makes newer scikit-learn versions warn that the input
# "does not have valid feature names"
input_frame = pd.DataFrame([input_data], columns=X.columns)

# predict() returns an array with one label per row; we passed a single row
prediction = classifier.predict(input_frame)
print(prediction)
if prediction[0] == 0:
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[1]
The person is diabetic


