In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [4]:
# Read the diabetes CSV into a DataFrame; the location is named first so it is easy to spot and change.
dataset_path = '/content/diabetes_prediction.csv'
diabetes_dataset = pd.read_csv(dataset_path)

In [5]:
# Preview the first five rows of the loaded frame.
diabetes_dataset.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,0,25.19,6.6,140,0
1,0,54.0,0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,0,27.32,5.7,158,0
3,0,36.0,0,0,1,23.45,5.0,155,0
4,1,76.0,1,1,1,20.14,4.8,155,0


In [6]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
diabetes_dataset.shape

(100000, 9)

In [7]:
# Class balance of the target: how many rows carry each value of the 'diabetes' label.
diabetes_labels = diabetes_dataset['diabetes']
diabetes_labels.value_counts()

0    91500
1     8500
Name: diabetes, dtype: int64

In [8]:
# Per-class column averages: one row per value of 'diabetes', one column per remaining feature.
grouped_by_outcome = diabetes_dataset.groupby('diabetes')
grouped_by_outcome.mean()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
diabetes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.408645,40.115187,0.058984,0.029235,0.175956,26.887163,5.396761,132.85247
1,0.475176,60.946588,0.245647,0.149059,0.298588,31.988382,6.934953,194.094706


In [9]:
# Separate the target from the predictors.
# Y is the 'diabetes' label column; X is every other column of the frame.
Y = diabetes_dataset['diabetes']
X = diabetes_dataset.drop(columns='diabetes')

In [10]:
# Dump the feature frame as plain text; pandas elides the middle rows of a long frame.
# NOTE(review): ending the cell with a bare `X.head()` would give a shorter, rich-rendered preview.
print(X)

       gender   age  hypertension  heart_disease  smoking_history    bmi  \
0           0  80.0             0              1                0  25.19   
1           0  54.0             0              0                0  27.32   
2           1  28.0             0              0                0  27.32   
3           0  36.0             0              0                1  23.45   
4           1  76.0             1              1                1  20.14   
...       ...   ...           ...            ...              ...    ...   
99995       0  80.0             0              0                0  27.32   
99996       0   2.0             0              0                0  17.37   
99997       1  66.0             0              0                1  27.83   
99998       0  24.0             0              0                0  35.42   
99999       0  57.0             0              0                1  22.43   

       HbA1c_level  blood_glucose_level  
0              6.6                  140  
1  

In [11]:
# Dump the label Series (the 'diabetes' column) as plain text.
print(Y)

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: diabetes, Length: 100000, dtype: int64


In [12]:
# Hold out 20% of the rows for evaluation.
# stratify=Y asks for the same class proportions in both parts, and the fixed
# random_state makes the shuffle repeatable from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [13]:
# Sanity-check the split: shapes of the full, training and test feature frames, in that order.
print(*(frame.shape for frame in (X, X_train, X_test)))

(100000, 8) (80000, 8) (20000, 8)


In [14]:
# Support-vector classifier with a linear kernel; every other SVC setting is left at the library default.
# NOTE(review): the visible cells feed the features in unscaled; SVMs are usually
# sensitive to feature scale, so standardising first may be worth trying — TODO confirm.
classifier = svm.SVC(kernel='linear')

In [15]:
# Fit the classifier on the training split; fit() returns the estimator, which the cell then displays.
classifier.fit(X=X_train, y=Y_train)

In [16]:
# Predict on the very rows the model was fitted on, then score against the true labels.
# accuracy_score is declared as (y_true, y_pred), so the ground truth goes first.
# Accuracy itself is symmetric, so the number is unchanged, but the correct order
# matters as soon as this call is swapped for an asymmetric metric.
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [17]:
# Report the accuracy measured on the training split.
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9597125


In [18]:
# Predict on the held-out rows, then score against the true labels.
# accuracy_score is declared as (y_true, y_pred), so the ground truth goes first.
# Accuracy itself is symmetric, so the number is unchanged, but the correct order
# matters as soon as this call is swapped for an asymmetric metric.
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [19]:
# Report the accuracy measured on the held-out test split.
# NOTE(review): the value_counts output earlier shows 91500 of 100000 rows in class 0,
# so always predicting 0 would already score about 0.915; consider also reporting
# precision/recall for class 1.
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9601


In [20]:
# One person's feature values, in the column order the model was trained on:
# gender, age, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level
input_data = (1,23.0, 0,0,0,22.9,5.4, 108)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# The classifier was fitted on a DataFrame, so give it a one-row DataFrame with the
# same column labels. Passing the bare array makes recent scikit-learn versions warn
# that X has no valid feature names, and it loses the check that the columns line up.
input_data_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)

prediction = classifier.predict(input_data_frame)
print(prediction)

# predict() returns one label per input row; there is exactly one row here.
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

[0]
The person is not diabetic




In [21]:
import pickle

In [22]:
# Persist the fitted classifier to disk with pickle.
filename = 'new_diabetes_model.sav'
# The context manager closes (and therefore flushes) the file even if dump() raises;
# a bare open() passed straight to dump() leaves the handle to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [24]:
# Read the pickled classifier back from disk.
# NOTE: unpickling can execute arbitrary code, so only load files you created or trust.
# The context manager guarantees the file handle is closed once the object is loaded.
with open('new_diabetes_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [25]:
# List the feature names in training order, one per line.
for feature_name in X.columns.tolist():
    print(feature_name)

gender
age
hypertension
heart_disease
smoking_history
bmi
HbA1c_level
blood_glucose_level
