In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pickle

In [4]:
dataset=pd.read_csv('diabetes_prediction_dataset.csv.zip')

In [5]:
dataset.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [6]:
dataset.shape

(100000, 9)

In [7]:
dataset.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [8]:
dataset['diabetes'].value_counts()

diabetes
0    91500
1     8500
Name: count, dtype: int64

In [9]:
legit=dataset[dataset.diabetes==0]
fraud=dataset[dataset.diabetes==1]

In [10]:
legit_sample=legit.sample(n=8500)

In [11]:
new_dataset=pd.concat([legit_sample,fraud],axis=0) # row-wise
print(new_dataset.shape)

(17000, 9)


In [12]:
le = LabelEncoder()
# Fit the encoder to the 'gender' column and transform it
new_dataset['gender'] = le.fit_transform(new_dataset['gender'])
# 0-Female
# 1-Male


In [13]:
new_dataset['smoking_history'] = le.fit_transform(new_dataset['smoking_history'])
# 0-No info
# 1-current
# 2-ever
# 3-former
# 4-never
# 5-not currently

In [14]:
new_dataset.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [15]:
# Now, recreate X and Y after transforming 'gender'
X = new_dataset.drop(columns='diabetes', axis=1)
Y = new_dataset['diabetes']
print(X)
print(Y)

       gender   age  hypertension  heart_disease  smoking_history    bmi  \
72412       1  36.0             1              0                1  23.54   
68456       0  38.0             0              0                3  27.32   
42917       1  47.0             0              0                4  28.06   
8308        1  34.0             0              0                3  27.32   
41573       0  27.0             0              0                4  22.88   
...       ...   ...           ...            ...              ...    ...   
99935       0  65.0             1              1                4  33.55   
99938       1  55.0             0              1                3  30.42   
99957       0  61.0             0              0                0  34.45   
99962       0  58.0             1              0                4  38.31   
99979       0  61.0             0              0                1  30.11   

       HbA1c_level  blood_glucose_level  
72412          5.7                  200  
684

In [16]:
# Proceed with scaling
scaler = StandardScaler()
scaler.fit(X.values)
std_data = scaler.transform(X.values)

In [17]:
print(std_data)

[[ 1.13274248 -0.68277682  2.35553311 ... -0.8034757  -0.36865443
   0.64181605]
 [-0.88197003 -0.58939287 -0.42453235 ... -0.29436044 -1.69732937
  -0.32461561]
 [ 1.13274248 -0.16916512 -0.42453235 ... -0.19469238 -0.13418238
   0.64181605]
 ...
 [-0.88197003  0.48452248 -0.42453235 ...  0.66595484  0.25660437
   2.04753484]
 [-0.88197003  0.34444657  2.35553311 ...  1.18584502  0.64739112
   0.64181605]
 [-0.88197003  0.48452248 -0.42453235 ...  0.0814151   0.02213232
   1.34467545]]


In [18]:
X_train,X_test,Y_train,Y_test=train_test_split(std_data,Y,test_size=0.2,stratify=Y,random_state=2)

In [19]:
print(  X_train.shape, X_test.shape, X.shape)
print(  Y_train.shape, Y_test.shape, Y.shape)

(13600, 8) (3400, 8) (17000, 8)
(13600,) (3400,) (17000,)


In [20]:
classifier=svm.SVC(kernel='linear') #defining d model

In [21]:
classifier.fit(X_train,Y_train) #model is trained

In [22]:
X_train_prediction=classifier.predict(X_train) #prediction by trained model
training_data_accuracy=accuracy_score(X_train_prediction,Y_train) #accuracy score
X_test_prediction=classifier.predict(X_test)
test_data_accuracy=accuracy_score(X_test_prediction,Y_test)

In [23]:
print(training_data_accuracy)
print(test_data_accuracy)

0.8871323529411764
0.8820588235294118


In [24]:
ip=(0,23.0,0,0,4,27.02,5.6,159)
ip=np.asarray(ip)
ip_reshaped=ip.reshape(1,-1) #for only 1 data instance
ip_std= scaler.transform(ip_reshaped) #std input data lyk dataset data
print(ip_std)
prediction=classifier.predict(ip_std)
print(prediction)
if(prediction[0]==0):
  print('Non Diabetic')
else:
  print('Diabetic')

[[-0.88197003 -1.28977245 -0.42453235 -0.31063037  0.85894081 -0.33476641
  -0.44681178 -0.07861482]]
[0]
Non Diabetic


In [25]:
#File handling
with open('model2.pkl','wb') as files: 
    pickle.dump(classifier,files)

In [26]:
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)