# Importing the required Dependencies

In [1]:
import pandas as pd
import numpy as np
# StandardScaler standardizes the features so they share a common scale
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataset from the CSV file and preview 15 randomly chosen rows
df = pd.read_csv('Data.csv')
df.sample(n=15)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
1063,1,181,78,42,293,40.0,1.258,22,1
701,6,125,78,31,0,27.6,0.565,49,1
464,10,115,98,0,0,24.0,1.022,34,0
1639,5,116,74,29,0,32.3,0.66,35,1
1433,2,100,70,52,57,40.5,0.677,25,0
1422,4,123,62,0,0,32.0,0.226,35,1
1797,2,93,64,32,160,38.0,0.674,23,1
65,5,99,74,27,0,29.0,0.203,32,0
1187,3,130,78,23,79,28.4,0.323,34,1
273,1,71,78,50,45,33.2,0.422,21,0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,3.7035,121.1825,69.1455,20.935,80.254,32.193,0.47093,33.0905,0.342
std,3.306063,32.068636,19.188315,16.103243,111.180534,8.149901,0.323553,11.786423,0.474498
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,63.5,0.0,0.0,27.375,0.244,24.0,0.0
50%,3.0,117.0,72.0,23.0,40.0,32.3,0.376,29.0,0.0
75%,6.0,141.0,80.0,32.0,130.0,36.8,0.624,40.0,1.0
max,17.0,199.0,122.0,110.0,744.0,80.6,2.42,81.0,1.0


In [4]:
# Column dtypes and non-null counts; a count below the number of rows would indicate nulls.
# NOTE(review): this does not flag the 0.0 minimums that describe() reports for Glucose,
# BloodPressure, SkinThickness, Insulin and BMI — possibly placeholders for missing
# measurements; confirm against the data source.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               2000 non-null   int64  
 1   Glucose                   2000 non-null   int64  
 2   BloodPressure             2000 non-null   int64  
 3   SkinThickness             2000 non-null   int64  
 4   Insulin                   2000 non-null   int64  
 5   BMI                       2000 non-null   float64
 6   DiabetesPedigreeFunction  2000 non-null   float64
 7   Age                       2000 non-null   int64  
 8   Outcome                   2000 non-null   int64  
dtypes: float64(2), int64(7)
memory usage: 140.8 KB


In [5]:
# Dataset dimensions as (number of rows, number of columns)
df.shape


(2000, 9)

In [6]:
# Split the frame into the target vector Y (the 'Outcome' column) and the
# feature matrix X (every remaining column), then print both for inspection
Y = df['Outcome']
X = df.drop(columns='Outcome')
print(X)
print(Y)

      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0               2      138             62             35        0  33.6   
1               0       84             82             31      125  38.2   
2               0      145              0              0        0  44.2   
3               0      135             68             42      250  42.3   
4               1      139             62             41      480  40.7   
...           ...      ...            ...            ...      ...   ...   
1995            2       75             64             24       55  29.7   
1996            8      179             72             42      130  32.7   
1997            6       85             78              0        0  31.2   
1998            0      129            110             46      130  67.1   
1999            2       81             72             15       76  30.1   

      DiabetesPedigreeFunction  Age  
0                        0.127   47  
1                      

In [7]:
# Distinct label values present in the target column
Y.unique()

array([1, 0])

In [8]:
# Class balance: number of rows per label (0 = negative, 1 = positive)
df['Outcome'].value_counts()

0    1316
1     684
Name: Outcome, dtype: int64

In [9]:
# Mean of every feature, computed separately for the rows labelled 0 and the rows labelled 1
df.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.168693,110.586626,68.094985,20.052432,70.56383,30.567477,0.434676,31.081307
1,4.732456,141.568713,71.166667,22.633041,98.897661,35.320468,0.540681,36.95614


# Standardization of the data

In [10]:
# Fit a StandardScaler on the feature matrix and produce the standardized values.
# `scaled_data` is the fitted scaler object (it is reused later to transform new
# input); `standardized_data` holds the transformed feature values.
# NOTE(review): the scaler is fitted on every row, before the train/test split that
# happens further down, so test-set statistics influence the scaling (data leakage).
# Consider splitting first and fitting the scaler on the training rows only.
scaled_data = StandardScaler()
scaled_data.fit(X)
standardized_data = scaled_data.transform(X)
print(standardized_data)

[[-0.5153943   0.52455322 -0.37248123 ...  0.17268332 -1.06324616
   1.18042417]
 [-1.12049474 -1.1597562   0.67008046 ...  0.73724853 -0.7355513
  -0.85632626]
 [-1.12049474  0.74288962 -3.60442246 ...  1.47363794  0.49175869
  -0.17740945]
 ...
 [ 0.69480658 -1.12856529  0.46156812 ... -0.12187245 -0.27492362
   0.75610116]
 [-1.12049474  0.24383498  2.12966682 ...  4.28419085 -0.46968566
  -0.60173245]
 [-0.5153943  -1.25332895  0.14879962 ... -0.25687717  0.23516743
  -0.68659705]]


In [11]:
# Rebind X to the standardized feature values; from this cell on, X no longer
# refers to the DataFrame that was created by dropping 'Outcome' from df
X = standardized_data

In [12]:
# Confirm that X now holds the standardized values
print(X)

[[-0.5153943   0.52455322 -0.37248123 ...  0.17268332 -1.06324616
   1.18042417]
 [-1.12049474 -1.1597562   0.67008046 ...  0.73724853 -0.7355513
  -0.85632626]
 [-1.12049474  0.74288962 -3.60442246 ...  1.47363794  0.49175869
  -0.17740945]
 ...
 [ 0.69480658 -1.12856529  0.46156812 ... -0.12187245 -0.27492362
   0.75610116]
 [-1.12049474  0.24383498  2.12966682 ...  4.28419085 -0.46968566
  -0.60173245]
 [-0.5153943  -1.25332895  0.14879962 ... -0.25687717  0.23516743
  -0.68659705]]


# Splitting the data into training and test sets

In [13]:
# Hold out 20% of the rows for testing. stratify=Y keeps the label proportions
# the same in both splits, and the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)
# Shapes of the full, test and training feature arrays, in that order
print(X.shape, X_test.shape, X_train.shape)

(2000, 8) (400, 8) (1600, 8)


In [14]:
# Support vector classifier with a linear kernel
svmclassifier = svm.SVC(kernel='linear')

# Train the classifier on the training split; fit() is the cell's last expression,
# so the fitted estimator is echoed as the cell output
svmclassifier.fit(X_train, Y_train)

SVC(kernel='linear')

# Evaluation

In [15]:
# Accuracy on the training split.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the true labels
# are passed first; the resulting value is the same, the call now matches the API.
X_train_prediction = svmclassifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('The accuracy score is: ', training_data_accuracy)

The accuracy score is:  0.775


In [16]:

# Accuracy on the held-out test split.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the true labels
# are passed first; the resulting value is the same, the call now matches the API.
X_test_prediction = svmclassifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [17]:
# Report the accuracy computed on the test split in the previous cell
print('The accuracy score is: ', test_data_accuracy)

The accuracy score is:  0.805


# Final: predicting from user input

In [18]:
# First two rows of the dataset, as a reference for the feature order and value ranges
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
1,0,84,82,31,125,38.2,0.233,23,0


In [None]:
def _read_number(prompt):
    """Prompt until the user types a finite number, then return it as a float.

    input() always returns a string; the scaler and classifier need numeric
    values, so the conversion (and the retry on bad input) happens here.
    """
    while True:
        try:
            value = float(input(prompt))
        except ValueError:
            value = float('nan')
        if np.isfinite(value):
            return value
        print("Please enter a numeric value.")


# Prompts listed in the same order as the feature columns of df
# (Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI,
# DiabetesPedigreeFunction, Age), which is the order the scaler was fitted on.
feature_prompts = [
    "Enter Number of Pregnancies: ",
    "Enter the value of Glucose: ",
    "Enter the value of Blood Pressure(BP): ",
    "Enter the value of Skin Thickness: ",
    "Enter the value of Insulin: ",
    "Enter the value of Body Mass Index(BMI): ",
    "Enter the value of Diabetes Pedigree Function: ",
    "Enter the value of Age: ",
]
# The scaler was fitted on a DataFrame, so the new row is given the same column
# names before it is transformed.
feature_names = df.columns.drop('Outcome')

# Keep asking only while the gender answer is not recognised; the loop ends after
# one prediction (whatever its result) or after the "women only" notice.
repeat = True
while repeat:
    reply = input("Type your gender(Type male or female.)").strip().upper()
    if reply == "FEMALE":
        input_data = [_read_number(prompt) for prompt in feature_prompts]

        # One row, eight columns: a single instance to predict
        input_frame = pd.DataFrame([input_data], columns=feature_names)

        # Standardize the input with the scaler fitted earlier in the notebook
        final_data = scaled_data.transform(input_frame)
        print(final_data)

        prediction = svmclassifier.predict(final_data)
        print(prediction)

        if prediction[0] == 0:
            print('Not diabetic')
        else:
            print('Diabetic')
        repeat = False
    elif reply == "MALE":
        print("This system is only to predict Diabetes for women!")
        repeat = False



