In [102]:
# Importing the Dependencies
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import *
import warnings
warnings.filterwarnings('ignore')

# svm --> Support Vector Machine 

The datasets consist of several medical predictor (independent) variables and one target (dependent)
variable, Outcome. Independent variables include the number of pregnancies the patient has had,
their BMI, insulin level, age, and so on.

In [103]:
# Importing  PIMA Indians Diabetes Dataset
diabetes=pd.read_csv(r'C:\Users\Pooja gupta\Downloads\diabetes.csv')
diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [104]:
# Number of rows and columns
diabetes.shape

(768, 9)

In [105]:
# Fetching information about Data set
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [106]:
# Checking for Null Values
diabetes.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [107]:
# Checking for Duplicates Vales
diabetes.duplicated().sum()

0

In [108]:
# Dataset col names
col=diabetes.columns
col

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [109]:
diabetes.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [110]:
diabetes['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

0 --> Non Diabetes
1 --> Diabetes

In [111]:
diabetes.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [112]:
# Data Splitting into Dependent col or Target col and Independent col
x=diabetes.drop(columns='Outcome',axis=1)
y=diabetes['Outcome']

In [113]:
x

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [114]:
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

Data Standardization

In [115]:
scaler=StandardScaler()
Standard_data=scaler.fit_transform(x)
Standard_data

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

In [116]:
x=Standard_data
y=diabetes['Outcome']

In [117]:
x

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

In [118]:
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [119]:
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.8,random_state=2)

In [120]:
x_train

array([[-0.84488505,  0.09719231,  0.25303625, ..., -0.53211885,
        -1.12311057, -0.27575966],
       [-0.84488505, -0.81042491,  0.66661825, ...,  0.38169971,
        -0.72143478,  0.83038113],
       [-1.14185152, -0.84172205,  0.04624525, ...,  1.4605133 ,
        -0.37714125, -1.04154944],
       ...,
       [ 0.04601433,  0.12848945,  0.04624525, ..., -0.39250768,
         2.02989333,  1.00055664],
       [-0.25095213, -0.15318486,  0.25303625, ..., -0.72249772,
        -1.10196973, -0.78628618],
       [ 0.04601433, -0.34096773, -0.16054575, ..., -0.01174995,
        -0.00264654, -0.36084741]])

In [121]:
y_train

602    0
429    1
623    0
209    1
589    0
      ..
534    0
584    1
493    1
527    0
168    0
Name: Outcome, Length: 614, dtype: int64

In [122]:
x_test

array([[-0.54791859, -1.02950492,  0.25303625, ..., -0.37981576,
        -0.73351526, -0.95646168],
       [-0.54791859,  0.25367803,  0.77001375, ..., -0.506735  ,
        -0.56740873, -0.53102292],
       [-1.14185152, -0.59134489,  0.45982725, ...,  0.31824009,
        -0.70633419, -0.78628618],
       ...,
       [-1.14185152,  0.34756947,  0.45982725, ...,  0.05170968,
        -0.23821579, -1.04154944],
       [ 0.3429808 ,  0.5040552 ,  2.01075975, ...,  2.1331853 ,
        -0.73955549,  0.31985461],
       [-0.84488505, -1.4989621 , -0.98770975, ..., -1.14133123,
        -0.676133  , -1.04154944]])

In [123]:
y_test

158    0
251    0
631    0
757    1
689    1
      ..
733    0
441    0
627    0
84     1
55     0
Name: Outcome, Length: 154, dtype: int64

In [124]:
print(x.shape,x_train.shape,x_test.shape)

(768, 8) (614, 8) (154, 8)


In [125]:
print(y.shape,y_train.shape,y_test.shape)

(768,) (614,) (154,)


Training Model

In [126]:
svm_model=svm.SVC(kernel='linear')
svm_model.fit(x_train,y_train)

SVC(kernel='linear')

Evaluation

In [127]:
# Accuracy Score on the training dataset
train_pred=svm_model.predict(x_train)
training_accuracy=accuracy_score(train_pred,y_train)

In [128]:
training_accuracy*100

77.19869706840392

In [129]:
# Accuracy Score on test dataset
test_pred=svm_model.predict(x_test)
test_accuracy=accuracy_score(test_pred,y_test)

In [130]:
test_accuracy*100

76.62337662337663

In [131]:
# Dataset is very less therefore we are getting less accuracy.

Making a Predictive System

In [132]:
input_data=(2,197,70,45,543,30.5,0.158,53)

# Changing the input_data to numpy array
input_data_as_numpy_array=np.array(input_data)

# Reshape the array as we are predicting for one instance
input_data_reshape=input_data_as_numpy_array.reshape(1,-1)

# Standardize the input data
std_data=scaler.transform(input_data_reshape)
print(std_data)

prediction=svm_model.predict(std_data)
print(prediction)

if (prediction[0]==0):
    print("Person is not Diabestes")
else:
    print("Person is Diabetes")

[[-0.54791859  2.38188392  0.04624525  1.53455054  4.02192191 -0.18943689
  -0.94794368  1.68125866]]
[1]
Person is Diabetes


Theory of svm:
Support Vector Machine or SVM is one of the most popular Supervised Learning algorithms, which is
used for Classification as well as Regression problems. However, primarily, it is used for 
Classification problems in Machine Learning.

The goal of the SVM algorithm is to create the best line or decision boundary that can segregate
n-dimensional space into classes so that we can easily put the new data point in the correct category
in the future. This best decision boundary is called a hyperplane.

SVM can be of two types:
Linear SVM: Linear SVM is used for linearly separable data, which means if a dataset can be classified
into two classes by using a single straight line, then such data is termed as linearly separable 
data, and classifier is used called as Linear SVM classifier.

Non-linear SVM: Non-Linear SVM is used for non-linearly separated data, which means if a dataset 
cannot be classified by using a straight line, then such data is termed as non-linear data and 
classifier used is called as Non-linear SVM classifier.