In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn import svm
from sklearn.metrics import f1_score
from matplotlib import pyplot as plt

In [2]:
df = pd.read_csv('./diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


### With PCA

In [4]:
features = df.copy()
features.drop(labels=['Outcome'],axis=1,inplace=True)

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)


In [6]:
from sklearn.decomposition import PCA

pca = PCA(n_components=6)

In [7]:
pca.fit(scaled_features)

PCA(n_components=6)

In [8]:
features_pca = pca.fit_transform(scaled_features)
features_pca

array([[ 1.06850273,  1.23489499,  0.09592984,  0.4969902 , -0.10998491,
         0.35718251],
       [-1.12168331, -0.73385167, -0.71293816,  0.28505622, -0.38950719,
        -0.40632934],
       [-0.39647671,  1.59587594,  1.76067844, -0.07039464,  0.90647385,
        -0.04001752],
       ...,
       [-0.28347525,  0.09706503, -0.07719194, -0.68756106, -0.52300926,
        -0.53826993],
       [-1.06032431,  0.83706234,  0.42503045, -0.20449292,  0.95759303,
         0.15330712],
       [-0.83989172, -1.15175485, -1.00917817,  0.0869288 , -0.08265082,
        -0.15009639]])

In [9]:
features_pca = pd.DataFrame(features_pca)
features_pca

Unnamed: 0,0,1,2,3,4,5
0,1.068503,1.234895,0.095930,0.496990,-0.109985,0.357183
1,-1.121683,-0.733852,-0.712938,0.285056,-0.389507,-0.406329
2,-0.396477,1.595876,1.760678,-0.070395,0.906474,-0.040018
3,-1.115781,-1.271241,-0.663729,-0.579123,-0.356060,-0.412520
4,2.359334,-2.184819,2.963107,4.033099,0.592684,1.078341
...,...,...,...,...,...,...
763,1.562085,1.923150,-0.867408,-0.390926,-2.541527,-0.077322
764,-0.100405,-0.614181,-0.764353,-0.134859,0.499290,0.529339
765,-0.283475,0.097065,-0.077192,-0.687561,-0.523009,-0.538270
766,-1.060324,0.837062,0.425030,-0.204493,0.957593,0.153307


In [10]:
target = df[['Outcome']]
target

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1
...,...
763,0
764,0
765,0
766,1


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features_pca, target, test_size=0.30, random_state=42)

In [12]:
model = clf = svm.SVC()
model.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


SVC()

In [13]:
model.score(X_train,y_train)

0.8249534450651769

In [14]:
accuracy_score(model.predict(X_test),y_test)

0.7445887445887446

In [15]:
f1_score(model.predict(X_test),y_test)

0.609271523178808

### Without PCA

In [16]:
features

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [17]:
target

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1
...,...
763,0
764,0
765,0
766,1


In [18]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.30, random_state=42)

In [19]:
model_1 = clf = svm.SVC()
model_1.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


SVC()

In [20]:
model_1.score(X_train,y_train)

0.7802607076350093

In [21]:
accuracy_score(model_1.predict(X_test),y_test)

0.7359307359307359

In [22]:
f1_score(model_1.predict(X_test),y_test)

0.5611510791366906

#### Results: Training accuracy , test accuracy and f1 score improves while using PCA