In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Read data

In [13]:
data = pd.read_csv("diabetes.csv")

In [14]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [15]:
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [16]:
y = data['Outcome']

In [17]:
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [18]:
X = data.drop("Outcome", axis=1)   # all columns except 'Outcome'

In [20]:
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# help(train_test_split)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify = y)

In [24]:
X_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
657,1,120,80,48,200,38.9,1.162,41
125,1,88,30,42,99,55.0,0.496,26
184,4,141,74,0,0,27.6,0.244,40
210,2,81,60,22,0,27.7,0.290,25
147,2,106,64,35,119,30.5,1.400,34
...,...,...,...,...,...,...,...,...
113,4,76,62,0,0,34.0,0.391,25
556,1,97,70,40,0,38.1,0.218,30
586,8,143,66,0,0,34.9,0.129,41
648,11,136,84,35,130,28.3,0.260,42


In [25]:
y_test.value_counts()

Outcome
0    165
1     89
Name: count, dtype: int64

In [26]:
from sklearn.linear_model import LogisticRegression

In [27]:
logistic_reg = LogisticRegression(max_iter=2000)

In [28]:
model = logistic_reg.fit(X_train, y_train)

In [29]:
# help(logistic_reg)

In [30]:
y_predicted = model.predict(X_test)

In [31]:
y_predicted #predicted value

array([0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [32]:
y_test #acutal value to be compared with prediction

373    0
164    1
415    1
477    0
75     0
      ..
71     0
672    0
124    1
763    0
437    0
Name: Outcome, Length: 254, dtype: int64

In [33]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [34]:
accuracy = accuracy_score(y_predicted, y_test)

In [35]:
accuracy

0.7440944881889764

In [36]:
report = classification_report(y_predicted, y_test)

In [42]:
print(report)

              precision    recall  f1-score   support

           0       0.85      0.77      0.81       182
           1       0.54      0.67      0.60        72

    accuracy                           0.74       254
   macro avg       0.70      0.72      0.70       254
weighted avg       0.77      0.74      0.75       254



In [43]:
confusion_mat = confusion_matrix(y_predicted, y_test)

In [44]:
confusion_mat

array([[141,  41],
       [ 24,  48]], dtype=int64)