In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Build the relative path to the dataset, then load it into a DataFrame.
data_path = os.path.join("..", "Resources", "diabetes.csv")
df = pd.read_csv(data_path)

# Preview the first few rows.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Count the missing values in each column (all zeros means nothing is missing).
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:
# How many rows fall into each Outcome class (0 = no diabetes, 1 = diabetes)?
outcome_counts = df.groupby("Outcome").size()
outcome_counts

Outcome
0    500
1    268
dtype: int64

In [5]:
# The target vector y is the "Outcome" column (all rows).
y = df.loc[:, "Outcome"]

In [6]:
# The feature matrix X is every column except the "Outcome" target.
X = df.drop(columns=["Outcome"])

# Preview the features.
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [7]:
# Split our data into training and testing data.
# train_test_split is already imported in the first cell, so it is not
# re-imported here. With no test_size given, the default split is used
# (192 of the 768 rows end up in the test set); random_state=42 makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [8]:
# Build the logistic regression model and train it on the training split.
# fit() returns the fitted estimator, so creation and training can be chained.
# max_iter=10000 gives the solver a generous iteration cap.
classifier = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Validate: mean accuracy on the data the model saw vs. the held-out data.
train_score = classifier.score(X_train, y_train)
test_score = classifier.score(X_test, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.78125
Testing Data Score: 0.7239583333333334


In [9]:
# Predict outcomes for the held-out features and compare them with the
# actual outcomes in a confusion matrix.
y_pred = classifier.predict(X_test)
y_true = y_test
cm = confusion_matrix(y_true=y_true, y_pred=y_pred)

# Display the raw matrix.
cm

array([[95, 28],
       [25, 44]])

In [10]:
# Wrap the confusion matrix in a labelled DataFrame:
# rows are the actual classes, columns are the predicted classes.
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(data=cm, index=actual_labels, columns=predicted_labels)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,95,28
Actual 1,25,44


We can see from the confusion matrix that the logistic regression model was better at identifying individuals without diabetes than individuals with diabetes. Out of 123 individuals without diabetes, 95 were predicted to not have diabetes (recall is 77%), whereas out of 69 individuals with diabetes, only 44 were predicted to have diabetes (recall is 64%).

Let's calculate the precision, sensitivity and F1 score.

In [11]:
# Unpack the 2x2 confusion matrix row by row:
# row 0 (actual 0) holds tn, fp; row 1 (actual 1) holds fn, tp.
(tn, fp), (fn, tp) = cm

In [12]:
# Show the four confusion-matrix counts as (tn, fp, fn, tp).
(tn, fp, fn, tp)

(95, 28, 25, 44)

In [13]:
# Precision: of everyone predicted to have diabetes, what fraction actually does?
predicted_positives = tp + fp
precision = tp / predicted_positives
precision

0.6111111111111112

In [14]:
# Sensitivity (recall): of everyone who actually has diabetes, what fraction was caught?
actual_positives = tp + fn
sensitivity = tp / actual_positives
sensitivity

0.6376811594202898

In [15]:
# F1 score: the harmonic mean of precision and sensitivity.
f1_numerator = 2*precision*sensitivity
f1_denominator = precision + sensitivity
f1 = f1_numerator / f1_denominator
f1

0.624113475177305

In [16]:
# Per-class precision, recall and F1, to cross-check the manual calculations.
report = classification_report(y_true, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.79      0.77      0.78       123
           1       0.61      0.64      0.62        69

    accuracy                           0.72       192
   macro avg       0.70      0.71      0.70       192
weighted avg       0.73      0.72      0.73       192



Although the number of data points is small, logistic regression achieved decent results in predicting whether someone will have diabetes in 5 years. It was, however, better at predicting that someone will not have diabetes in 5 years than at predicting that someone will.

The overall accuracy of the model in predicting diabetes in 5 years is 72%. However, with a precision of 0.61, a recall (sensitivity) of 0.64, and an F1 score of 0.62 for the diabetic class, the logistic regression model is missing a lot of individuals with diabetes: 25 of the 69 individuals with diabetes in the test set were predicted not to have it. On the other hand, the logistic regression model was a bit better at predicting individuals without diabetes (precision 0.79, recall 0.77).