In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [2]:
# Read the diabetes dataset from the local CSV file.
df = pd.read_csv('datasets/diabetes.csv')

# Columns used as model inputs; every other column is ignored.
featured_cols = ['Pregnancies', 'Insulin', 'BMI', 'Age']

# X holds the selected predictor columns, y the `Outcome` label column.
X = df[featured_cols]
y = df['Outcome']

In [34]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
# NOTE(review): `x_train` is lower-case while `X_test` is upper-case; the name
# is kept because the model-fitting cell refers to it.
x_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [4]:
# Build a logistic-regression classifier with default settings and fit it on
# the training split (`fit` returns the fitted estimator itself).
logReg = LogisticRegression().fit(x_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = logReg.predict(X_test)



# Confusion Matrix - Classifier evaluation

In [8]:
# Confusion matrix of true vs. predicted labels:
# rows are the true classes, columns are the predicted classes.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
matrix

array([[118,  12],
       [ 47,  15]])

In [6]:
# Count how many test rows carry each true label (class balance of the test split).
y_test.value_counts()

0    130
1     62
Name: Outcome, dtype: int64

In [12]:
# Flatten the 2x2 confusion matrix in row-major order and unpack its cells:
#   row 0 (actual negatives) -> true negatives, false positives
#   row 1 (actual positives) -> false negatives, true positives
TN, FP, FN, TP = matrix.ravel()

# Total number of evaluated test rows.
total = TN + FP + FN + TP

## Accuracy

In [17]:
# Accuracy: share of all test rows that were classified correctly.
accuracy = (TN + TP) / total
print(accuracy)

# Cross-check the hand-computed value against scikit-learn.
metrics.accuracy_score(y_test, y_pred)

0.6927083333333334


0.6927083333333334

## Recall

In [19]:
# Recall: fraction of the actual positives that the model identified.
recall = TP / (FN + TP)
print(recall)

# Cross-check the hand-computed value against scikit-learn.
metrics.recall_score(y_test, y_pred)

0.24193548387096775


0.24193548387096775

## Precision

In [21]:
# Precision: fraction of the predicted positives that are actually positive.
precision = TP / (FP + TP)
print(precision)

# Cross-check the hand-computed value against scikit-learn.
metrics.precision_score(y_test, y_pred)

0.5555555555555556


0.5555555555555556

## F1 Score

In [25]:
# F1 score: harmonic mean of precision and recall.
f1 = 2 * precision * recall / (precision + recall)
print(f1)

# Cross-check the hand-computed value against scikit-learn.
metrics.f1_score(y_test, y_pred)

0.3370786516853933


0.3370786516853933

# Predict vs predict_proba

In [70]:
# Hard class prediction for the first test row (selected by position) ...
print(logReg.predict(X_test.iloc[:1]))

# ... shown next to that row's true label for comparison.
y_test.iloc[:1]

[0]


661    1
Name: Outcome, dtype: int64

In [72]:
# Per-class probability estimates for the first test row (selected by position).
print(logReg.predict_proba(X_test.iloc[:1]))

[[0.63247571 0.36752429]]


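`predict_proba` returns one probability per class for each sample, in the order given by `logReg.classes_` (here class 0, then class 1), whereas `predict` returns only the single most likely class. For the row above, the model assigns about 0.63 to class 0 and 0.37 to class 1, so `predict` outputs 0 even though the true label is 1.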