In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

# Load the diabetes CSV (path is relative to this notebook) and preview its
# first five rows.
df = pd.read_csv('../Datasets/diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Break data into test & train

In [10]:
from sklearn.model_selection import cross_val_score, train_test_split
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Read the same CSV file that was loaded into `df` above; the split below is
# built from `pima`.
pima = pd.read_csv('../Datasets/diabetes.csv')

# Columns used as model inputs
feature_cols = ['Pregnancies', 'Insulin', 'BMI', 'Age']

# X is the feature matrix: a DataFrame restricted to the columns in feature_cols
X = pima[feature_cols]

# y is the target vector: the 'Outcome' column, selected with bracket indexing
y = pima['Outcome']

# split X and y into training and testing sets: 25% of the rows are held out
# for testing, and random_state=0 makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# class balance of the held-out labels
y_test.value_counts()

0    130
1     62
Name: Outcome, dtype: int64

## Build Logistic Regression model

In [11]:
# Pin the solver explicitly. The recorded run relied on the library default
# (displayed as solver='warn' in the output of this cell), which resolved to
# liblinear at the time; later scikit-learn releases default to lbfgs, which
# would change the fitted coefficients and the predictions in the cells below.
logreg = LogisticRegression(solver='liblinear')

# fit model on the training split only; the test split stays unseen
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

## See what the model predicts for the test set

In [12]:
# Predict a class label for every row of the held-out test features.
y_pred = logreg.predict(X_test)

# Bare expression so the notebook displays the predicted labels.
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Actual values

In [13]:
# True labels of the test split as a NumPy array, for side-by-side comparison
# with y_pred (the array is 1-D, so no transpose is needed).
y_test.values

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0])

## Challenge: write a function that counts the following:
- how many 0s in y_pred are 0s in y_test
- how many 1s in y_pred are 1s in y_test
- how many 0s in y_pred are 1s in y_test
- how many 1s in y_pred are 0s in y_test

It should return these four values in that order

This will give us the values for our confusion matrix

In [14]:
def confusion_matrix(predicted_values, test_values):
    """Count how binary (0/1) predictions line up with the true labels.

    Parameters
    ----------
    predicted_values : iterable
        Labels produced by the model.
    test_values : iterable
        True labels, paired position-by-position with ``predicted_values``.
        Pairing uses ``zip``, so surplus items in the longer input are ignored.

    Returns
    -------
    numpy.ndarray
        2x2 array whose rows are the true label and whose columns are the
        predicted label::

            [[true 0 / predicted 0, true 0 / predicted 1],
             [true 1 / predicted 0, true 1 / predicted 1]]

        Pairs in which either label is neither 0 nor 1 are not counted.
    """
    pairs = list(zip(predicted_values, test_values))

    def tally(predicted_label, true_label):
        # number of pairs matching this exact (predicted, true) combination
        return sum(
            1
            for guess, truth in pairs
            if guess == predicted_label and truth == true_label
        )

    return np.array([[tally(0, 0), tally(1, 0)],
                     [tally(0, 1), tally(1, 1)]])

# Run the hand-written counter on the model's predictions and the true test
# labels (y_test.values is already 1-D, so it is passed without a transpose).
confusion_matrix(y_pred, y_test.values)

array([[118,  12],
       [ 47,  15]])

## Confusion Matrix

A confusion matrix is like a Punnett square from biology.
It tabulates the real value against the predicted value; in the table below, rows are the real values and columns are the predicted values.
It is good when the predicted value is the same as the real value.
This means we want the counts at positions 0,0 and 1,1 to be as large as possible.
Ideally the counts at 1,0 and 0,1 are 0. That would be a perfect model.

|   | 0 | 1 |
|:--|:--|:--|
| 0 |118| 12|
| 1 | 47| 15|


## Computing a confusion matrix

In [15]:
from sklearn import metrics

# using sklearn's metrics we can create a confusion matrix that is given in a 2D array
# (rows are the true labels, columns are the predicted labels).
# Note the argument order: true labels first, predictions second — the reverse
# of the hand-written confusion_matrix(predicted_values, test_values) above.
confusion = metrics.confusion_matrix(y_test, y_pred)

confusion

array([[118,  12],
       [ 47,  15]])

### Labels for a confusion matrix

- True positive - the model predicts a positive result and the actual value is positive. So we predict 1 and the real value is 1.
- True negative - the opposite of a true positive: we predict 0 and the real value is 0.
- False positive - the model predicts a positive result but the actual value is negative. So we predict 1 and the real value is 0.
- False negative - the opposite of a false positive: we predict 0 and the real value is 1.

In [16]:
# Cells of the 2x2 matrix in reading order (row = actual label, column = predicted label)
TN = confusion[0, 0]  # true negative:  actual 0, predicted 0
FP = confusion[0, 1]  # false positive: actual 0, predicted 1
FN = confusion[1, 0]  # false negative: actual 1, predicted 0
TP = confusion[1, 1]  # true positive:  actual 1, predicted 1

In [17]:
# Indexing nested (multidimensional) Python lists is normally done with chained brackets.
arr_2d = [['00','01'],['10','11']]
arr_2d[1][0] # returns '10'

# NumPy arrays offer a cleaner way to index: one bracket holding a (row, column) pair.
np_2d = np.array(arr_2d)
np_2d[1,0] # returns '10'
np_2d[1][0] # chained indexing also works on NumPy arrays

'10'