Logistic Regression Evaluation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set # Set the default sns theme for all matplotlib graphics 
import sklearn

In [2]:
from sklearn.datasets import load_iris

In [3]:
dataset = load_iris()

In [4]:
print(dataset.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [5]:
dataset.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [6]:
# data - input data
# target - output data 
# target_names - output flower names 
# feature_names - input column names
# file - CSV 

In [7]:
df = pd.DataFrame(dataset.data, columns= dataset.feature_names)

In [8]:
df["target"]= dataset.target

In [9]:
df.sample(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
19,5.1,3.8,1.5,0.3,0
63,6.1,2.9,4.7,1.4,1
128,6.4,2.8,5.6,2.1,2
116,6.5,3.0,5.5,1.8,2
135,7.7,3.0,6.1,2.3,2


In [10]:
df.columns = [col.replace(" (cm)", "").replace(" ", "_") for col in df.columns]

In [11]:
df.sample(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
98,5.1,2.5,3.0,1.1,1
146,6.3,2.5,5.0,1.9,2
108,6.7,2.5,5.8,1.8,2
43,5.0,3.5,1.6,0.6,0
89,5.5,2.5,4.0,1.3,1


In [12]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
x_train, x_test, y_train, y_test = train_test_split(df.drop(columns=["target"]), df.target, test_size= 0.1)

In [15]:
x_train.shape

(135, 4)

In [16]:
x_test.shape

(15, 4)

In [17]:
y_train.shape

(135,)

In [18]:
y_test.shape

(15,)

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
lr = LogisticRegression()

In [21]:
lr.fit(X= x_train, y = y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
lr.coef_

array([[-0.37960112,  0.94828044, -2.49462095, -1.03914271],
       [ 0.46947976, -0.20956882, -0.18640379, -0.88004838],
       [-0.08987864, -0.73871161,  2.68102475,  1.91919109]])

In [23]:
lr.intercept_

array([  9.4499568 ,   2.09943521, -11.54939201])

In [24]:
df.sample()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
27,5.2,3.5,1.5,0.2,0


In [25]:
lr.predict([[6.1,	2.6,	5.6,	1.4]])

array([2])

In [26]:
dataset.target_names[lr.predict([[6.1,	2.6,	5.6,	1.4]])]

array(['virginica'], dtype='<U10')

In [27]:
dataset.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [28]:
lr.predict_proba([[6.1,	2.6,	5.6,	1.4]])

array([[6.11987121e-05, 1.76546228e-01, 8.23392573e-01]])

Evaluate

In [29]:
y_pred = lr.predict(x_test)

In [30]:
y_pred # Predicted Values

array([0, 0, 2, 0, 0, 0, 0, 1, 2, 1, 2, 2, 0, 0, 2])

In [31]:
y_test.values # Actual Values

array([0, 0, 2, 0, 0, 0, 0, 1, 2, 1, 2, 2, 0, 0, 2])

In [32]:
(y_pred == y_test.values)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

In [33]:
(y_pred == y_test.values).mean()

1.0

In [34]:
from sklearn.metrics import accuracy_score

In [35]:
accuracy_score(y_test, y_pred)

1.0

Confusion Metrics

In [36]:
actual = [1,1,1,1,1,1,1,1,0,0,0,0]
prediction = [0,0,1,1,1,1,1,1,0,0,0,1]

In [39]:
from sklearn.metrics import confusion_matrix, classification_report

In [38]:
confusion_matrix(y_test.values, y_pred)

array([[8, 0, 0],
       [0, 2, 0],
       [0, 0, 5]])

In [40]:
print(classification_report(y_test.values, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         5

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15



Decision Tree

<img src="https://scikit-learn.org/stable/_images/iris.png"/>

SyntaxError: ignored