In [1]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing

In [2]:
# Load the Iris measurements from the local CSV file into a DataFrame
# and preview the first five rows.
IRIS_CSV_PATH = "Iris.csv"

df = pd.read_csv(IRIS_CSV_PATH)
df.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [20]:
# Build a binary target from the species name: 0 marks 'Iris-setosa' and
# 1 marks every other species ('Not Iris-setosa'). Only the dependent
# variable is encoded; the measurements are left as they are.

species_to_binary = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 1,
}
df['encoded_species'] = df['Species'].map(species_to_binary)

print(df)

      Id  SepalLengthCm  ...         Species  encoded_species
0      1            5.1  ...     Iris-setosa                0
1      2            4.9  ...     Iris-setosa                0
2      3            4.7  ...     Iris-setosa                0
3      4            4.6  ...     Iris-setosa                0
4      5            5.0  ...     Iris-setosa                0
..   ...            ...  ...             ...              ...
145  146            6.7  ...  Iris-virginica                1
146  147            6.3  ...  Iris-virginica                1
147  148            6.5  ...  Iris-virginica                1
148  149            6.2  ...  Iris-virginica                1
149  150            5.9  ...  Iris-virginica                1

[150 rows x 7 columns]


In [21]:
# Identify dependent and independent variables, split the data into
# training and test sets, then standardise the features.
#
# Columns are selected by name rather than by position so the cell keeps
# working if the column order of `df` changes.
FEATURE_COLUMNS = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

X = df[FEATURE_COLUMNS].values
y = df['encoded_species'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Fit the scaler on the training rows only and reuse it for the test rows,
# so that no statistics of the test set leak into the training data.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [22]:
# Train a logistic regression classifier on the training split
# (fit() returns the fitted estimator itself).

log_reg = LogisticRegression().fit(X_train, y_train)

# Predict the test split and store the labels as a single-column array.
test_predictions = log_reg.predict(X_test)
y_pred = test_predictions.reshape(-1, 1)

In [23]:
# Confusion matrix
# The model is likely to have similar precision and recall.
#
# Rows are the true classes and columns are the predicted classes.
# `labels` pins the row/column order to the encoded values, so the display
# names in `classes` always line up with the matrix and the matrix keeps
# its 2 x 2 shape even if a class is absent from the test split.

classes = ["Iris-setosa", "Not Iris-setosa"]
class_labels = [0, 1]  # encoded_species values, in the same order as `classes`
conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)
cm_df = pd.DataFrame(conf_mat, columns=classes, index=classes)
cm_df

Unnamed: 0,Iris-setosa,Not Iris-setosa
Iris-setosa,13,0
Not Iris-setosa,0,25


In [24]:
# Accuracy, precision, and recall, with "Iris-setosa" as the positive class.
#
# The confusion matrix has the true classes on the rows and the predicted
# classes on the columns, so with "Iris-setosa" (row/column 0) as positive:
#   [0, 0] true setosa,     predicted setosa      -> TP
#   [0, 1] true setosa,     predicted not setosa  -> FN
#   [1, 0] true not setosa, predicted setosa      -> FP
#   [1, 1] true not setosa, predicted not setosa  -> TN

TP = cm_df.iloc[0,0]
FN = cm_df.iloc[0,1]
FP = cm_df.iloc[1,0]
TN = cm_df.iloc[1,1]

accuracy = (TP + TN)/(TP + FN + TN + FP)
precision = TP/(TP + FP)  # share of predicted positives that are truly positive
recall = TP/(TP + FN)     # share of true positives that the model found

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0


## Check assumptions

Precision and recall are identical: both are perfect, with a value of 1.0. A precision of 1.0 means that every instance predicted as positive is indeed positive, and a recall of 1.0 means that the model captures every instance of the positive class.

# Optional task

In [25]:
# Re-encode the dependent variable with one integer per species:
# 0, 1 and 2 correspond to 'Iris-setosa', 'Iris-versicolor' and
# 'Iris-virginica', respectively. This overwrites the binary encoding.

species_to_label = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2,
}
df['encoded_species'] = df['Species'].map(species_to_label)

print(df)

      Id  SepalLengthCm  ...         Species  encoded_species
0      1            5.1  ...     Iris-setosa                0
1      2            4.9  ...     Iris-setosa                0
2      3            4.7  ...     Iris-setosa                0
3      4            4.6  ...     Iris-setosa                0
4      5            5.0  ...     Iris-setosa                0
..   ...            ...  ...             ...              ...
145  146            6.7  ...  Iris-virginica                2
146  147            6.3  ...  Iris-virginica                2
147  148            6.5  ...  Iris-virginica                2
148  149            6.2  ...  Iris-virginica                2
149  150            5.9  ...  Iris-virginica                2

[150 rows x 7 columns]


In [26]:
# Identify dependent and independent variables, split the data into
# training and test sets, then standardise the features.
#
# Columns are selected by name rather than by position so the cell keeps
# working if the column order of `df` changes.
FEATURE_COLUMNS = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

X = df[FEATURE_COLUMNS].values
y = df['encoded_species'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Fit the scaler on the training rows only and reuse it for the test rows,
# so that no statistics of the test set leak into the training data.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [27]:
# Train a logistic regression classifier on the training split
# (fit() returns the fitted estimator itself).

log_reg = LogisticRegression().fit(X_train, y_train)

# Predict the test split and store the labels as a single-column array.
test_predictions = log_reg.predict(X_test)
y_pred = test_predictions.reshape(-1, 1)

In [29]:
# Confusion matrix
# The model is likely to have similar precision and recall.
#
# Rows are the true classes and columns are the predicted classes.
# `labels` pins the row/column order to the encoded values, so the display
# names in `classes` always line up with the matrix and the matrix keeps
# its 3 x 3 shape even if a class is absent from the test split.

classes = ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
class_labels = [0, 1, 2]  # encoded_species values, in the same order as `classes`
conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)
cm_df = pd.DataFrame(conf_mat, columns=classes, index=classes)
cm_df

Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica
Iris-setosa,13,0,0
Iris-versicolor,0,15,1
Iris-virginica,0,0,9


## Observation from the confusion matrix

The difference between the two matrices is the number of classes. The True Negative count of 25 in the first, 2 by 2 matrix is the sum of all the values in the second and third rows of the 3 by 3 matrix: every instance of "Iris-versicolor" and "Iris-virginica" was treated as a negative for the class "Iris-setosa". The 3 by 3 matrix shows how that sum splits across the two classes, with one instance of Iris-versicolor misclassified as Iris-virginica.