In [141]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

In [142]:
# Load the Iris dataset from the CSV file and preview the first five rows
df = pd.read_csv(filepath_or_buffer="Iris.csv")
df.head(5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [143]:
# The "Id" column is only a row counter and carries no predictive information, so drop it.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps this cell
# idempotent: re-running it after "Id" is already gone no longer raises a KeyError.
df = df.drop(columns=["Id"], errors="ignore")

# Count missing values per column
df.isnull().sum()

SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [144]:
# Encode the dependent variable such that setosa takes 0 and non setosa takes 1
label_encoder = LabelEncoder()
species_codes = label_encoder.fit_transform(df["Species"])

# Look up the code that the encoder actually assigned to setosa instead of assuming it is 0
# and merging a hard-coded code 2 into 1. This raises ValueError if the label is missing,
# rather than silently producing a wrong target.
setosa_code = list(label_encoder.classes_).index("Iris-setosa")

# 0 where the row is setosa, 1 for every other species (same integer dtype as the encoder output)
df["Species_encoded"] = (species_codes != setosa_code).astype(species_codes.dtype)
df["Species_encoded"].unique()

array([0, 1])

In [145]:
# Define the dependent variable and independent variables
feature_columns = ["SepalLengthCm","SepalWidthCm","PetalLengthCm","PetalWidthCm"]

# Select with df[...] rather than pd.DataFrame(data=df, columns=...): the constructor
# reindexes and would silently create an all-NaN column for a misspelled name,
# whereas label selection raises a KeyError. copy() keeps X independent of df.
X = df[feature_columns].copy()
y = df["Species_encoded"]

In [146]:
# Print the dimensions of the feature matrix and of the target, one per line
for table in (X, y):
    print(table.shape)

(150, 4)
(150,)


In [147]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [148]:
# Fit a logistic-regression model on the training split
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Make predictions on test data.
# The result is kept 1-D so it has the same shape as y_test; reshaping it to an (n, 1)
# column would make any direct elementwise comparison with y_test broadcast to (n, n).
y_pred = log_reg.predict(X_test)

In [149]:
# Generate a confusion matrix
from sklearn.metrics import confusion_matrix

# Position in this list is the encoded label: index 0 = setosa, index 1 = non-setosa
classes = ["setosa","non-setosa"]

# Pin the label order explicitly so the matrix is always len(classes) x len(classes),
# even if one class happens to be absent from the test split and the predictions.
# np.ravel accepts predictions stored either as a 1-D array or as an (n, 1) column.
conf_mat = confusion_matrix(
    y_test,
    np.ravel(y_pred),
    labels=list(range(len(classes))),
)
cm_df = pd.DataFrame(conf_mat, columns=classes, index=classes)
cm_df

Unnamed: 0,setosa,non-setosa
setosa,10,0
non-setosa,0,20


Precision = TP / (TP + FP), Recall = TP / (TP + FN)
Precision measures, out of all the samples the model categorised as "positive", what fraction are actually positive.
Recall measures, out of all the samples that are actually positive, what fraction the model correctly identified.
Usually, there is a trade-off between precision and recall. In this case, however, the model classified every test sample
correctly (FP = 0 and FN = 0), so both precision and recall are exactly 100%.

In [163]:
# Write own code to calculate the accuracy, precision and recall
# Setosa is treated as the positive class, both here and in the scikit-learn check below.

positive_label = classes.index('setosa')

# Boolean masks over the test rows; ravel() accepts predictions stored as 1-D or as (n, 1)
actual_positive = np.asarray(y_test).ravel() == positive_label
predicted_positive = np.asarray(y_pred).ravel() == positive_label

# Confusion-matrix cells, counted in one vectorised pass each
TP = int(np.sum(predicted_positive & actual_positive))
FP = int(np.sum(predicted_positive & ~actual_positive))
FN = int(np.sum(~predicted_positive & actual_positive))
TN = int(np.sum(~predicted_positive & ~actual_positive))


def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero.

    Guards precision/recall against ZeroDivisionError when the model predicts
    no positives or the test set contains no positives.
    """
    return numerator / denominator if denominator else 0.0


accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = _ratio(TP, TP+FP)
recall = _ratio(TP, TP+FN)
print("Manual Accuracy: ",accuracy)
print("Manual Precision: ",precision)
print("Manual Recall: ",recall)

# Check whether the manual calculations are correct

from sklearn.metrics import accuracy_score,precision_score, recall_score
accu = accuracy_score(actual_positive, predicted_positive)
prec = precision_score(actual_positive, predicted_positive)
rec = recall_score(actual_positive, predicted_positive)

print('Accuracy:', accu)
print('Precision:', prec)
print('Recall:', rec)

# Fail loudly if the hand-rolled numbers ever disagree with scikit-learn
assert np.isclose(accuracy, accu), "manual accuracy differs from scikit-learn"
assert np.isclose(precision, prec), "manual precision differs from scikit-learn"
assert np.isclose(recall, rec), "manual recall differs from scikit-learn"

Manual Accuracy:  1.0
Manual Precision:  1.0
Manual Recall:  1.0
Accuracy: 1.0
Precision: 1.0
Recall: 1.0


In [166]:
# Optional exercise
# Encode the dependent variable such that setosa takes 0 and the other species take 1 and 2
df["Species_encoded_op"] = label_encoder.fit_transform(df["Species"])

# Define the new dependent variable

y_op = df["Species_encoded_op"]

# Split the data into a training and test set
X_train_op, X_test_op, y_train_op, y_test_op = train_test_split(X,
                y_op, test_size=0.20, random_state=42)

# Fit a model
# A separate estimator is used so the binary model held in `log_reg` is not
# silently overwritten by the three-class fit.
log_reg_op = LogisticRegression()
log_reg_op.fit(X_train_op, y_train_op)

# Make predictions on test data (kept 1-D so it has the same shape as y_test_op)
y_pred_op = log_reg_op.predict(X_test_op)

# Generate a confusion matrix

# Position in this list is the encoded label, so the order must match the codes the
# encoder assigned (the cell comment above states setosa = 0, others = 1 and 2).
classes_op = ["setosa","versicolour","virginica"]
assert len(label_encoder.classes_) == len(classes_op), "unexpected number of species"

# Pin the label order so the matrix is always 3 x 3, even if a class is absent from the test split
conf_mat_op = confusion_matrix(
    y_test_op,
    y_pred_op,
    labels=list(range(len(classes_op))),
)
cm_df_op = pd.DataFrame(conf_mat_op, columns=classes_op, index=classes_op)
cm_df_op

Unnamed: 0,setosa,versicolour,virginica
setosa,10,0,0
versicolour,0,9,0
virginica,0,0,11


The model still does well on the three-class problem: it correctly classified every test sample from all three species.