In [32]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt3
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# importing the dataset
iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)
data['Species'] = iris.target
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Identify IV's and DV's

In [33]:
# Independent Variables 
X = data.iloc[:, [0,1,2,3]].values

In [34]:
# Encoding DV's categorical values from [0,1,2] to [0,1]
data.loc[data["Species"] == 2, "Species"] = 1
data['Species'].unique()

array([0, 1])

In [35]:
# Dependent Variable 
y = data.iloc[:, [4]].values

## Training and Test Samples

In [36]:
X = X.reshape(-1, 4)

In [37]:
# scaling the IV's data so it's easier to fit
X = preprocessing.scale(X)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print ("Training data:",X_train.shape, y_train.shape)
print ("Test data:",X_test.shape, y_test.shape)

Training data: (120, 4) (120, 1)
Test data: (30, 4) (30, 1)


## Creating the Logistic Regression Model

In [39]:
# creating an empty logistic regression model object
iris_model = LogisticRegression()

# fitting the model using the training set
model = iris_model.fit(X_train, y_train)

print('Intercept: \n', model.intercept_)
print('Coefficients: \n', model.coef_)

Intercept: 
 [2.36149286]
Coefficients: 
 [[ 1.0334783  -1.16302406  1.66774522  1.54620532]]


  y = column_or_1d(y, warn=True)


### Model equations to predict if the Species is Setosa or Not Setosa

$$IsSetosa = 2.36 + 1.03(Sepal Length) - 1.16(Sepal Width) + 1.67(Petal Length) + 1.55(Petal Width)$$

### Predictions for the test set

In [40]:
y_pred = iris_model.predict(X_test).reshape(-1,1)

## Measuring Model Performance

### Confusion Matrix

In [41]:
from sklearn.metrics import confusion_matrix

classes = list(["IsSentora", "NotSentora"])
conf_mat = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(conf_mat, columns=classes, index=classes)
cm_df

Unnamed: 0,IsSentora,NotSentora
IsSentora,9,0
NotSentora,0,21


As all predictions where accurate, it would be expected for both the precions and recall values to equal to 1 as:

* $Precision = \frac{9}{9 + 0}$
* $Precision = 1$


* $Recall = \frac{9}{9+0}$
* $Recall = 1$

In [42]:
tp = conf_mat[0][0]
fp = conf_mat[0][1]
fn = conf_mat[1][0]
tn = conf_mat[1][1]

### Accuracy 

$$Accuracy = \frac{TP + TN} {TP + FN + TN + FP}$$

In [43]:
accuracy = (tp + tn) / (tp + fn + tn + fp)
print('Accuracy:', accuracy)

Accuracy: 1.0


### Precision

$$Precision = \frac{TP} {TP + FP}$$

In [44]:
precision = tp / (tp + fp)
print('Precision:', precision)

Precision: 1.0


### Recall
$$Recall = \frac{TP} {TP + FN}$$

In [45]:
recall = tp / (tp + fn)
print('Recall:', recall)

Recall: 1.0


The earlier predictions were correct as the model was able to postively predict all values correctly whilst also having the correct level of sensitivity as all postively cases where predicited accurately, thus resulting in an accuracy score of 1 also.

Furthermore, as both the precision and recall are equal to 1 the F1 score is also maximised as it is also equal to 1.


# Optional Task

In [57]:
# import the dataset 
iris = load_iris()
df2 = pd.DataFrame(iris.data, columns=iris.feature_names)
df2['Species'] = iris.target
df2.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Identify IV's and DV's

In [58]:
X2 = df2.iloc[:,[0,1,2,3]].values
y2 = df2.iloc[:,[4]].values

X2 = X2.reshape(-1,4)

## Training and Test Samples

In [59]:
# scaling the data so it is easier to fit
X2 = preprocessing.scale(X2)

In [60]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2)

print ("Training data:",X2_train.shape, y2_train.shape)
print ("Test data:",X2_test.shape, y2_test.shape)

Training data: (120, 4) (120, 1)
Test data: (30, 4) (30, 1)


## Creating the Logistic Regression Model

In [61]:
# creating an empty logistic regression model object
iris_model2 = LogisticRegression()

# fitting the model using the training set
model2 = iris_model2.fit(X2_train, y2_train)

print('Intercept: \n', model2.intercept_)
print('Coefficients: \n', model2.coef_)

Intercept: 
 [-0.15416528  1.815195   -1.66102972]
Coefficients: 
 [[-1.0003395   1.25228429 -1.71991383 -1.62129747]
 [ 0.504197   -0.41221389 -0.38699892 -0.78708392]
 [ 0.4961425  -0.8400704   2.10691276  2.40838139]]


  y = column_or_1d(y, warn=True)


### Model equations to predict the Species of the Iris plant

$$Setosa = -0.15 - 1(Sepal Length) + 1.25(Sepal Width) - 1.72(Petal Length) - 1.62(Petal Width)$$

$$Versicolor = 1.82 + 0.5(Sepal Length) - 0.41(Sepal Width) - 0.39(Petal Length) - 0.79(Petal Width)$$

$$Virginica = -1.66 + 0.5(Sepal Length) - 0.84(Sepal Width) + 2.11(Petal Length) + 2.41(Petal Width)$$

### Predictions for the test set

In [62]:
y2_pred = iris_model2.predict(X2_test).reshape(-1,1)

## Measuring Model Performance

### Confusion Matrix

In [63]:
from sklearn.metrics import confusion_matrix

classes2 = list(["Iris-Setosa", "Iris-Versicolor", "Iris-Virginica"])
conf_mat2 = confusion_matrix(y2_test, y2_pred)
cm2_df = pd.DataFrame(conf_mat2, columns=classes2, index=classes2)
cm2_df

Unnamed: 0,Iris-Setosa,Iris-Versicolor,Iris-Virginica
Iris-Setosa,8,0,0
Iris-Versicolor,0,12,2
Iris-Virginica,0,0,8


### Accuracy

Example of calculating the accuracy score using own formula and built-in methods

In [64]:
# accuracy of the Setosa class using formula
tp2 = conf_mat2[0][0]
fp2 = conf_mat2[1][0] + conf_mat2[2][0]
fn2 = conf_mat2[0][1] + conf_mat2[0][2]
tn2 = conf_mat2[1][1] + conf_mat2[1][2] + conf_mat2[2][0] + conf_mat2[2][1]

acc1 = (tp2 + tn2) / (tp2 + fn2 + tn2 + fp2)
print('Accuracy of Iris-Setosa:', acc1)

Accuracy of Iris-Setosa: 1.0


In [65]:
# accuracy using built in methods
# accuracy of predicting Versicolor 
acc2 = accuracy_score(y2_test == classes2.index('Iris-Versicolor'), y2_pred == classes2.index('Iris-Versicolor'))
print('Accuracy of Iris-Versicolor:', acc2)

# accuracy of prediciting Virginica
acc3 = accuracy_score(y2_test == classes2.index('Iris-Virginica'), y2_pred == classes2.index('Iris-Virginica'))
print('Accuracy of Iris-Virginica:', acc3)

Accuracy of Iris-Versicolor: 0.9333333333333333
Accuracy of Iris-Virginica: 0.9333333333333333


### Precision and Recall Scores

Using build-in methods to calculate performace measures

In [66]:
# precision and recall for Setosa class
prec1 = precision_score(y2_test == classes2.index('Iris-Setosa'), y2_pred == classes2.index('Iris-Setosa'))
rec1 = recall_score(y2_test == classes2.index('Iris-Setosa'), y2_pred == classes2.index('Iris-Setosa'))

print('Precision of Iris-Setosa:', prec1)
print('Recall of Iris-Setosa:', rec1)

# precision and recall Versicolor class
prec2 = precision_score(y2_test == classes2.index('Iris-Versicolor'), y2_pred == classes2.index('Iris-Versicolor'))
rec2 = recall_score(y2_test == classes2.index('Iris-Versicolor'), y2_pred == classes2.index('Iris-Versicolor'))

print('\nPrecision of Iris-Versicolor:', prec2)
print('Recall of Iris-Versicolor:', rec2)

# precision and recall Virginica class
prec3 = precision_score(y2_test == classes2.index('Iris-Virginica'), y2_pred == classes2.index('Iris-Virginica'))
rec3 = recall_score(y2_test == classes2.index('Iris-Virginica'), y2_pred == classes2.index('Iris-Virginica'))

print('\nPrecision of Iris-Virginica:', prec2)
print('Recall of Iris-Virginica:', rec2)

Precision of Iris-Setosa: 1.0
Recall of Iris-Setosa: 1.0

Precision of Iris-Versicolor: 1.0
Recall of Iris-Versicolor: 0.8571428571428571

Precision of Iris-Virginica: 1.0
Recall of Iris-Virginica: 0.8571428571428571


### F1-Score

In [67]:
# average f1 score
av_f1 = f1_score(y2_test, y2_pred, average='micro')
print(av_f1)

# f1 score per class
f = f1_score(y2_test, y2_pred, average=None)
lowest_score = min(f)
hardest_class = classes2[list(f).index(lowest_score)]
print('Hardest class:', hardest_class)

0.9333333333333333
Hardest class: Iris-Virginica


With the model now trying to predicit all three types of the Iris plant family from the observed variables, there has been a decrease in accuracy and recall for the plant types Iris-Versicolor and Iris-Virginica. 
The accuracy and recall have decreased as the model appears to have difficult distinguishing between Iris-Versicolor and Iris-Virginica.