# Logistic Regression
Firstly, import the libraries we will use, and open the dataset as a dataframe in pandas.

In [64]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
from sklearn import preprocessing
import copy
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

iris_df = pd.read_csv('Iris.csv')
iris_df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


Independent variables include SepalLength, SepalWidth Petal Length and PetalWidth
Dependent variable is the species, as we learn what species it is by inspecting the other variables.

In [44]:
iris_df['Species'] = iris_df['Species'].astype('category')
iris_df["Y"] = iris_df["Species"].cat.codes
iris_df['Species'].unique()

['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
Categories (3, object): ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

In [45]:
iris_df.tail()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,Y
145,146,6.7,3.0,5.2,2.3,Iris-virginica,2
146,147,6.3,2.5,5.0,1.9,Iris-virginica,2
147,148,6.5,3.0,5.2,2.0,Iris-virginica,2
148,149,6.2,3.4,5.4,2.3,Iris-virginica,2
149,150,5.9,3.0,5.1,1.8,Iris-virginica,2


After creating Y. The categoric variable relating to the species of the Iris. We should turn this into a binary column. E.g. it is either Setosa or it is not setosa. So replace all Y = 2 records with Y = 1 and now instead of Y = 1 means veriscolor it now means not setosa.

In [50]:
iris_df['Y'] = iris_df['Y'].replace(2, 1)

In [46]:
iris_df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,Y
0,1,5.1,3.5,1.4,0.2,Iris-setosa,0
1,2,4.9,3.0,1.4,0.2,Iris-setosa,0
2,3,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5,5.0,3.6,1.4,0.2,Iris-setosa,0


In [51]:
iris_df.tail()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,Y
145,146,6.7,3.0,5.2,2.3,Iris-virginica,1
146,147,6.3,2.5,5.0,1.9,Iris-virginica,1
147,148,6.5,3.0,5.2,2.0,Iris-virginica,1
148,149,6.2,3.4,5.4,2.3,Iris-virginica,1
149,150,5.9,3.0,5.1,1.8,Iris-virginica,1


In [52]:
iris_df['Y'].unique()

array([0, 1], dtype=int8)

Create X and Y variables.

In [53]:
X = iris_df.iloc[:, [1,2,3,4]]
X

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [54]:
Y = iris_df['Y']
Y

0      0
1      0
2      0
3      0
4      0
      ..
145    1
146    1
147    1
148    1
149    1
Name: Y, Length: 150, dtype: int8

Create training and test data.

In [55]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, shuffle = True)

In [58]:
print("Training Data: ", x_train.shape, y_train.shape)
print("Test Data: ", x_test.shape, y_test.shape)

Training Data:  (120, 4) (120,)
Test Data:  (30, 4) (30,)


Check for any biases or anomalies in the split (e.g. if y_test contains 90% one value).

In [59]:
print("y_train {}".format(y_train[0:25]))
print("y_test {}".format(y_test[0:25]))


y_train 133    1
85     1
71     1
37     0
101    1
15     0
106    1
88     1
99     1
124    1
111    1
6      0
2      0
149    1
120    1
24     0
72     1
109    1
140    1
40     0
104    1
5      0
11     0
66     1
58     1
Name: Y, dtype: int8
y_test 26     0
60     1
116    1
46     0
41     0
89     1
22     0
48     0
8      0
68     1
20     0
119    1
27     0
87     1
59     1
108    1
125    1
1      0
137    1
107    1
54     1
128    1
126    1
81     1
113    1
Name: Y, dtype: int8


In [60]:
logistic_regression = LogisticRegression()

In [61]:
model = logistic_regression.fit(x_train, y_train)

predictions = model.predict(x_test)

In [62]:
predictions[0:5]

array([0, 1, 1, 0, 0], dtype=int8)

In [66]:
confusion_matrix = confusion_matrix(y_test, predictions)
print(confusion_matrix)

[[12  0]
 [ 0 18]]


In [68]:
print('Accuracy of logistic regression predicting iris setosa species: {:.2f}'.format(logistic_regression.score(x_test, y_test)))

Accuracy of logistic regression classifier on test set: 1.00


##### Predict in a comment about whether this data will have higher precision or recall or the same.

Precision is calculated by: True Positives / (True Positives + False Positives)
This would give us a value of 1.0 looking at the confusion matrix as that tells us we have 0 false positives, so the calculation is now TP/TP which can only ever be 1.

Recall is calculated by: True Positives / (True Positives + False Negatives)
again, our confusion matrix suggests we have 0 false negatives so a recall value of 1.0. 

Write my own code to calculate accuracy precision and recall.

In [75]:
def precision(TP, FP):
    return (TP/ (TP + FP))
def recall (TP, FN):
    return (TP/ (TP + FN))

TP = confusion_matrix[0][0]
FP = confusion_matrix[0][1]
FN = confusion_matrix[1][0]
TN = confusion_matrix[1][1]

In [77]:
print(f"TP: {TP}, FP: {FP}, FN: {FN}, TN: {TN}")

TP: 12, FP: 0, FN: 0, TN: 18


In [78]:
print(f"The precision of this model was: {precision(TP, FP)}.")
print(f"The recall of this model was: {recall(TP, FN)}.")

The precision of this model was: 1.0.
The recall of this model was: 1.0.


## To develop the model further:

Replace the "Yes/No" setosa approach with a 0,1,2, categorical variable just stating which species it is, how does this affect the confusion matrix.

In [98]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
from sklearn import preprocessing
import copy
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

iris_df = pd.read_csv('Iris.csv')
iris_df.head()

iris_df['Species'] = iris_df['Species'].astype('category')
iris_df["Y"] = iris_df["Species"].cat.codes
iris_df['Species'].unique()

#Create X and Y variables.

X = iris_df.iloc[:, [1,2,3,4]]

Y = iris_df['Y']



#Create training and test data.

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, shuffle = True)

print("Training Data: ", x_train.shape, y_train.shape)
print("Test Data: ", x_test.shape, y_test.shape)

#Check for biases or anomalies.
print("y_train {}".format(y_train[0:25]))
print("y_test {}".format(y_test[0:25]))

logistic_regression = LogisticRegression(solver='lbfgs', max_iter=1000)

model = logistic_regression.fit(x_train, y_train)
predictions = logistic_regression.predict(x_test)

predictions[0:5]

confusion_matrix = confusion_matrix(y_test, predictions)
print(confusion_matrix)


Training Data:  (120, 4) (120,)
Test Data:  (30, 4) (30,)
y_train 19     0
104    2
2      0
113    2
3      0
120    2
41     0
55     1
148    2
44     0
143    2
39     0
5      0
135    2
78     1
127    2
16     0
1      0
110    2
46     0
116    2
145    2
144    2
24     0
60     1
Name: Y, dtype: int8
y_test 63     1
125    2
33     0
147    2
100    2
140    2
22     0
45     0
82     1
119    2
109    2
59     1
90     1
53     1
112    2
103    2
49     0
77     1
84     1
64     1
105    2
118    2
42     0
139    2
132    2
Name: Y, dtype: int8
[[ 5  0  0]
 [ 0  9  1]
 [ 0  1 14]]


The confusion matrix now has an uncertain result. one value was actually category 1 but was put in category 2. 