# Import libraries


In [43]:
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import os 
import pickle

# Train and Evaluation

## Download and read data

In [3]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

In [8]:
# Read data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pd.read_csv(url, names=names)

In [9]:
dataset.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [10]:
print(dataset.groupby('class').size())

class
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64


In [11]:
# split data
array = dataset.values
X = array[:, 0:4]
Y = array[:, 4]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [20]:
# Convert label to number
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(Y_train)
Y_train =  le.transform(Y_train)

le = preprocessing.LabelEncoder()
le.fit(Y_validation)
Y_validation =  le.transform(Y_validation)

In [30]:
# Train 
## LinearRegression
LR_model = LogisticRegression().fit(X_train, Y_train)

## GaussianNB
NB_model = GaussianNB().fit(X_train, Y_train)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
# Predict 
LR_result = LR_model.predict(X_validation) 
NB_result = NB_model.predict(X_validation)

In [34]:
# Accuracy
LR_acc = accuracy_score(Y_validation, LR_result)
NB_acc = accuracy_score(Y_validation, NB_result)
print("Acc of LogisticRegression: ", LR_acc)
print("Acc of Naive Bayes: ", NB_acc)

Acc of LogisticRegression:  0.8666666666666667
Acc of Naive Bayes:  0.8333333333333334


In [38]:
# F1 Score 
LR_f1_score = f1_score(Y_validation, LR_result, average=None)
NB_f1_score = f1_score(Y_validation, NB_result, average=None)
print("F1-Score of LogisticRegression: ", LR_f1_score)
print("F1-Score Naive Bayes: ", NB_f1_score)

F1-Score of LogisticRegression:  [1.         0.83333333 0.81818182]
F1-Score Naive Bayes:  [1.        0.7826087 0.7826087]


In [42]:
# confusion_matrix 
LR_confusion_matrix = confusion_matrix(Y_validation, LR_result)
NB_confusion_matrix = confusion_matrix(Y_validation, NB_result)
print("Confusion_matrix of LogisticRegression: \n", LR_confusion_matrix)
print("Confusion_matrix Naive Bayes: \n", NB_confusion_matrix)

Confusion_matrix of LogisticRegression: 
 [[ 7  0  0]
 [ 0 10  2]
 [ 0  2  9]]
Confusion_matrix Naive Bayes: 
 [[7 0 0]
 [0 9 3]
 [0 2 9]]


In [44]:
#=> Logistic Regression is the best model
# file name, I'm using *.pickle as a file extension
filename = "logistic_regression.pickle"

# save model
pickle.dump(LR_model, open(filename, "wb"))

# Prediction

In [45]:
def predict(model_path, train):
  loaded_model = pickle.load(open(filename, "rb"))
  y_predicted = loaded_model.predict(train)
  print("Prediction: ", y_predicted)

In [46]:
predict(filename, X_validation)

Prediction:  [2 1 0 1 1 0 1 1 0 1 2 1 0 2 0 2 2 2 0 0 1 2 1 1 2 2 1 1 2 2]
