In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
data=pd.read_csv('iris.data',sep=",",header=None)

In [3]:
data.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
data.columns

Int64Index([0, 1, 2, 3, 4], dtype='int64')

In [5]:
data.columns=['sepal_length','sepal_width','petal_length','petal_width','class']

In [6]:
data.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'], dtype='object')

In [7]:
data.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [8]:
data.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
class           0
dtype: int64

In [9]:
encoder_x=LabelEncoder()
x=data.iloc[:,0:4].values
for i in range(4):
    x[:,i]=encoder_x.fit_transform(x[:,i])
x[:10]

array([[ 8., 14.,  4.,  1.],
       [ 6.,  9.,  4.,  1.],
       [ 4., 11.,  3.,  1.],
       [ 3., 10.,  5.,  1.],
       [ 7., 15.,  4.,  1.],
       [11., 18.,  7.,  3.],
       [ 3., 13.,  4.,  2.],
       [ 7., 13.,  5.,  1.],
       [ 1.,  8.,  4.,  1.],
       [ 6., 10.,  5.,  0.]])

In [10]:
encoder=LabelEncoder()
y=encoder.fit_transform(data['class'])
y[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [11]:
data.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [12]:
np.unique(y, return_counts=True)

(array([0, 1, 2]), array([50, 50, 50], dtype=int64))

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   class         150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


### Split the dataset

In [14]:
x_train,x_test,y_train,y_test=train_test_split(x,y, test_size=0.2, random_state=8)

In [15]:
x_train[:10]

array([[ 8., 17.,  8.,  3.],
       [ 5.,  9.,  4.,  0.],
       [ 3., 15.,  0.,  1.],
       [24., 12., 33., 17.],
       [12.,  5., 20.,  8.],
       [25.,  7., 24., 10.],
       [18.,  5., 32., 10.],
       [13.,  9., 17.,  9.],
       [12., 21.,  4.,  1.],
       [14.,  8., 18.,  9.]])

In [16]:
y_train[:15]

array([0, 0, 0, 2, 1, 1, 2, 1, 0, 1, 1, 0, 1, 1, 2])

In [17]:
from sklearn.decomposition import PCA
pca=PCA(n_components=2)
x_train=pca.fit_transform(x_train)
x_test=pca.fit_transform(x_test)

In [18]:
'''#Feature scaling
from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
x_train=ss.fit_transform(x_train)
x_test=ss.fit_transform(x_test)
Feature scaling reduced the accuracy'''

'#Feature scaling\nfrom sklearn.preprocessing import StandardScaler\nss=StandardScaler()\nx_train=ss.fit_transform(x_train)\nx_test=ss.fit_transform(x_test)\nFeature scaling reduced the accuracy'

### Building a model

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [20]:
%%time
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", 
         "SGD Classifier","Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    GaussianNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, model in models:
    classifier=model
    classifier.fit(x_train,y_train)
    y_pred=classifier.predict(x_test)
    print("{}  Accuracy: {}".format(name, accuracy_score(y_test,y_pred)*100))

K Nearest Neighbors  Accuracy: 80.0
Decision Tree  Accuracy: 80.0
Random Forest  Accuracy: 83.33333333333334
Logistic Regression  Accuracy: 80.0
SGD Classifier  Accuracy: 73.33333333333333
Naive Bayes  Accuracy: 96.66666666666667
SVM Linear  Accuracy: 80.0
Wall time: 190 ms


#### Naive bayes works best.

In [21]:
classifier=GaussianNB()
classifier.fit(x_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [22]:
y_pred=classifier.predict(x_test)

In [23]:
accuracy_score(y_test,y_pred)

0.9666666666666667

In [24]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.90      1.00      0.95         9
           2       1.00      0.91      0.95        11

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



In [25]:
confusion_matrix(y_test,y_pred)

array([[10,  0,  0],
       [ 0,  9,  0],
       [ 0,  1, 10]], dtype=int64)