In [80]:
# conventional way to import pandas
import pandas as pd
import numpy as np

# read CSV file from the 'data' subdirectory using a relative path
cols=['sepal length','sepal width','petal length','petal width','class']
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None,names=cols)

In [81]:
df.columns

Index(['sepal length', 'sepal width', 'petal length', 'petal width', 'class'], dtype='object')

In [82]:
features=['sepal length', 'sepal width', 'petal length', 'petal width']

X=df[features]
Y=df['class']

# Splitting X and Y into training and testing sets

In [83]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=1,test_size=0.3)

Apply Scaling to train set and test set both Xs

In [84]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
sc.fit(X_train)
X_train_std=sc.transform(X_train)
X_test_std=sc.transform(X_test)

# 1. Perceptron Model

In [85]:
from sklearn.linear_model import Perceptron
ppn=Perceptron(n_iter=40,eta0=0.1,random_state=0)
ppn.fit(X_train_std,y_train)

Perceptron(alpha=0.0001, class_weight=None, eta0=0.1, fit_intercept=True,
      max_iter=None, n_iter=40, n_jobs=1, penalty=None, random_state=0,
      shuffle=True, tol=None, verbose=0, warm_start=False)

# Make Classification on Test Set

In [86]:
y_pred=ppn.predict(X_test_std)
print('Misclassified Samples: %d' % (y_test!=y_pred).sum())

Misclassified Samples: 2


In [87]:
from sklearn.metrics import accuracy_score
print('Accuracy of Perceptron Model: %.2f' % accuracy_score(y_test,y_pred))

Accuracy of Perceptron Model: 0.96


In [88]:
mods=[]
vals=[]
mods.append('Perceptron Model')
vals.append(accuracy_score(y_test,y_pred).round(2))

# 2. Logistic Regression

In [89]:
from sklearn.linear_model import LogisticRegression

lr=LogisticRegression(C=1000.0,random_state=0)
lr.fit(X_train_std,y_train)

y_pred=lr.predict(X_test_std)
print('Misclassified Samples: %d' % (y_test!=y_pred).sum())

from sklearn.metrics import accuracy_score
print('Accuracy of Logistic Regression Model: %.2f' % accuracy_score(y_test,y_pred))

Misclassified Samples: 2
Accuracy of Logistic Regression Model: 0.96


In [90]:
mods.append('Logistic Regression Model')
vals.append(accuracy_score(y_test,y_pred).round(2))

# 3. Support Vector Machine

In [91]:
from sklearn.svm import SVC

svm=SVC(kernel='linear',random_state=0,C=1.0)
svm.fit(X_train_std,y_train)

y_pred=svm.predict(X_test_std)
print('Misclassified Samples: %d' % (y_test!=y_pred).sum())

from sklearn.metrics import accuracy_score
print('Accuracy of Support Vector Machine Model: %.2f' % accuracy_score(y_test,y_pred))


Misclassified Samples: 1
Accuracy of Support Vector Machine Model: 0.98


In [92]:
mods.append('Support Vector Machine')
vals.append(accuracy_score(y_test,y_pred).round(2))

In [93]:

from sklearn.svm import SVC

svm=SVC(kernel='rbf',random_state=0,C=1.0,gamma=0.1)
svm.fit(X_train_std,y_train)

y_pred=svm.predict(X_test_std)
print('Misclassified Samples: %d' % (y_test!=y_pred).sum())

from sklearn.metrics import accuracy_score
print('Accuracy of Support Vector Machine Model with gamma: %.2f' % accuracy_score(y_test,y_pred))


Misclassified Samples: 2
Accuracy of Support Vector Machine Model with gamma: 0.96


# 4. Decision Tree Classifier

In [94]:
from sklearn.tree import DecisionTreeClassifier

tree=DecisionTreeClassifier(criterion='entropy',max_depth=3,random_state=0)
tree.fit(X_train_std,y_train)

y_pred=tree.predict(X_test_std)
print('Misclassified Samples: %d' % (y_test!=y_pred).sum())

from sklearn.metrics import accuracy_score
print('Accuracy of Decision Tree Classifier with gamma: %.2f' % accuracy_score(y_test,y_pred))


Misclassified Samples: 2
Accuracy of Decision Tree Classifier with gamma: 0.96


In [95]:
mods.append('Decision Tree Classifier')
vals.append(accuracy_score(y_test,y_pred).round(2))

# 5. Random Forest Classifier

In [96]:
from sklearn.ensemble import RandomForestClassifier

forest=RandomForestClassifier(criterion='entropy',n_estimators=10,random_state=1,n_jobs=2)
forest.fit(X_train_std,y_train)

y_pred=forest.predict(X_test_std)
print('Misclassified Samples: %d' % (y_test!=y_pred).sum())

from sklearn.metrics import accuracy_score
print('Accuracy of Random Forest Classifier with gamma: %.2f' % accuracy_score(y_test,y_pred))


Misclassified Samples: 2
Accuracy of Random Forest Classifier with gamma: 0.96


In [97]:
mods.append('Random Forest Classifier')
vals.append(accuracy_score(y_test,y_pred).round(2))

# 6. KNN CLASSIFIER

In [98]:
from sklearn.neighbors import KNeighborsClassifier

knn=KNeighborsClassifier(n_neighbors=5,p=2,metric='minkowski')
knn.fit(X_train_std,y_train)

y_pred=knn.predict(X_test_std)
print('Misclassified Samples: %d' % (y_test!=y_pred).sum())

from sklearn.metrics import accuracy_score
print('Accuracy of KNN Classifier with gamma: %.2f' % accuracy_score(y_test,y_pred))


Misclassified Samples: 2
Accuracy of KNN Classifier with gamma: 0.96


In [99]:
mods.append('KNN Classifier')
vals.append(accuracy_score(y_test,y_pred).round(2))

In [100]:
# For visualization
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Spectral6, brewer
from bokeh.transform import factor_cmap

ser_df=pd.DataFrame({'Model': mods, 'Accuracy_Value': vals})

In [102]:
source = ColumnDataSource(ser_df)

p = figure(x_range=mods, plot_width=1000, plot_height=700)
color_map = factor_cmap(field_name='Model', palette=Spectral6, factors=mods)
p.vbar(x='Model', top='Accuracy_Value', source=source, width=0.70, color=color_map)

p.title.text ='Comparison of Models'
p.xaxis.axis_label = 'Model Types'
p.yaxis.axis_label = "Accuracy value of different models"

show(p)

In [103]:
from sklearn.ensemble import RandomForestClassifier

feat_labels = df.columns[:-1]

forest = RandomForestClassifier(n_estimators=10000,
                                random_state=1,n_jobs=-1)

forest.fit(X_train_std, y_train)
importances = forest.feature_importances_

indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, 
                            feat_labels[indices[f]], 
                            importances[indices[f]]))

 1) petal width                    0.451645
 2) petal length                   0.416091
 3) sepal length                   0.102635
 4) sepal width                    0.029628


In [108]:

ser_df=pd.DataFrame({'Features': feat_labels[indices].tolist(), 'Importance_Value': importances[indices].tolist()})

source = ColumnDataSource(ser_df)

p = figure(x_range=feat_labels[indices].tolist(), plot_width=800, plot_height=500)
color_map = factor_cmap(field_name='Features', palette=Spectral6, factors=feat_labels[indices].tolist())
p.vbar(x='Features', top='Importance_Value', source=source, width=0.70, color=color_map)

p.title.text ='Feature Importance'
p.xaxis.axis_label = 'Feature Name'
p.yaxis.axis_label = "Importance Score"

show(p)