In [1]:
# Load the iris dataset that ships with scikit-learn. Later cells read
# `iris.data`, `iris.target`, `iris.target_names` and `iris.feature_names`
# from the returned object, and reuse the `datasets` module imported here.
from sklearn import datasets

iris = datasets.load_iris()

In [2]:
# The dataset has three target classes, encoded as integer labels:
# 0 (setosa), 1 (versicolor), 2 (virginica).
# Show the class names first, then the feature (column) names, one per line.
print(iris.target_names, iris.feature_names, sep="\n")

['setosa' 'versicolor' 'virginica']
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [3]:
# Divide the dataset into two parts: a training set and a test set.
# X holds the samples (feature matrix) and y holds the labels (targets).
X, y = datasets.load_iris(return_X_y=True)

# Split arrays or matrices into random train and test subsets.
from sklearn.model_selection import train_test_split

# Use 70 % of the samples for training and 30 % for testing.
# When processing experimental data, K-Fold cross-validation can be used
# instead of a single hold-out split.
#
# random_state pins the shuffle so that "Restart Kernel -> Run All" reproduces
# the same split (and therefore the same accuracy further down); the value 42
# is arbitrary. stratify=y keeps the class proportions of y the same in both
# subsets, so the small test set is not skewed towards one species by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)



# Data inspection: put the four feature columns of `iris.data` and the integer
# class label into one DataFrame and print its first 20 rows.
import pandas as pd

data = pd.DataFrame({
    # Column i of the feature matrix is stored under the i-th short name.
    **{
        short_name: iris.data[:, col_idx]
        for col_idx, short_name in enumerate(
            ('sepallength', 'sepalwidth', 'petallength', 'petalwidth')
        )
    },
    # Integer-encoded species label for each row.
    'species': iris.target,
})
print(data.head(20))

    sepallength  sepalwidth  petallength  petalwidth  species
0           5.1         3.5          1.4         0.2        0
1           4.9         3.0          1.4         0.2        0
2           4.7         3.2          1.3         0.2        0
3           4.6         3.1          1.5         0.2        0
4           5.0         3.6          1.4         0.2        0
5           5.4         3.9          1.7         0.4        0
6           4.6         3.4          1.4         0.3        0
7           5.0         3.4          1.5         0.2        0
8           4.4         2.9          1.4         0.2        0
9           4.9         3.1          1.5         0.1        0
10          5.4         3.7          1.5         0.2        0
11          4.8         3.4          1.6         0.2        0
12          4.8         3.0          1.4         0.1        0
13          4.3         3.0          1.1         0.1        0
14          5.8         4.0          1.2         0.2        0
15      

In [4]:
from sklearn.ensemble import RandomForestClassifier
# The metrics module provides the accuracy score used below.
from sklearn import metrics

# Build a forest of 100 trees. random_state fixes the randomness used while
# fitting, so repeated runs on the same training data give the same model and
# the same reported accuracy; the value 42 is arbitrary.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict labels for the held-out test samples.
y_pred = clf.predict(X_test)

print()
# Fraction of test samples whose predicted label equals the true label.
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))



ACCURACY OF THE MODEL:  0.9777777777777777


In [10]:
# A few hand-written samples to show what the model predicts.
# Note that the model cannot reject bad examples with invalid parameters:
# every row below receives a class label, including rows that could never be
# real measurements (the features are lengths and widths in cm).
clf.predict([
    [3, 3, 2, 2],
    [3, 3, 1, 1],
    [5, 4, 3, 2],
    [3.4, 5.5, 4.3, 7.9],
    [0, 0, 0, 0],                  # all zeros
    [1e4, 1e4, 1e4, 1e4],          # very large positive values
    [-1, -1, -1, -1],              # negative sizes
    [-1e4, -1e4, -1e4, -1e4],      # very large negative sizes
])

array([2, 1, 2, 2, 0, 2, 0, 0])

In [11]:
# Feature importances of the fitted forest, labelled with the dataset's
# feature names and ordered from most to least important.
feature_imp = (
    pd.Series(data=clf.feature_importances_, index=iris.feature_names)
    .sort_values(ascending=False)
)
feature_imp

petal width (cm)     0.461855
petal length (cm)    0.388065
sepal length (cm)    0.110010
sepal width (cm)     0.040070
dtype: float64