In [1]:
# Loading the libarary with IRIS datasets
from sklearn.datasets import load_iris

# Loading the Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Loading pandas, numpy
import pandas as pd
import numpy as np

# Setting random seed 
np.random.seed(0)

In [10]:
# Creating an object called iris with iris data
iris = load_iris()
# print(iris)

# Converting the iris data to a dataframe
irisDf = pd.DataFrame(iris.data, columns=iris.feature_names)

irisDf.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [11]:
# Adding a new column for the species name 
irisDf['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

irisDf.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [21]:
# Creating Test and Train Data

# The below line will result in TRUE for 75% of the data which will be used for training the model

irisDf['isTrain'] = np.random.uniform(0, 1, len(irisDf)) <= 0.75

irisDf.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,isTrain
0,5.1,3.5,1.4,0.2,setosa,False
1,4.9,3.0,1.4,0.2,setosa,False
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [22]:
# Creating dataframes with training records and testing records
train, test = irisDf[irisDf['isTrain'] == True], irisDf[irisDf['isTrain'] == False]

# Displaying the total number of records in each dataset
print("Number of observations in Training Dataset : ", train.shape[0])
print("Number of observations in Testing Dataset : ", test.shape[0])

Number of observations in Training Dataset :  113
Number of observations in Testing Dataset :  37


In [23]:
# Create list of features names
features = irisDf.columns[:4]

features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [27]:
# Converting each species name into digits from the training set
y = pd.factorize(train.species)[0]

y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2], dtype=int64)

In [28]:
# Creating Random Forest Classifier
my_model = RandomForestClassifier(n_jobs=2, random_state=0)

# Train the model with the training set
my_model.fit(train[features], y)

In [39]:
# Apply the trained model with the test set for predicting
preds = my_model.predict(test[features])

In [40]:
# Viewing the predicted probabilities of the first 10 observation of the test set
my_model.predict_proba(test[features])[10:20]

array([[1.  , 0.  , 0.  ],
       [0.99, 0.01, 0.  ],
       [1.  , 0.  , 0.  ],
       [0.  , 0.95, 0.05],
       [0.  , 0.85, 0.15],
       [0.  , 0.98, 0.02],
       [0.  , 0.97, 0.03],
       [0.01, 0.99, 0.  ],
       [0.01, 0.98, 0.01],
       [0.  , 1.  , 0.  ]])

In [48]:
# Mapping names of the species to the predicted value of the class
preds_names = iris.target_names[preds]
preds_names[0:5]
# preds[0:5]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], dtype='<U10')

In [46]:
# Displaying the actual species name of the test set
test.species.head()

0     setosa
1     setosa
6     setosa
10    setosa
12    setosa
Name: species, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']

In [52]:
# Creating Confusion Matrix
pd.crosstab(test['species'], preds_names, rownames=['Actual Species'], colnames=['Predicted Species'] )

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,13,0,0
versicolor,0,14,3
virginica,0,0,7


In [58]:
# Actual using of model after deployment

preds_names = iris.target_names[my_model.predict([[5.1, 3.5, 1.4, 0.2],[4.6, 3.1, 1.5, 0.2]])]
preds_names



array(['setosa', 'setosa'], dtype='<U10')