In [2]:
# loading the library for dataset of iris.
from sklearn.datasets import load_iris

# loading the scikit's random forest classifier library.
from sklearn.ensemble import RandomForestClassifier

# loading pandas.
import pandas as pd

# loading numpy.
import numpy as np

In [4]:
# fix NumPy's global random state so the random train/test split is repeatable.
np.random.seed(seed=0)

In [5]:
# load the bundled iris dataset; the returned object is used below through its
# data, target, feature_names and target_names attributes.
iris = load_iris()

In [6]:
# build a dataframe holding the feature measurements, one named column per feature.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# show the first five rows.
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [7]:
# attach the class label as a categorical column: the integer targets are the
# codes and iris.target_names supplies the category labels.
df['species'] = pd.Categorical.from_codes(
    codes=iris.target,
    categories=iris.target_names,
)

# show the first five rows, now including the species column.
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [8]:
# list the distinct class labels present in the species column.
df.loc[:, 'species'].unique()

[setosa, versicolor, virginica]
Categories (3, object): [setosa, versicolor, virginica]

In [9]:
# flag each row for training: draw one uniform number in [0, 1) per row and
# mark the row as training data when the draw is at most 0.75.
df['is_train'] = np.random.uniform(low=0, high=1, size=len(df)) <= 0.75

# show the first five rows with the new flag.
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [10]:
# split the rows on the boolean is_train flag: flagged rows train the model,
# the remaining rows are held out for testing.
train = df[df['is_train']]
test = df[~df['is_train']]

# report how many observations landed in each split.
print('number of observations of train data is :', train.shape[0])
print('number of observations of test data is :', test.shape[0])

number of observations of train data is : 118
number of observations of test data is : 32


In [11]:
# build the list of feature columns from the dataset's own feature names
# rather than from the first four column positions of df, so the selection
# stays correct even if columns are added to or reordered in df.
features = pd.Index(iris.feature_names)

# showing the list of features.
print(features)

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')


In [12]:
# re-check the class labels before they are encoded as integers.
df.loc[:, 'species'].unique()

[setosa, versicolor, virginica]
Categories (3, object): [setosa, versicolor, virginica]

In [13]:
# converting each species name into digits.
# use the categorical codes instead of pd.factorize: factorize numbers labels
# by order of first appearance in train, whereas the codes always index the
# categories (iris.target_names), which is what the later
# iris.target_names[clf.predict(...)] lookups rely on.
# np.intp keeps the same integer dtype that pd.factorize returned.
y = train['species'].cat.codes.to_numpy(dtype=np.intp)

# Viewing target.
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2]


In [14]:
# create the random forest classifier; random_state pins the forest's own
# randomness and n_jobs sets how many jobs run in parallel.
clf = RandomForestClassifier(random_state=0, n_jobs=2)

# fit the classifier on the training features and the encoded species labels.
clf.fit(X=train[features], y=y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [15]:
# apply the trained classifier to the held-out test features.
clf.predict(X=test.loc[:, features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int32)

In [16]:
# view the predicted class probabilities for the first 10 test rows.
clf.predict_proba(X=test[features])[:10, :]

array([[1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.95, 0.05, 0.  ],
       [1.  , 0.  , 0.  ],
       [0.99, 0.01, 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ]])

The clf.predict_proba function returns, for each observation, an array with the predicted probability of each target class.
For example, the first row [1., 0., 0.] means the first observation in the test data is predicted to be setosa, because the probability of setosa is 100%, versicolor 0% and virginica 0%.

In [17]:
# view the predicted class probabilities for the next 10 test rows (rows 10 to 19).
clf.predict_proba(X=test[features])[10:20, :]

array([[1.  , 0.  , 0.  ],
       [0.99, 0.01, 0.  ],
       [1.  , 0.  , 0.  ],
       [0.  , 0.67, 0.33],
       [0.  , 1.  , 0.  ],
       [0.  , 0.82, 0.18],
       [0.  , 0.03, 0.97],
       [0.  , 0.42, 0.58],
       [0.  , 0.99, 0.01],
       [0.  , 0.96, 0.04]])

In [18]:
# map the predicted integer classes back to their species names.
preds = iris.target_names[clf.predict(X=test[features])]

# view the predicted species for the first 15 test observations.
preds[:15]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'versicolor', 'versicolor'], dtype='<U10')

In [19]:
# view the actual species of the first five test observations.
test['species'].head(n=5)

7     setosa
8     setosa
10    setosa
13    setosa
17    setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

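As we can see, for these first five test observations the predicted species in preds are the same as the actual species in test['species'].
This means the predictions for these five rows are correct; the confusion matrix below checks the whole test set.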

In [21]:
# build the confusion matrix: actual species down the rows, predicted species
# across the columns.
pd.crosstab(
    index=test['species'],
    columns=preds,
    rownames=['actualSpecies'],
    colnames=['predictedSpecies'],
)

predictedSpecies,setosa,versicolor,virginica
actualSpecies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,13,0,0
versicolor,0,5,2
virginica,0,0,12


In [23]:
# Example: predict the species for hand-written observations.
# The rows are wrapped in a dataframe carrying the same feature columns the
# classifier was fitted on; passing a bare nested list to a model trained on
# named columns makes newer scikit-learn releases warn about missing feature
# names.
example_obs = pd.DataFrame(
    [[5.0, 3.6, 1.4, 0.2], [5.0, 3.6, 1.4, 0.2]],
    columns=features,
)

# NOTE: this rebinds preds, which previously held the test-set predictions;
# re-run the cell that computes the test predictions before rebuilding the
# confusion matrix.
preds = iris.target_names[clf.predict(example_obs)]
preds

array(['setosa', 'setosa'], dtype='<U10')