In [1]:
# Loading the library with Iris dataset
from sklearn.datasets import load_iris

In [2]:
# Loading scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Loading pandas
import pandas as pd

In [4]:
# Loading numpy
import numpy as np

In [5]:
# Setting random seed - keeps randomness reproducible
np.random.seed(0)

In [6]:
# Creating an object called iris with the iris data
iris = load_iris()

iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [7]:
# Creating a dataframe with the four feature variables
df = pd.DataFrame(iris.data, columns=[iris.feature_names])

In [8]:
# Viewing the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [9]:
# Adding a new column for the species name
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

In [10]:
# Viewing the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [11]:
df.columns

MultiIndex([('sepal length (cm)',),
            ( 'sepal width (cm)',),
            ('petal length (cm)',),
            ( 'petal width (cm)',),
            (          'species',)],
           )

In [12]:
# Flatten MultiIndex columns
df.columns = [' '.join(col).strip() if isinstance(col, tuple) else col for col in df.columns]

df.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'species'],
      dtype='object')

In [13]:
# Creating train and test data
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75

In [14]:
# View the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [15]:
# Creating dataframes with train rows and test rows
train, test = df[df['is_train']==True], df[df['is_train'] == False]

In [16]:
# Show the number of observations for train and test dataframes
print(f"No. of observations in train dataset: {len(train)}")
print(f"No. of observations in test dataset: {len(test)}")

No. of observations in train dataset: 118
No. of observations in test dataset: 32


In [17]:
# Create a list of the feature column's names
features = df.columns[:4]

# View features
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [18]:
# Converting each species name into digits
y = pd.factorize(train['species'])[0]

In [19]:
# Viewing target
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2])

In [20]:
# Creating a Random Forest classifier
clf = RandomForestClassifier(n_jobs=2, random_state=0)

In [21]:
clf.fit(train[features], y)

In [22]:
test[features].head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
10,5.4,3.7,1.5,0.2
13,4.3,3.0,1.1,0.1
17,5.1,3.5,1.4,0.3


In [23]:
# Applying the trained classifier to the test data
clf.predict(test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [24]:
test.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
7,5.0,3.4,1.5,0.2,setosa,False
8,4.4,2.9,1.4,0.2,setosa,False
10,5.4,3.7,1.5,0.2,setosa,False
13,4.3,3.0,1.1,0.1,setosa,False
17,5.1,3.5,1.4,0.3,setosa,False


In [25]:
# Viewing the predicted probabilities of the first 10 observations
clf.predict_proba(test[features])[:10]

array([[1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.95, 0.05, 0.  ],
       [1.  , 0.  , 0.  ],
       [0.99, 0.01, 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ]])

In [26]:
# Mapping names for the plants for each predicted plant class
preds = iris.target_names[clf.predict(test[features])]

In [27]:
# View the preidcted species for the first twenty five observations
preds[:25]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'versicolor', 'versicolor', 'versicolor', 'virginica',
       'virginica', 'versicolor', 'versicolor', 'virginica', 'virginica',
       'virginica', 'virginica', 'virginica'], dtype='<U10')

In [28]:
# Viewing actual species for the first 25 test observations
test['species'].head(25)

7          setosa
8          setosa
10         setosa
13         setosa
17         setosa
18         setosa
19         setosa
20         setosa
21         setosa
23         setosa
27         setosa
31         setosa
38         setosa
52     versicolor
66     versicolor
68     versicolor
70     versicolor
72     versicolor
89     versicolor
98     versicolor
103     virginica
109     virginica
111     virginica
114     virginica
116     virginica
Name: species, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']

In [29]:
# Creating Confusion Matrix
pd.crosstab(test['species'], preds, rownames=['Actual Species'], colnames=['Predicted Species'])

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,13,0,0
versicolor,0,5,2
virginica,0,0,12


In [30]:
from sklearn.metrics import accuracy_score
accuracy_score(test['species'], preds)

0.9375

In [31]:
# Predicting new flowers
preds = iris.target_names[clf.predict([[6.0, 3.6, 5.4, 2.0], [5.0, 3.6, 1.4, 2.0]])]

preds



array(['virginica', 'setosa'], dtype='<U10')