In [17]:
## Loading the Iris dataset loader from scikit-learn
from sklearn.datasets import load_iris
## Loading scikit random forest classifier library
from sklearn.ensemble import RandomForestClassifier
# Loading pandas and numpy
import pandas as pd
import numpy as np
# Seed NumPy's global random generator so that later np.random draws
# (and everything derived from them) are repeatable from run to run
np.random.seed(seed=0)

In [29]:
# Fetch the Iris dataset bundle and pull out its feature matrix and class labels
iris = load_iris()
X, y = iris.data, iris.target

# Wrap the feature matrix in a DataFrame whose columns carry the feature names,
# then show the first rows as a quick sanity check
df = pd.DataFrame(X, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [30]:
# Add a categorical 'Species' column: each integer target code is translated
# into the matching name from iris.target_names
df["Species"] = pd.Categorical.from_codes(
    codes=iris.target,
    categories=iris.target_names,
)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [31]:
# Flag each row as training data or not: draw one uniform number in [0, 1)
# per row and mark the row for training when the draw is <= 0.75
df['is_train'] = np.random.uniform(low=0, high=1, size=len(df)) <= 0.75

# Peek at the first rows to confirm the new boolean column is present
df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,False
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [32]:
# Partition the rows with the boolean 'is_train' column used directly as a mask:
# rows flagged True form the training set, and the negated mask (~) selects
# the remaining rows as the test set
train = df[df['is_train']]
test = df[~df['is_train']]

# Report how many observations landed in each partition
print("Number of observations in the training data:", len(train))
print("Number of observations in the test data:", len(test))

Number of observations in the training data: 112
Number of observations in the test data: 38


In [33]:
## Feature column names, taken by name from the dataset rather than by
## position in df, so the selection does not depend on the DataFrame's column order
features = pd.Index(iris.feature_names)
## View the selected feature names
print(features)

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')


In [34]:
### Encode each training row's species as an integer class label.
### The categorical codes follow the category order of the 'Species' column,
### which was built from iris.target_names, so label k always corresponds to
### iris.target_names[k]. pd.factorize would instead number the species by
### order of first appearance in `train`, which matches only when rows are sorted.
y = train['Species'].cat.codes.to_numpy(dtype='int64')
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2], dtype=int64)

In [35]:
## Build a random forest classifier with a fixed random_state and two parallel jobs
clf = RandomForestClassifier(random_state=0, n_jobs=2)
## Fit the forest on the training rows' feature columns against the labels y
clf.fit(train[features], y)

In [37]:
## Predict an integer class label for every row of the test set
clf.predict(X=test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [38]:
## View the predicted class probabilities for the first 10 test observations
## (one row per observation, one column per class)
clf.predict_proba(X=test[features])[:10]

array([[1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.97, 0.03, 0.  ],
       [0.97, 0.03, 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ]])

In [42]:
## Translate each predicted class index into its species name by indexing
## iris.target_names with the predicted labels, then show the first 25 names
predicted_codes = clf.predict(test[features])
preds = iris.target_names[predicted_codes]
preds[:25]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'virginica', 'virginica',
       'virginica', 'virginica', 'virginica', 'virginica'], dtype='<U10')

In [46]:
## Actual species of the first nine test rows, for comparison with the predictions
test['Species'].iloc[:9]

1     setosa
5     setosa
6     setosa
13    setosa
14    setosa
15    setosa
24    setosa
27    setosa
34    setosa
Name: Species, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']

In [47]:
## Create a confusion matrix: actual species as rows, predicted species as columns
pd.crosstab(
    index=test['Species'],
    columns=preds,
    rownames=["Actual Species"],
    colnames=["predicted species"],
)

predicted species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,10,0,0
versicolor,0,9,0
virginica,0,0,19
