# Building a Classification Model for the Mushroom data set

Chanin Nantasenamat

<i>Data Professor YouTube channel, http://youtube.com/dataprofessor </i>

In this Jupyter notebook, we will be building a classification model for the Mushroom data set (`agaricus-lepiota.data`) using the random forest algorithm.

## 1. Import libraries

In [12]:
# from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import pandas as pd

## 2. Load the *mushroom* data set

In [13]:
# Location of the raw data file.
file_path = "D:/Data project/DATA science/Github/Machine-Learning-Repository/Mushroom dataset/mushroom metadata/agaricus-lepiota.data" # Replace with the actual file path

# Column labels, in file order. They are supplied here rather than read from
# the file, so the first line of the file is treated as a data row.
column_names = [
    # target
    "class",
    # cap
    "cap-shape", "cap-surface", "cap-color",
    # bruises / odor
    "bruises", "odor",
    # gill
    "gill-attachment", "gill-spacing", "gill-size", "gill-color",
    # stalk
    "stalk-shape", "stalk-root",
    "stalk-surface-above-ring", "stalk-surface-below-ring",
    "stalk-color-above-ring", "stalk-color-below-ring",
    # veil / ring
    "veil-type", "veil-color", "ring-number", "ring-type",
    # remaining attributes
    "spore-print-color", "population", "habitat",
]

# Load everything into a DataFrame; header=None is what pandas already assumes
# when `names` is given, spelled out here for the reader.
data = pd.read_csv(file_path, header=None, names=column_names)

## 3. Input features
The ***mushroom*** data set contains 22 categorical input features and 1 output variable (the class label).

### 3.1. Input features

In [14]:
# Show the loaded frame; pandas abbreviates large frames with "..." rows/columns.
data

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


### 3.2. Output features

In [15]:
# The target column as loaded from the file (letter codes, dtype object).
print(data['class'])

0       p
1       e
2       e
3       p
4       e
       ..
8119    e
8120    e
8121    e
8122    p
8123    e
Name: class, Length: 8124, dtype: object


## 4. Glimpse of the data

### 4.1. Input features

In [16]:
# Every column except the target. drop() returns a new frame; `data` itself is not modified.
data.drop(columns = 'class')

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,k,s,n,f,n,a,c,b,y,e,...,s,o,o,p,o,o,p,b,c,l
8120,x,s,n,f,n,a,c,b,y,e,...,s,o,o,p,n,o,p,b,v,l
8121,f,s,n,f,n,a,c,b,n,e,...,s,o,o,p,o,o,p,b,c,l
8122,k,y,n,f,y,f,c,n,b,t,...,k,w,w,p,w,o,e,w,v,l


### 4.2. Output variable (the Class label)

In [17]:
# The target column again, this time selected by position (column 0 is 'class').
data.iloc[:,0]

0       p
1       e
2       e
3       p
4       e
       ..
8119    e
8120    e
8121    e
8122    p
8123    e
Name: class, Length: 8124, dtype: object

### 4.3. Assigning *input* and *output* variables
Let's one-hot encode the 22 input variables and assign them to X, and assign the binary output variable (class label) to Y

In [18]:
# Convert "class" to binary: 'p' -> 1, anything else -> 0.
# `isin(['p', 1])` also accepts the already-encoded value 1, so re-running this
# cell leaves the column unchanged. The previous `== 'p'` comparison reset every
# label to 0 on a second run, because by then the column holds integers.
data['class'] = data['class'].isin(['p', 1]).astype(int)

# One-hot encode the remaining string-valued columns; the numeric 'class'
# column is passed through as-is.
data_ohe = pd.get_dummies(data)
print(data_ohe.shape)

# Feature matrix (all indicator columns) and target vector.
X = data_ohe.drop(columns=['class'])
Y = data_ohe['class']

(8124, 118)


### 4.4. Let's examine the data dimension

In [19]:
# (number of rows, number of one-hot encoded feature columns)
X.shape

(8124, 117)

In [20]:
# One label per row of X.
Y.shape

(8124,)

## 5. Build Classification Model using Random Forest

In [21]:
# Random forest with scikit-learn's default hyperparameters. random_state is
# pinned so the fitted trees and the feature importances printed later are
# the same on every run.
clf = RandomForestClassifier(random_state=42)

In [22]:
# Fit on the full data set. scikit-learn's fit() returns the estimator itself,
# so `model` and `clf` are two names for the same fitted object.
model = clf.fit(X, Y)

## 6. Feature Importance

In [23]:
# One importance value per column of X, in the same order as X.columns.
print(clf.feature_importances_)

[2.07941894e-03 7.83725331e-05 6.34663082e-04 4.40455542e-04
 7.71680238e-04 8.15802010e-04 5.48777175e-03 1.79120297e-04
 6.87866464e-03 8.59345994e-04 3.11878961e-03 6.70590136e-04
 1.28515927e-05 1.02881491e-03 1.34108332e-03 2.31281989e-03
 2.80962421e-04 3.19333465e-04 3.58888411e-03 4.31691271e-03
 2.91429520e-02 2.46306289e-02 5.45385505e-03 9.75213046e-03
 7.99512796e-02 4.14398617e-03 3.49828389e-04 1.31495807e-01
 1.56063043e-02 3.27591515e-03 4.03863632e-03 9.84098027e-05
 1.94008809e-03 1.66081940e-02 1.38341398e-02 4.79938847e-02
 6.44400084e-02 3.85209392e-02 2.67778573e-04 1.05422585e-03
 2.60336840e-04 2.84957913e-04 2.58716392e-03 1.04670066e-05
 9.20985328e-05 1.83971247e-03 2.31903364e-04 2.08675031e-03
 1.77815540e-04 9.91716562e-03 1.90163000e-02 1.48488343e-02
 1.53565538e-02 1.02374433e-02 2.39154055e-02 2.53503744e-03
 1.70943650e-03 3.98545186e-02 2.92929740e-02 2.48933288e-04
 6.15232288e-03 3.65199442e-02 6.75657709e-03 3.35355404e-03
 1.80708150e-03 8.654210

## 7. Make Prediction

In [24]:
# Feature values of the row at position 8123; the slice 1: skips column 0 ('class').
data_ohe.iloc[8123,1:].values

array([0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0], dtype=int32)

In [None]:
# Predict the class of the row at position 8123. Taking the row from X with a
# list index keeps it as a one-row DataFrame, so column order and feature names
# match what the model was fitted on. This replaces a hand-copied 117-value
# literal of the same row.
print(clf.predict(X.iloc[[8123]]))

[0]




In [26]:
# Predict the class of the row at position 8123. Taking the row from X with a
# list index keeps it as a one-row DataFrame, so column order and feature names
# match what the model was fitted on. This replaces a hand-copied 117-value
# literal of the same row.
print(clf.predict(X.iloc[[8123]]))

[0]




In [27]:
# Class probabilities for the row at position 8123 (one column per class, in
# clf.classes_ order). The row is taken straight from X as a one-row DataFrame
# instead of a hand-copied 117-value literal, so feature order and names
# always match the fitted model.
print(clf.predict_proba(X.iloc[[8123]]))

[[1. 0.]]




## 8. Data split (80/20 ratio)

In [28]:
# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle,
# and therefore the split sizes, test labels and score shown below,
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [29]:
# Shapes of the training features and training labels.
X_train.shape, Y_train.shape

((6499, 117), (6499,))

In [30]:
# Shapes of the held-out test features and test labels.
X_test.shape, Y_test.shape

((1625, 117), (1625,))

## 9. Rebuild the Random Forest Model

In [31]:
# Refit the same estimator object on the training split only. This replaces the
# earlier fit on the full data set; `model` refers to this same object, so it
# now reflects the training-split fit as well.
clf.fit(X_train, Y_train)

### 9.1. Perform prediction on a single sample from the data set

In [32]:
# Predict the class of the row at position 8123 with the refitted model. Taking
# the row from X with a list index keeps it as a one-row DataFrame, so column
# order and feature names match what the model was fitted on. This replaces a
# hand-copied 117-value literal of the same row.
print(clf.predict(X.iloc[[8123]]))

[0]




In [33]:
# Class probabilities for the row at position 8123 from the refitted model (one
# column per class, in clf.classes_ order). The row is taken straight from X as
# a one-row DataFrame instead of a hand-copied 117-value literal, so feature
# order and names always match the fitted model.
print(clf.predict_proba(X.iloc[[8123]]))

[[1. 0.]]




### 9.2. Perform prediction on the test set

#### *Predicted class labels*

In [34]:
# Predicted labels for every row of the held-out test split.
print(clf.predict(X_test))

[0 1 1 ... 0 0 1]


#### *Actual class labels*

In [35]:
# True labels for the same held-out rows (indexed by their original row number).
print(Y_test)

2055    0
4370    1
6095    1
398     0
4144    1
       ..
5962    0
5061    1
7699    0
7       0
4676    1
Name: class, Length: 1625, dtype: int32


## 10. Model Performance

In [36]:
# Mean accuracy of the predictions on the held-out test split.
print(clf.score(X_test, Y_test))

1.0


## 11. Visualizing the Decision Trees

In [42]:
from pathlib import Path

from sklearn.tree import export_graphviz

# Directory that receives one Graphviz .dot file per tree in the forest.
tree_dir = Path('D:/Data project/DATA science/Github/Machine-Learning-Repository/Mushroom dataset/Deferensial method/Random forest/trees')
# The export writes straight to the given file path, so the directory has to
# exist beforehand; create it (and any missing parents) if necessary.
tree_dir.mkdir(parents=True, exist_ok=True)

# `model` is the object returned by clf.fit(...), i.e. the classifier itself,
# so these are the trees from the most recent fit of `clf`.
estimators = model.estimators_
feature_names_ = list(X.columns)

# Export every decision tree of the forest to its own Tree_<i>.dot file.
for i, tree in enumerate(estimators):
    export_graphviz(tree,
                    out_file=str(tree_dir / f'Tree_{i}.dot'),
                    feature_names=feature_names_,
                    class_names=['0', '1'],
                    filled=True,
                    rounded=True)

# You will have one .dot file for each tree in the random forest
