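The Iris flower dataset used in this notebook can be downloaded from Kaggle: https://www.kaggle.com/datasets/arshid/iris-flower-dataset. Save the CSV as `iris.csv` in the notebook's working directory, which is where the loading cell below reads it from.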

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from src.algorithms.decision_tree.decision_tree_classifier import DecisionTreeClassifier
from src.algorithms.random_forest.random_forest_classifier import RandomForestClassifier

# Column names applied to the CSV; the file's own first row is skipped and
# replaced by these names.
col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'type']
df = pd.read_csv("iris.csv", skiprows=1, header=None, names=col_names)

# Encode the species strings in the 'type' column as integer class ids.
mapping_dict = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
encoded_type = df['type'].map(mapping_dict)

# Series.map yields NaN for any value that is not a key of mapping_dict.
# Fail loudly here instead of letting NaN targets reach the classifiers.
unmapped_labels = df.loc[encoded_type.isna(), 'type'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'type' column (not in mapping_dict): {unmapped_labels.tolist()}"
    )
df['type'] = encoded_type

# Disabled experiment: shuffles the labels in place. Presumably a sanity check
# that accuracy drops once labels are randomised — confirm before re-enabling.
# df['type'] = np.random.permutation(df['type'].values)

df.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,type
1,5.1,3.5,1.4,0.2,0
2,4.9,3.0,1.4,0.2,0
3,4.7,3.2,1.3,0.2,0
4,4.6,3.1,1.5,0.2,0
5,5.0,3.6,1.4,0.2,0
6,5.4,3.9,1.7,0.4,0
7,4.6,3.4,1.4,0.3,0
8,5.0,3.4,1.5,0.2,0
9,4.4,2.9,1.4,0.2,0
10,4.9,3.1,1.5,0.1,0


In [2]:
# Positional split of the frame: every column except the last is a feature,
# the last column holds the class label.
X = df.iloc[:, :-1].to_numpy()
# Labels are kept as a column vector of shape (n_samples, 1).
y = df.iloc[:, -1].to_numpy().reshape(-1, 1)

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [3]:
# Build a single decision tree with the Gini criterion, fit it on the
# training split, and print the learned splits.
classifier = DecisionTreeClassifier(
    min_samples_split=3,
    max_depth=3,
    criterion='gini_index',
)
classifier.fit(X_train, y_train)
classifier.print_tree()

X idx: 2  <=  1.9  ? | Info gain:  0.3121088435374149
 left: 0.0
 right: X idx: 2  <=  4.7  ? | Info gain:  0.25028684053074307
  left: X idx: 3  <=  1.5  ? | Info gain:  0.018470418470418425
    left: 1.0
    right: 2.0
  right: X idx: 3  <=  1.7  ? | Info gain:  0.027058036814134382
    left: X idx: 2  <=  4.9  ? | Info gain:  0.0126984126984127
        left: 1.0
        right: 2.0
    right: X idx: 2  <=  4.8  ? | Info gain:  0.005772005772005727
        left: 2.0
        right: 2.0


In [20]:
# Score the tree on the same rows it was fitted on (training accuracy).
y_train_pred = classifier.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print('Accuracy: ', train_accuracy)

Accuracy:  0.9714285714285714


In [5]:
# Display the full array of training-set predictions returned by classifier.predict(X_train).
y_train_pred

array([1., 2., 2., 1., 2., 1., 2., 1., 0., 2., 1., 0., 0., 0., 1., 2., 0.,
       0., 0., 1., 0., 1., 2., 0., 1., 2., 0., 2., 2., 1., 1., 2., 1., 0.,
       1., 2., 0., 0., 1., 2., 0., 2., 0., 0., 2., 1., 2., 2., 2., 2., 1.,
       0., 0., 2., 2., 0., 0., 0., 1., 2., 0., 2., 2., 0., 1., 1., 2., 1.,
       2., 0., 2., 1., 2., 1., 1., 1., 0., 1., 1., 0., 1., 2., 2., 0., 1.,
       2., 2., 0., 2., 0., 1., 2., 2., 1., 2., 1., 1., 2., 2., 0., 1., 2.,
       0., 1., 2.])

In [7]:
# Display the tree's predict_proba output for the training rows. The recorded
# output has three columns per row — presumably one probability per class id
# (0, 1, 2); confirm against the classifier implementation.
classifier.predict_proba(X_train)

array([[0.        , 1.        , 0.        ],
       [0.        , 0.33333333, 0.66666667],
       [0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [0.

In [9]:
# Score the tree on the held-out test split.
y_test_pred = classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Accuracy: ', test_accuracy)

0.9555555555555556


In [12]:
# Build and fit the random forest on the training split.
# NOTE(review): the three positional arguments are unnamed here. Presumably they
# are the number of trees, min_samples_split and max_depth (the last two
# mirroring the decision tree settings) — confirm against the
# RandomForestClassifier signature and prefer keyword arguments.
# NOTE(review): no seed is passed; if the forest samples rows/features randomly,
# the results below may differ between runs — confirm.
randomforest = RandomForestClassifier(100, 3, 3)
randomforest.fit(X_train, y_train)

In [13]:
# Predict the held-out rows with the forest and score them against the true labels.
# A bare `preds_rf` expression in the middle of the cell was removed: only the
# last expression of a cell is displayed, so it had no effect.
preds_rf = randomforest.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, preds_rf))

1.0


In [15]:
# Compute the forest's predict_proba output for the test rows and display it.
# The recorded output has three columns per row — presumably one probability
# per class id (0, 1, 2); confirm against the forest implementation.
preds_rf_proba = randomforest.predict_proba(X_test)
preds_rf_proba

array([[4.13435150e-14, 9.93033355e-01, 6.96664536e-03],
       [9.80468266e-01, 1.74010247e-02, 2.13070973e-03],
       [0.00000000e+00, 2.74185475e-06, 9.99997258e-01],
       [4.13435195e-14, 7.21196510e-01, 2.78803490e-01],
       [2.84672570e-18, 7.84698651e-01, 2.15301349e-01],
       [9.89582837e-01, 1.04170042e-02, 1.58945719e-07],
       [1.58942467e-20, 9.89395822e-01, 1.06041779e-02],
       [1.14440918e-06, 1.17203170e-02, 9.88278539e-01],
       [2.93438293e-07, 7.49381863e-01, 2.50617843e-01],
       [4.15212163e-14, 9.85221804e-01, 1.47781957e-02],
       [1.08991355e-06, 5.99944998e-03, 9.93999460e-01],
       [9.99990582e-01, 8.46385992e-06, 9.53674680e-07],
       [9.81249502e-01, 1.73298843e-02, 1.42061349e-03],
       [9.87536245e-01, 7.99908659e-03, 4.46466818e-03],
       [9.97685111e-01, 2.31488911e-03, 3.12802288e-22],
       [8.17435170e-07, 4.15521852e-01, 5.84477331e-01],
       [1.19209331e-07, 4.04659654e-03, 9.95953284e-01],
       [1.14159632e-20, 9.89395

In [16]:
# Feature importance of the single tree using the method's default measure.
# The recorded output is a dict keyed by feature column index; only indices 2
# and 3 appear, matching the only features used in the printed tree.
classifier.feature_importance()

{2: 0.5808661025385764, 3: 0.04552845528455281}

In [17]:
# Feature importance of the single tree with the 'n_splits' measure. The
# recorded output holds integer counts per feature index — presumably the
# number of split nodes that test each feature; confirm in the implementation.
classifier.feature_importance('n_splits')

{2: 4, 3: 2}

In [18]:
# Feature importance of the forest using the method's default measure. The
# recorded output is a dict keyed by feature column index (0-3).
# NOTE(review): how per-tree values are combined is not visible here — confirm.
randomforest.feature_importance()

{0: 0.10767871132213996,
 1: 0.04013755357066883,
 2: 0.1899054227539028,
 3: 0.2094563800088434}

In [19]:
# Feature importance of the forest with the 'n_splits' measure. The recorded
# values are fractional, so they are presumably split counts averaged over the
# trees — confirm in the implementation.
randomforest.feature_importance('n_splits')

{0: 2.7, 1: 1.54, 2: 1.21, 3: 1.38}