# **Intelligent Systems for Bioinformatics**
### Exercise 9: Training and evaluating a RandomForestClassifier on the "iris.csv" dataset

In [1]:
import numpy as np
from si.io.csv_file import read_csv
from si.model_selection.split import train_test_split
from si.models.random_forest_classifier import RandomForestClassifier

In [2]:
# Load the iris data from a comma-separated file.
# NOTE(review): features=True / label=True presumably tell read_csv that the file
# carries feature names and a label column -- confirm against si.io.csv_file.
dataset = read_csv(
    "../datasets/iris/iris.csv",
    sep=",",
    features=True,
    label=True,
)

# Hold out 20% of the samples as a test partition; random_state is pinned so the
# split is intended to be the same on every run.
train_dataset, test_dataset = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each partition as a sanity check on the split.
print(f"Train dataset shape: {train_dataset.shape()}")
print(f"Test dataset shape: {test_dataset.shape()}")

Train dataset shape: (120, 4)
Test dataset shape: (30, 4)


In [3]:
# Configure the random forest. Hyperparameters used here:
#   - n_estimators=100   -> the forest is made of 100 decision trees
#   - min_sample_split=2 -> a node may be split once it holds at least 2 samples
#   - max_depth=5        -> caps the depth of each tree to guard against overfitting
#   - mode='gini'        -> Gini impurity is the splitting criterion
#   - seed=42            -> fixed seed (presumably for repeatable runs -- confirm
#                           in RandomForestClassifier)
rf_model = RandomForestClassifier(n_estimators=100, min_sample_split=2, max_depth=5, mode='gini', seed=42)

# Fit the forest on the training partition
rf_model.fit(train_dataset)

# Evaluate on the held-out test partition; the score is reported as accuracy
accuracy = rf_model.score(test_dataset)

print(f"\nAccuracy on test set: {accuracy:.4f}")

# Show the first ten predicted labels next to the corresponding true labels
predictions = rf_model.predict(test_dataset)
print("\n--- Sample Predictions ---")
print(f"\nPredicted: {predictions[:10]}")
print(f"\nActual:    {test_dataset.y[:10]}")


Accuracy on test set: 1.0000

--- Sample Predictions ---

Predicted: ['Iris-versicolor' 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor'
 'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica'
 'Iris-versicolor' 'Iris-versicolor']

Actual:    ['Iris-versicolor' 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor'
 'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica'
 'Iris-versicolor' 'Iris-versicolor']
