# Experimenting with the Random Forest Model and Iris Dataset

Looking to see how accuratly the Iris dataset can be determined using a Random Forest Model

## Imports and Reading Data

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

data = pd.read_csv('data/iris.csv')
n = len(data)
# Used this to randomize the data
data = data.sample(frac=1).reset_index(drop=True)
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.4,1.5,0.2,Iris-setosa
1,6.4,2.8,5.6,2.2,Iris-virginica
2,6.8,2.8,4.8,1.4,Iris-versicolor
3,6.0,3.4,4.5,1.6,Iris-versicolor
4,4.7,3.2,1.6,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.3,5.7,2.1,Iris-virginica
146,6.2,2.2,4.5,1.5,Iris-versicolor
147,6.4,3.2,5.3,2.3,Iris-virginica
148,5.6,3.0,4.1,1.3,Iris-versicolor


## Data Cleaning

In [3]:
# Wrote this to convert the names of the flowers to the specific type that they are
def parse_flower_type(Y):
    y = []
    for i in range(len(Y)):
        if (Y[i] == 'Iris-setosa'):
            y.append(1)
        elif (Y[i] == 'Iris-versicolor'):
            y.append(2)
        else:
            y.append(3)
    return y


In [4]:
training_factor = 0.6
training_size = round(training_factor * n)


Y_train = data.iloc[:training_size, 4].values
y_train = parse_flower_type(Y_train)

Y_test = data.iloc[training_size:, 4].values
y_test = parse_flower_type(Y_test)
x_train = data.iloc[:training_size, 0:4]
x_test = data.iloc[training_size:, 0:4]


## Model Fitting and Testing

In [11]:
reg = RandomForestRegressor(n_estimators=100, max_depth=2)

reg.fit(x_train, y_train)

pred = reg.predict(x_test)

mse = mean_squared_error(y_test, pred)
r2 = r2_score(y_test, pred)

print(f"Mean-Squared Error: {mse}")
print(f"R2 Score: {r2}")

Mean-Squared Error: 0.03528404639786864
R2 Score: 0.9483438116989317
1.0


