In [1]:
# Loading in the data

import pandas as pd

iris = pd.read_csv('/Users/Tejas/csv_files/iris.csv')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [2]:
# Label encoding the target variable for classification

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
iris_species_encoded = pd.Series(le.fit_transform(iris.species))

iris = pd.concat([iris, iris_species_encoded], axis=1)
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,0
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,0
4,5.0,3.6,1.4,0.2,setosa,0


In [3]:
# Renaming the column, because python thought it would be a good idea to name the column as int 0 after Label Encoding

iris = iris.rename(columns={0:'species_encoded'})
iris.drop('species', axis=1, inplace=True)
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_encoded
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [4]:
# Setosa ==> 0
# Virginica ==> 1
# Versicolor ==> 2

In [5]:
# Defining model

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

In [6]:
# Creating train and test sets

from sklearn.model_selection import train_test_split

X = iris.copy()
y = X.pop('species_encoded')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, train_size=.8, random_state=42)

In [7]:
X_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
22,4.6,3.6,1.0,0.2
15,5.7,4.4,1.5,0.4
65,6.7,3.1,4.4,1.4
11,4.8,3.4,1.6,0.2
42,4.4,3.2,1.3,0.2


In [8]:
X_test.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
73,6.1,2.8,4.7,1.2
18,5.7,3.8,1.7,0.3
118,7.7,2.6,6.9,2.3
78,6.0,2.9,4.5,1.5
76,6.8,2.8,4.8,1.4


In [9]:
y_train.head()

22    0
15    0
65    1
11    0
42    0
Name: species_encoded, dtype: int64

In [10]:
# Get an error metric 

from sklearn.metrics import mean_absolute_error

rfc.fit(X_train, y_train)
predictions = rfc.predict(X_test)
mae = mean_absolute_error(y_test, predictions)

print(f"MAE: {mae}")

MAE: 0.0


In [11]:
predictions

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])

In [12]:
# We get an MAE of 0

In [13]:
# Let's try cross validation
from sklearn.model_selection import cross_val_score
import numpy as np

rfc_scores = cross_val_score(rfc, X_train, y_train, scoring='neg_mean_squared_error', cv=10)
rfc_scores_rmse = np.sqrt(-rfc_scores)

def display_scores(scores):
    print(f"Scores: {scores}")
    print(f"Mean Scores: {scores.mean()}")
    print(f"Standard Deviation: {scores.std()}")
    
    
display_scores(rfc_scores_rmse)

Scores: [0.28867513 0.         0.28867513 0.         0.57735027 0.40824829
 0.         0.         0.         0.28867513]
Mean Scores: 0.18516239634379275
Standard Deviation: 0.20177930265570895
