# Random Forest

In [2]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_iris
#loading the library with the iris dataset 

#loading the scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

#Setting random seed
np.random.seed(0)

In [6]:
# iris = pd.read_csv('C:/Users/USER/Downloads/iris.csv')
# iris.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [10]:
#Creating an object called iris with the iris data
iris = load_iris()
#print(iris)

#Creating a dataframe with the four feature variables
df = pd.DataFrame(iris.data, columns = iris.feature_names)

#viewing the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [11]:
# Adding a new column for the species name
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

#Viewing the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [399]:
# Creating Test and Train Data
df['is_train'] = np.random.uniform(0, 1, len(df)) <=.75
df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [400]:
#Creating dataframes with test rowa and training rows
train, test = df[df['is_train']==True], df[df['is_train']==False ]

#Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:', len(test))

Number of observations in the training data: 112
Number of observations in the test data: 38


In [401]:
# Create a list of the feature column's names
features = df.columns[:4]

# View features
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [402]:
# Converting each species name into digits
y = pd.factorize(train['species'])[0]

# Viewing target
y
 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2], dtype=int64)

In [403]:
# the 0 1 and 2 represent the flowers species types

# Creating a random forest classifier
clf = RandomForestClassifier(n_jobs = 2, random_state = 0)

# Training the classifier
clf.fit(train[features], y)

In [404]:
test[features]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
7,5.0,3.4,1.5,0.2
9,4.9,3.1,1.5,0.1
12,4.8,3.0,1.4,0.1
13,4.3,3.0,1.1,0.1
21,5.1,3.7,1.5,0.4
26,5.0,3.4,1.6,0.4
33,5.5,4.2,1.4,0.2
37,4.9,3.6,1.4,0.1
40,5.0,3.5,1.3,0.3
42,4.4,3.2,1.3,0.2


In [405]:
#Applying the trained Classifier to the test
clf.predict(test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2], dtype=int64)

In [406]:
# Let's exlpore the test features

#Viewing the predicted probabilities of the first 10 observations
clf.predict_proba(test[features])[10:20]

array([[1.  , 0.  , 0.  ],
       [0.  , 0.96, 0.04],
       [0.  , 0.93, 0.07],
       [0.  , 0.98, 0.02],
       [0.  , 0.98, 0.02],
       [0.  , 1.  , 0.  ],
       [0.  , 0.68, 0.32],
       [0.  , 0.98, 0.02],
       [0.  , 0.05, 0.95],
       [0.  , 0.99, 0.01]])

In [407]:
# Mapping names for the plants for each predicted plant class
y_preds = iris.target_names[clf.predict(test[features])]

# View the PREDICTED species for the first five observations
y_preds[0:25]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor'], dtype='<U10')

In [408]:
# Viewin the ACTUAL species for the first five observations
test['species'].head()

7     setosa
9     setosa
12    setosa
13    setosa
21    setosa
Name: species, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']

In [409]:
# Creating confusion matrix
pd.crosstab(test['species'], y_preds, rownames = ['Actual Species'], colnames = ['Predicted Species'])

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,11,0,0
versicolor,0,13,1
virginica,0,1,12


In [412]:
preds = iris.target_names[clf.predict( [[5.0, 3.6, 1.4, 0.2],[5.0, 3.6, 1.4, 0.2]] )]
preds



array(['setosa', 'setosa'], dtype='<U10')