In [21]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

In [23]:
# Seed NumPy's global random stream so the random draws made later in the
# notebook are repeatable on a fresh top-to-bottom run.
SEED = 0
np.random.seed(SEED)

In [24]:
# Load the iris dataset bundled with scikit-learn. Later cells read its
# .data, .feature_names, .target and .target_names attributes.
iris = load_iris()

In [25]:
# Put the measurement matrix into a DataFrame whose columns carry the
# dataset's own feature names.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

In [26]:
# Preview the first five rows of the feature table.
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [27]:
# Attach the label column: the integer targets become a categorical whose
# categories are the species names, in the order given by iris.target_names.
species_labels = pd.Categorical.from_codes(codes=iris.target, categories=iris.target_names)
df['species'] = species_labels
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [28]:
# Draw one uniform number per row; rows whose draw falls below 0.8 go to the
# training set and the remaining rows go to the test set (roughly an 80/20 split).
#
# The draws come from a generator seeded inside this cell rather than from the
# global NumPy stream, so re-running the cell (or running it out of order)
# always reproduces the same split. RandomState(0) yields the same sequence the
# global stream yields immediately after np.random.seed(0), so the result
# matches a fresh top-to-bottom run of the notebook.
split_rng = np.random.RandomState(0)
trainRandom = split_rng.rand(len(df)) < 0.8
train = df[trainRandom]
test = df[~trainRandom]

In [29]:
# Report how many rows ended up in each partition.
for label, subset in (
    ('Numero de observaciones en el set de entrenamiento = ', train),
    ('Numero de observaciones en el set de prueba = ', test),
):
    print(label, len(subset))

Numero de observaciones en el set de entrenamiento =  123
Numero de observaciones en el set de prueba =  27


In [30]:
# Names of the predictor columns. They are taken from the dataset's own
# feature list instead of slicing the first four columns of df by position,
# so the selection does not depend on how many columns df has or on their order.
features = pd.Index(iris.feature_names)
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [32]:
# Integer class labels for the training rows, taken from the categorical's own
# codes. The 'species' column was built with
# pd.Categorical.from_codes(iris.target, iris.target_names), so code k always
# means iris.target_names[k] -- which is how predictions are decoded later.
# pd.factorize numbers labels by order of first appearance instead, so it only
# agrees with iris.target_names while the training rows happen to start with
# setosa, then versicolor, then virginica.
# The cast to np.intp keeps the same integer dtype pd.factorize produced.
y = train['species'].cat.codes.to_numpy(dtype=np.intp)

In [34]:
# Configure the random forest (fixed seed, n_jobs=2) and fit it on the
# training features against the integer-encoded species labels.
forest_params = {'n_jobs': 2, 'random_state': 0}
clf = RandomForestClassifier(**forest_params)
clf.fit(train[features], y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [35]:
# Predicted integer class codes for the held-out rows.
clf.predict(X=test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2])

In [37]:
# Turn the predicted integer codes into species names by using them as
# positions into iris.target_names, then preview the first 15 predictions.
predicted_codes = clf.predict(test[features])
preds = iris.target_names[predicted_codes]
preds[:15]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'versicolor', 'versicolor', 'versicolor',
       'virginica', 'virginica', 'versicolor', 'versicolor'], dtype='<U10')

In [38]:
# Actual species of the first five held-out rows, for comparison with preds.
test.loc[:, 'species'].head(n=5)

7     setosa
8     setosa
13    setosa
17    setosa
19    setosa
Name: species, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']

In [39]:
# Confusion matrix: actual species down the rows, predicted species across
# the columns.
pd.crosstab(
    index=test['species'],
    columns=preds,
    rownames=['Actual Species'],
    colnames=['Predicted Species'],
)

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,8,0,0
versicolor,0,5,2
virginica,0,0,12


In [40]:
# Pair each feature column name with the importance score the fitted forest
# assigned to it (iterating a DataFrame yields its column labels).
[
    (column_name, importance)
    for column_name, importance in zip(train[features].columns, clf.feature_importances_)
]

[('sepal length (cm)', 0.0889479467793623),
 ('sepal width (cm)', 0.0234427551884027),
 ('petal length (cm)', 0.44105878808840404),
 ('petal width (cm)', 0.44655050994383105)]