In [39]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

In [40]:
np.random.seed(0) 

# Preparar dataset para Machine Learning

In [41]:
iris = load_iris()

In [42]:
df = pd.DataFrame(iris.data, columns=iris.feature_names)

In [43]:
df.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [44]:
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

In [45]:
df.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [46]:
trainRandom = np.random.rand(len(df)) < 0.8

In [47]:
train = df[trainRandom]

In [48]:
len(train)

123

In [49]:
test = df[~trainRandom]  #Asignar el sobrante del DF

In [50]:
len(test)

27

In [51]:
print('Number of observations in the training set:', len(train))
print('Number of observations in the test set:', len(test))

Number of observations in the training set: 123
Number of observations in the test set: 27


# Validacion de Datos

In [52]:
features = df.columns[:4]

In [53]:
df.species

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']

In [54]:
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [55]:
# Pasar el texto de respuesta a numero
y = pd.factorize(train['species'])[0]
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [56]:
# Clasificador, con 2 "trabajos" y un random de 0
clf = RandomForestClassifier(n_jobs=2, random_state= 0)
clf.fit(train[features], y)

RandomForestClassifier(n_jobs=2, random_state=0)

In [57]:
clf.predict(test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2], dtype=int64)

In [58]:
preds = iris.target_names[clf.predict(test[features])]
preds[0:15]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'versicolor', 'versicolor', 'versicolor',
       'virginica', 'virginica', 'versicolor', 'versicolor'], dtype='<U10')

In [59]:
test['species'].head()

7     setosa
8     setosa
13    setosa
17    setosa
19    setosa
Name: species, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']

In [60]:
pd.crosstab(test['species'], preds, rownames= ['Actual Species'], colnames= ['Predicted Species'])
# Matriz de confusion con xy.labels

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,8,0,0
versicolor,0,5,2
virginica,0,0,12


In [61]:
list(zip(train[features], clf.feature_importances_))

[('sepal length (cm)', 0.0889479467793623),
 ('sepal width (cm)', 0.0234427551884027),
 ('petal length (cm)', 0.44105878808840404),
 ('petal width (cm)', 0.44655050994383105)]