# Obtención de datos

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
from sklearn.datasets import load_breast_cancer

# Load the dataset and expose its feature matrix as a DataFrame whose
# columns are named after the dataset's features.
cancer_dataset = load_breast_cancer()
df = pd.DataFrame(
    data=cancer_dataset.data,
    columns=cancer_dataset.feature_names,
)
# The frame keeps the default 0..n-1 row index, so the label array lines up
# with the rows as-is.
df['Target'] = cancer_dataset.target
df.head()

In [None]:
# Feature matrix: every column except the label.
x = df.drop(columns="Target")
# Label vector.
y = df["Target"]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split
# repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=100,
)

# Creando nuestro primer bosque aleatorio

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 100 trees; the fixed seed makes training repeatable.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=100,
)

In [None]:
# Train the forest on the training split. As the last expression in the
# cell, the value returned by fit() is what the notebook displays.
clf.fit(x_train, y_train)

In [None]:
# Predict on both splits so training and test performance can be compared.
y_train_predict, y_test_predict = (
    clf.predict(x_train),
    clf.predict(x_test),
)

In [None]:
# Display the collection of fitted trees that make up the forest.
clf.estimators_

### Exportando un árbol obtenido por el bosque aleatorio.

In [None]:
# Take the first fitted tree of the forest for individual inspection.
arbol = clf.estimators_[0]

In [None]:
# Feature names as a plain list, in the column order of x.
# list() copies the column index directly; no manual append loop is needed.
feature_labels = list(x.columns)

print(feature_labels)

In [None]:
# Human-readable class names used when drawing the tree.
# NOTE(review): this assumes label 0 is 'malignant' and label 1 is 'benign';
# confirm against cancer_dataset.target_names.
class_labels = ['malignant', 'benign']

In [None]:
from sklearn.tree import export_graphviz

# Write the selected tree to disk, with nodes colour-filled and drawn with
# rounded corners.
# NOTE(review): export_graphviz produces Graphviz DOT text, so the ".ps"
# extension does not make this a PostScript file. The name is kept as-is;
# consider ".dot" if nothing else depends on it.
export_graphviz(
    arbol,
    out_file="RandomForest0.ps",
    feature_names=feature_labels,
    class_names=class_labels,
    filled=True,
    rounded=True,
)

### Tu árbol debe ser muy parecido al que tienes aquí en la libreta.

<center><img src="img/RandomForest1.png" width = "100%"></center>

### Usando el ciclo "for" podemos grabar todos los diferentes árboles que produce el bosque.

In [None]:
# Feature names as a plain list, in the column order of x.
feature_labels = list(x.columns)

# NOTE(review): this assumes label 0 is 'malignant' and label 1 is 'benign';
# confirm against cancer_dataset.target_names.
class_labels = ['malignant', 'benign']

# Write one file per fitted tree, named by the tree's position in the forest
# ("0.ps", "1.ps", ...).
# NOTE(review): export_graphviz produces Graphviz DOT text, so the ".ps"
# extension does not make these PostScript files. The names are kept as-is.
for i, arbol in enumerate(clf.estimators_):
    file_name = f"{i}.ps"
    export_graphviz(
        arbol,
        out_file=file_name,
        feature_names=feature_labels,
        class_names=class_labels,
        filled=True,
        rounded=True,
    )

In [None]:
from sklearn.metrics import classification_report

# Print the report for the training split first, then for the test split.
for actual, predicted in (
    (y_train, y_train_predict),
    (y_test, y_test_predict),
):
    print(classification_report(actual, predicted))

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix for the training split, drawn as an annotated heatmap.
# confusion_matrix(y_true, y_pred) puts the true classes on the rows and the
# predicted classes on the columns. The heatmap draws rows along the y-axis
# and columns along the x-axis, so y is "real" and x is "predicted".
cm = confusion_matrix(y_train, y_train_predict)
sns.heatmap(cm, linewidth = 0.5, annot = True, cmap = 'Reds', fmt = 'g')
plt.ylabel('Valores reales')
plt.xlabel('Valores predecidos')
plt.title('Comparación con valores de entrenamiento')
plt.show()

In [None]:
# Confusion matrix for the test split, drawn as an annotated heatmap.
# confusion_matrix(y_true, y_pred) puts the true classes on the rows and the
# predicted classes on the columns. The heatmap draws rows along the y-axis
# and columns along the x-axis, so y is "real" and x is "predicted".
cm = confusion_matrix(y_test, y_test_predict)
sns.heatmap(cm, linewidth = 0.5, annot = True, cmap = 'Reds', fmt = 'g')
plt.ylabel('Valores reales')
plt.xlabel('Valores predecidos')
plt.title('Comparación con valores de prueba')
plt.show()

# Visualizando la importancia de las características para el modelo

In [None]:
# Display the raw importance scores the fitted forest assigns to the features.
clf.feature_importances_

In [None]:
# Label each importance score with the name of the feature it belongs to.
feature_imp = pd.Series(
    data=clf.feature_importances_,
    index=x.columns,
)
feature_imp

In [None]:
# Order the features from most to least important.
feature_imp = feature_imp.sort_values(ascending = False)
feature_imp

In [None]:
# Horizontal bar chart of the importances; the tall figure leaves room for
# one bar per feature.
fig, ax = plt.subplots(figsize=(6, 15))
sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax)
ax.set_xlabel('Importancia')
ax.set_ylabel('Característica')
ax.set_title("Visualizando la importancia de cada Característica")
plt.show()

# Hagamos una base de datos reducida tomando en cuenta las cinco características más importantes

In [None]:
# Show the first five entries of the importance series.
feature_imp.head()

In [None]:
# Name of the feature in the first position of the importance series.
feature_imp.index[0]

In [None]:
# Names of the first five features in the importance series.
# Slicing replaces the manual index loop and does not raise if the series
# holds fewer than five entries.
columns = list(feature_imp.index[:5])

columns

In [None]:
# Start with an empty frame that has only the selected column names; head()
# shows that it holds no rows yet.
df_reduced = pd.DataFrame(columns = columns)
df_reduced.head()

In [None]:
# Build the reduced frame from the first five features of the importance
# series in one selection. .copy() makes it independent of df, so later
# edits to one do not affect the other.
df_reduced = df.loc[:, feature_imp.index[:5]].copy()

df_reduced.head()

In [None]:
# Retrain and re-evaluate the forest using only the reduced feature set.
x = df_reduced
y = df.Target

# Same test_size and random_state as the split made on the full feature set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=100)

clf = RandomForestClassifier(n_estimators=100, random_state=100)
clf.fit(x_train, y_train)

y_train_predict = clf.predict(x_train)
y_test_predict = clf.predict(x_test)

print(classification_report(y_train, y_train_predict))
print(classification_report(y_test, y_test_predict))

# One confusion-matrix heatmap per split, training first.
# confusion_matrix(y_true, y_pred) puts the true classes on the rows and the
# predicted classes on the columns. The heatmap draws rows along the y-axis
# and columns along the x-axis, so y is "real" and x is "predicted".
for actual, predicted, title in (
    (y_train, y_train_predict, 'Comparación con valores de entrenamiento'),
    (y_test, y_test_predict, 'Comparación con valores de prueba'),
):
    cm = confusion_matrix(actual, predicted)
    sns.heatmap(cm, linewidth = 0.5, annot = True, cmap = 'Reds', fmt = 'g')
    plt.ylabel('Valores reales')
    plt.xlabel('Valores predecidos')
    plt.title(title)
    plt.show()

In [None]:
# Importance scores of the retrained forest, labelled by feature name and
# ordered from most to least important.
feature_imp = pd.Series(
    data=clf.feature_importances_,
    index=x.columns,
).sort_values(ascending=False)

# Horizontal bar chart of the importances.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax)
ax.set_xlabel('Importancia')
ax.set_ylabel('Característica')
ax.set_title("Visualizando la importancia de cada Característica")
plt.show()