# Obtención y preparación de datos

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset and wrap its feature matrix in a DataFrame
# whose columns carry the dataset's feature names.
cancer_dataset = load_breast_cancer()
df = pd.DataFrame(data=cancer_dataset.data, columns=cancer_dataset.feature_names)

# Append the class label as an extra column. The frame was just built with the
# default positional index, so the label array lines up row for row.
df['Target'] = cancer_dataset.target
df.head()

In [None]:
# Feature matrix: every column except the label. Label vector: the label column.
x = df.drop(columns="Target")
y = df["Target"]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# identical from one run to the next.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=100)

# Árbol de clasificación

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree: scikit-learn permutes the features at every split, so without
# a fixed random_state equally good splits can be chosen differently on each
# run and the exported tree changes. The seed matches the one used for the
# train/test split.
clf = DecisionTreeClassifier(random_state=100)

In [None]:
# Train the decision tree on the training split.
clf.fit(x_train, y_train)

In [None]:
# Predict on both splits so training and test performance can be compared.
y_train_predict, y_test_predict = (
    clf.predict(split) for split in (x_train, x_test)
)

https://scikit-learn.org/stable/modules/generated/sklearn.tree.export_graphviz.html#sklearn.tree.export_graphviz

In [None]:
from sklearn.tree import export_graphviz

# Dump the fitted tree as Graphviz DOT text. The file content is DOT source
# regardless of the ".ps" extension, so it can be pasted into an online
# Graphviz renderer.
export_graphviz(
    clf,
    out_file="Tree.ps",
)

### ¿Cómo transformar el código en una imagen?
Alternativa 1: https://edotor.net/
<br>Alternativa 2: http://www.webgraphviz.com/
<br>Alternativa 3: https://stamm-wilbrandt.de/GraphvizFiddle/#

### Tu árbol debe ser muy parecido al que tienes aquí en la libreta.

<center><img src="img/ClasfTree1.png" width = "100%"></center>

In [None]:
# Display the feature column names of the feature matrix.
x.columns

In [None]:
# Plain Python list of the feature names, in column order, for labelling the
# exported tree.
feature_labels = list(x.columns)

print(feature_labels)

In [None]:
# Human-readable class names used when exporting the tree.
# NOTE(review): the order must follow the ascending encoded target values
# (presumably 0 = malignant, 1 = benign) — confirm against
# cancer_dataset.target_names.
class_labels = ['malignant', 'benign']

In [None]:
# Export the tree again as DOT text, this time with feature and class names on
# the nodes, filled node colours and rounded boxes.
export_graphviz(
    clf,
    out_file="TreeWithInfo.ps",
    feature_names=feature_labels,
    class_names=class_labels,
    filled=True,
    rounded=True,
)

### Tu árbol debe ser muy parecido al que tienes aquí en la libreta.

<center><img src="img/ClasfTree2.png" width = "100%"></center>

### Reporte de la clasificación

In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1: first for the training split, then for the
# held-out test split.
for y_true, y_pred in ((y_train, y_train_predict), (y_test, y_test_predict)):
    print(classification_report(y_true, y_pred))

### Matriz de confusión

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# confusion_matrix(y_true, y_pred) puts the true classes on the rows and the
# predicted classes on the columns. The heatmap draws rows along the y-axis
# and columns along the x-axis, so y is the actual value and x the prediction.
cm = confusion_matrix(y_train, y_train_predict)
sns.heatmap(cm, linewidth = 0.5, annot = True, cmap = 'Reds', fmt = 'g')
plt.ylabel('Valores reales')
plt.xlabel('Valores predecidos')
plt.title('Comparación con valores de entrenamiento')
plt.show()

In [None]:
# confusion_matrix(y_true, y_pred) puts the true classes on the rows and the
# predicted classes on the columns. The heatmap draws rows along the y-axis
# and columns along the x-axis, so y is the actual value and x the prediction.
cm = confusion_matrix(y_test, y_test_predict)
sns.heatmap(cm, linewidth = 0.5, annot = True, cmap = 'Reds', fmt = 'g')
plt.ylabel('Valores reales')
plt.xlabel('Valores predecidos')
plt.title('Comparación con valores de prueba')
plt.show()

### Selección de características

In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination: repeatedly refit the tree and drop one
# feature per iteration (step=1) until 8 features remain.
# n_features_to_select is passed by keyword because current scikit-learn
# releases no longer accept it positionally.
selector = RFE(clf, n_features_to_select=8, step=1)

# The label Series is passed as-is (same as clf.fit elsewhere in the notebook);
# Series.ravel() is deprecated in recent pandas and is not needed here.
selector = selector.fit(x_train, y_train)

# support_: boolean mask of the kept features; ranking_: 1 for kept features,
# larger numbers for features eliminated earlier.
print(selector.support_)
print(selector.ranking_)

In [None]:
# Print the name of every feature the selector kept. support_ holds one
# boolean per feature, so pair it with the names instead of indexing by
# position and comparing against True.
for feature_name, is_selected in zip(cancer_dataset.feature_names, selector.support_):
    if is_selected:
        print(feature_name)

# Volvamos a hacer el árbol pero con una copia reducida de los datos

In [None]:
# Keep only the feature columns the selector marked as retained (the label
# column is dropped first so the boolean mask lines up with the features).
df_reduced = df.drop(columns="Target").loc[:, selector.support_]
df_reduced.head()

In [None]:
# From here on the feature matrix is the reduced frame; the labels are unchanged.
x, y = df_reduced, df["Target"]

In [None]:
# Repeat the whole pipeline on the current feature matrix `x`: split, fit,
# predict, export the tree, and evaluate on both splits.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=100)

# Seeded so that ties between equally good splits are resolved the same way on
# every run (same seed as the train/test split).
clf = DecisionTreeClassifier(random_state=100)
clf.fit(x_train, y_train)

y_train_predict = clf.predict(x_train)
y_test_predict = clf.predict(x_test)

print(x.columns)
class_labels = ['malignant', 'benign']
feature_labels = list(x.columns)

# Graphviz DOT export with feature/class names, filled colours and rounded boxes.
export_graphviz(clf, out_file = "ReducedTreeWithInfo.ps", 
                feature_names = feature_labels, 
                class_names = class_labels,
                filled = True,
                rounded = True
               )

print("-----------------------------------------------------")
print(classification_report(y_train, y_train_predict))
print("-----------------------------------------------------")
print(classification_report(y_test, y_test_predict))
print("-----------------------------------------------------")

# One heatmap per split. confusion_matrix(y_true, y_pred) puts the true classes
# on the rows and the predicted classes on the columns, so the heatmap's y-axis
# is the actual value and its x-axis the prediction.
for plot_title, y_true, y_pred in (
    ('Comparación con valores de entrenamiento', y_train, y_train_predict),
    ('Comparación con valores de prueba', y_test, y_test_predict),
):
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, linewidth = 0.5, annot = True, cmap = 'Reds', fmt = 'g')
    plt.ylabel('Valores reales')
    plt.xlabel('Valores predecidos')
    plt.title(plot_title)
    plt.show()

### Tu árbol debe ser muy parecido al que tienes aquí en la libreta.

<center><img src="img/ClasfTree3.png" width = "100%"></center>

# Ahora analicemos un ejemplo donde existen múltiples clases.

# Obtención y preparación de datos

In [None]:
from sklearn.datasets import load_iris
# Load the iris dataset and print the description text that ships with it.
iris = load_iris()
dataset_description = iris.DESCR
print(dataset_description)

In [None]:
# Wrap the iris feature matrix in a DataFrame labelled with the dataset's
# feature names, then preview the first rows.
df2 = pd.DataFrame(
    data=iris.data,
    columns=iris.feature_names,
)
df2.head()

In [None]:
# Append the class label as an extra column. The frame uses the default
# positional index, so the label array lines up row for row.
df2['Target'] = iris.target
df2.head()

In [None]:
# Distinct encoded class values present in the label column.
df2.Target.unique()

In [None]:
# Feature matrix: every column except the label. Label vector: the label column.
x = df2.drop(columns="Target")
y = df2["Target"]

In [None]:
# Run the whole pipeline on the current feature matrix `x`: split, fit,
# predict, export the tree, and evaluate on both splits.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=100)

# Seeded so that ties between equally good splits are resolved the same way on
# every run (same seed as the train/test split).
clf = DecisionTreeClassifier(random_state=100)
clf.fit(x_train, y_train)

y_train_predict = clf.predict(x_train)
y_test_predict = clf.predict(x_test)

print(x.columns)
# NOTE(review): the order must follow the ascending encoded target values —
# confirm against iris.target_names.
class_labels = ['Setosa', 'Versicolour', 'Virginica']
feature_labels = list(x.columns)

# Graphviz DOT export with feature/class names, filled colours and rounded boxes.
export_graphviz(clf, out_file = "IrisTreeWithInfo.ps", 
                feature_names = feature_labels, 
                class_names = class_labels,
                filled = True,
                rounded = True
               )

print("-----------------------------------------------------")
print(classification_report(y_train, y_train_predict))
print("-----------------------------------------------------")
print(classification_report(y_test, y_test_predict))
print("-----------------------------------------------------")

# One heatmap per split. confusion_matrix(y_true, y_pred) puts the true classes
# on the rows and the predicted classes on the columns, so the heatmap's y-axis
# is the actual value and its x-axis the prediction.
for plot_title, y_true, y_pred in (
    ('Comparación con valores de entrenamiento', y_train, y_train_predict),
    ('Comparación con valores de prueba', y_test, y_test_predict),
):
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, linewidth = 0.5, annot = True, cmap = 'Reds', fmt = 'g')
    plt.ylabel('Valores reales')
    plt.xlabel('Valores predecidos')
    plt.title(plot_title)
    plt.show()

### Tu árbol debe ser muy parecido al que tienes aquí en la libreta.

<center><img src="img/ClasfTree4.png" width = "80%"></center>