# Obtención de datos

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the breast cancer dataset bundled with scikit-learn and print the
# shape of its feature matrix.
from sklearn.datasets import load_breast_cancer
cancer_dataset = load_breast_cancer()
print(cancer_dataset.data.shape)

In [None]:
# Print the description text (DESCR) that ships with the dataset object.
dataset_description = cancer_dataset.DESCR
print(dataset_description)

In [None]:
# List the feature names provided by the dataset.
print(cancer_dataset.feature_names)

In [None]:
# Raw feature values as stored in the dataset object.
# NOTE: `x` is reassigned further down from the DataFrame built out of this data.
x = cancer_dataset.data
print(x)

In [None]:
# Names of the target classes provided by the dataset.
print(cancer_dataset.target_names)

In [None]:
# Raw target values as stored in the dataset object.
# NOTE: `y` is reassigned further down from the DataFrame's Target column.
y = cancer_dataset.target
print(y)

In [None]:
# Assemble a single DataFrame: one column per feature plus a 'Target' column
# holding the class label of each row.
df = pd.DataFrame(
    data=cancer_dataset.data,
    columns=cancer_dataset.feature_names,
)
# The frame has a default 0..n-1 index, so the target array lines up row by row.
df["Target"] = cancer_dataset.target
df.head()

In [None]:
# Separate the feature columns (x) from the label column (y).
x = df.drop(columns=["Target"])
y = df["Target"]

In [None]:
# Preview the first rows of the feature matrix.
x.head()

In [None]:
# Preview the first values of the target column.
y.head()

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=100,
)

# Creando nuestra primera máquina de soporte vectorial para el problema de clasificación

In [None]:
# Create a support vector classifier with a linear kernel.
from sklearn import svm
clf = svm.SVC(kernel = 'linear')

In [None]:
# Fit the classifier on the training split.
clf.fit(x_train, y_train)

In [None]:
# Predict on both splits so training and test performance can be compared.
y_train_predict, y_test_predict = clf.predict(x_train), clf.predict(x_test)

In [None]:
from sklearn import metrics

In [None]:
# Fraction of test samples classified correctly.
print(
    "Accuracy:",
    metrics.accuracy_score(y_true=y_test, y_pred=y_test_predict),
)

In [None]:
# Precision on the test split.
# NOTE(review): with default arguments this presumably scores the class
# labelled 1 — confirm that is the class of interest.
print(
    "Precision:",
    metrics.precision_score(y_true=y_test, y_pred=y_test_predict),
)

In [None]:
# Recall on the test split.
# NOTE(review): with default arguments this presumably scores the class
# labelled 1 — confirm that is the class of interest.
print(
    "Recall:",
    metrics.recall_score(y_true=y_test, y_pred=y_test_predict),
)

In [None]:
# Per-class report, first for the training split and then for the test split.
from sklearn.metrics import classification_report

print(classification_report(y_true=y_train, y_pred=y_train_predict))
print(classification_report(y_true=y_test, y_pred=y_test_predict))

In [None]:
# Confusion matrix for the training split, displayed as the cell output.
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_train, y_pred=y_train_predict)

In [None]:
# Heatmap of the training-split confusion matrix.
# scikit-learn's confusion_matrix puts the true labels on the rows and the
# predicted labels on the columns; seaborn draws rows along the y-axis, so the
# y-axis carries the real values and the x-axis the predicted ones.
cm = confusion_matrix(y_train, y_train_predict)
sns.heatmap(cm, linewidth = 0.5, annot = True, cmap = 'Reds', fmt = 'g')
plt.ylabel('Valores reales')
plt.xlabel('Valores predecidos')
plt.title('Comparación con valores de entrenamiento')
plt.show()

In [None]:
# Heatmap of the test-split confusion matrix.
# scikit-learn's confusion_matrix puts the true labels on the rows and the
# predicted labels on the columns; seaborn draws rows along the y-axis, so the
# y-axis carries the real values and the x-axis the predicted ones.
cm = confusion_matrix(y_test, y_test_predict)
sns.heatmap(cm, linewidth = 0.5, annot = True, cmap = 'Reds', fmt = 'g')
plt.ylabel('Valores reales')
plt.xlabel('Valores predecidos')
plt.title('Comparación con valores de prueba')
plt.show()

# Selección de características importantes que impactan en el modelo de clasificación

https://scikit-learn.org/stable/modules/feature_selection.html

In [None]:
# Recursive feature elimination: repeatedly fit the linear SVM and drop one
# feature per iteration (step=1) until 8 features remain.
from sklearn.feature_selection import RFE

# n_features_to_select must be passed by keyword: current scikit-learn
# releases accept only the estimator positionally.
selector = RFE(clf, n_features_to_select=8, step=1)
# The pandas Series is accepted directly; Series.ravel() is deprecated.
selector = selector.fit(x_train, y_train)
print(selector.support_)   # boolean mask over the features, True = kept
print(selector.ranking_)   # rank per feature, 1 = selected

In [None]:
# Print the name of every feature kept by the RFE selector.
# feature_names and support_ are aligned position by position, so they can be
# walked together instead of indexing by position and comparing with True.
for feature_name, is_selected in zip(cancer_dataset.feature_names, selector.support_):
    if is_selected:
        print(feature_name)

# Hagamos una copia reducida de nuestros datos

In [None]:
# Keep only the feature columns flagged by the RFE selector's boolean mask
# (the label column is removed first so the mask lines up with the features).
df_reduced = df.drop(columns=["Target"]).loc[:, selector.support_]
df_reduced.head()

In [None]:
# Features now come from the reduced frame; labels are unchanged.
x, y = df_reduced, df["Target"]

In [None]:
# Sanity check: dimensions and first rows of the current feature matrix.
print(x.shape)
x.head()

In [None]:
# Sanity check: length and first values of the target column.
print(y.shape)
y.head()

## Ahora volvamos a hacer el modelo de clasificación.

In [None]:
# Retrain the linear SVM using only the features kept by the RFE selector and
# evaluate it with the same 70/30 split settings (random_state=100).
from sklearn import svm
from sklearn.model_selection import train_test_split
# confusion_matrix is imported here as well so this cell does not depend on
# an earlier cell having been executed.
from sklearn.metrics import classification_report, confusion_matrix

x = df_reduced
y = df.Target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=100)

clf = svm.SVC(kernel = 'linear')
clf.fit(x_train, y_train)

y_train_predict = clf.predict(x_train)
y_test_predict = clf.predict(x_test)

print("-----------------------------------------------------")
print(classification_report(y_train, y_train_predict))
print("-----------------------------------------------------")
print(classification_report(y_test, y_test_predict))
print("-----------------------------------------------------")

# One heatmap per split. confusion_matrix puts true labels on the rows, which
# seaborn draws along the y-axis, so the y-axis is the real value and the
# x-axis the predicted one.
for plot_title, y_true, y_pred in (
    ('Comparación con valores de entrenamiento', y_train, y_train_predict),
    ('Comparación con valores de prueba', y_test, y_test_predict),
):
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, linewidth = 0.5, annot = True, cmap = 'Reds', fmt = 'g')
    plt.ylabel('Valores reales')
    plt.xlabel('Valores predecidos')
    plt.title(plot_title)
    plt.show()

# Probando Otro Kernel (Cuadrático)

In [None]:
# Train and evaluate an SVM with a degree-2 polynomial kernel on the full
# feature set.
# Drop the label column from the inputs: leaving 'Target' among the features
# would let the model see the answer it is asked to predict.
x = df.drop("Target", axis = 1)
y = df.Target

# Same 70/30 split settings as the other experiments (a single call suffices).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=100)

clf = svm.SVC(kernel = 'poly', degree = 2)
clf.fit(x_train, y_train)

y_train_predict = clf.predict(x_train)
y_test_predict = clf.predict(x_test)

print("-----------------------------------------------------")
print(classification_report(y_train, y_train_predict))
print("-----------------------------------------------------")
print(classification_report(y_test, y_test_predict))
print("-----------------------------------------------------")

# One heatmap per split. confusion_matrix puts true labels on the rows, which
# seaborn draws along the y-axis, so the y-axis is the real value and the
# x-axis the predicted one.
for plot_title, y_true, y_pred in (
    ('Comparación con valores de entrenamiento', y_train, y_train_predict),
    ('Comparación con valores de prueba', y_test, y_test_predict),
):
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, linewidth = 0.5, annot = True, cmap = 'Reds', fmt = 'g')
    plt.ylabel('Valores reales')
    plt.xlabel('Valores predecidos')
    plt.title(plot_title)
    plt.show()

# Otras Modificaciones posibles:
- Cambiar el Kernel de la SVM.
- Modificar los parámetros del Kernel seleccionado.
- Modificar el tamaño de los conjuntos de entrenamiento y prueba.
- Seleccionar otro conjunto de variables.