## Modelo de predicción de cáncer de mama (Wisconsin, UCI)

In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from random import random 



ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Load the data directly from the official UCI repository. The file is read
# with header=None, i.e. its first line is treated as data, not column names.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    header=None,
)

In [None]:
# Peek at the first two rows to check the layout of the dataframe:
#   column 0    -> ID
#   column 1    -> class label
#   columns 2.. -> data (features)
df.head(2)

In [None]:
# Split the dataframe into labels (column 1) and features (columns 2 onwards).
# The frame was read with header=None, so column labels equal their positions.
y = df[1].values
X = df.iloc[:, 2:].values

In [None]:
# Split the full dataset into two subsets: 80% for training, 20% for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
)

In [None]:
# Entropy-based decision tree with no depth limit and a fixed random seed.
weak_clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=None,
    random_state=0,
)

In [None]:
# Build a two-step workflow (pipeline):
#   step 'scl' -> scale the features with StandardScaler
#   step 'clf' -> fit an XGBClassifier with its default settings
# NOTE(review): y_train holds the raw labels taken from column 1 of the file;
# recent XGBoost releases may require integer-encoded classes -- confirm
# against the installed version.
pipe_lr = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('clf', XGBClassifier()),
])

# Fit the pipeline on the training split.
pipe_lr.fit(X_train, y_train)

# Report the pipeline's score on the held-out test split.
print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))

In [None]:
# How the accuracy changes as the label noise increases:

In [None]:
def accuracy_noise_level(N, X_train, y_train, X_test, y_test):
    """Score a default XGBoost pipeline trained on randomly flipped labels.

    Every training label is swapped independently with probability N/100:
    'B' becomes 'M' and any other value becomes 'B'. A StandardScaler +
    XGBClassifier pipeline is fitted on the noisy labels and scored on the
    untouched test data.

    Args:
        N: Flip probability, expressed as a percentage.
        X_train: Training features.
        y_train: Training labels; the caller's object is not modified.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        The result of ``Pipeline.score(X_test, y_test)``.
    """
    # One random() draw per label, in order; the draw decides whether to flip.
    noisy_labels = [
        ('M' if label == 'B' else 'B') if random() < N/100 else label
        for label in y_train
    ]

    model = Pipeline([
        ('scl', StandardScaler()),
        ('clf', XGBClassifier()),
    ])

    # Train on the noisy labels, evaluate on the clean test split.
    model.fit(X_train, noisy_labels)
    return model.score(X_test, y_test)

In [None]:
# Baseline run: 0% label noise.
accuracy_noise_level(
    N=0,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Test score of the default model for each noise percentage from 0 to 39.
accuracies = [
    accuracy_noise_level(noise_pct, X_train, y_train, X_test, y_test)
    for noise_pct in range(40)
]

In [None]:
# Display the list of scores, one entry per noise percentage.
accuracies

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot the default model's score against the noise percentage (0-39).
fig, ax = plt.subplots()

ax.plot(range(40), accuracies);

In [None]:
def accuracy_noise_level_robust(N, X_train, y_train, X_test, y_test):
    """Score an XGBoost pipeline with gamma=5 trained on randomly flipped labels.

    Every training label is swapped independently with probability N/100:
    'B' becomes 'M' and any other value becomes 'B'. A StandardScaler +
    XGBClassifier(gamma=5) pipeline is fitted on the noisy labels and scored
    on the untouched test data.

    Args:
        N: Flip probability, expressed as a percentage.
        X_train: Training features.
        y_train: Training labels; the caller's object is not modified.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        The result of ``Pipeline.score(X_test, y_test)``.
    """
    # One random() draw per label, in order; the draw decides whether to flip.
    noisy_labels = [
        ('M' if label == 'B' else 'B') if random() < N/100 else label
        for label in y_train
    ]

    model = Pipeline([
        ('scl', StandardScaler()),
        ('clf', XGBClassifier(gamma=5)),
    ])

    # Train on the noisy labels, evaluate on the clean test split.
    model.fit(X_train, noisy_labels)
    return model.score(X_test, y_test)

In [None]:
# Test score of the gamma=5 model for each noise percentage from 0 to 39.
accuracies_robust = [
    accuracy_noise_level_robust(noise_pct, X_train, y_train, X_test, y_test)
    for noise_pct in range(40)
]

In [None]:
# Overlay the default model and the gamma=5 model on the same axes,
# both against the noise percentage (0-39).
fig, ax = plt.subplots()

ax.plot(range(40), accuracies);
ax.plot(range(40), accuracies_robust);

In [None]:
def custom_loss_robust(y_true, y_pred):
    """Return gradient and Hessian of the loss 0.5 * arctan(residual**2).

    This was previously a first ``custom_loss`` definition that the second
    one silently overwrote; it is kept under its own name so both remain
    callable.

    Derivatives are taken with respect to the prediction, so the residual is
    ``y_pred - y_true``. The gradient shrinks back towards zero for large
    residuals, and the Hessian turns negative once ``residual**4 > 1/3``.

    Args:
        y_true: Array-like of target values.
        y_pred: Array-like of predictions, broadcastable against ``y_true``.

    Returns:
        Tuple ``(grad, hess)`` of arrays.
    """
    residual = np.asarray(y_pred) - np.asarray(y_true)
    denom = residual**4 + 1
    grad = residual / denom
    hess = (1 - 3 * residual**4) / denom**2
    return grad, hess


def custom_loss(y_true, y_pred):
    """Return gradient and Hessian of the squared error (y_pred - y_true)**2.

    Derivatives are taken with respect to the prediction, so the gradient is
    ``2 * (y_pred - y_true)`` and the Hessian is the constant 2.

    Args:
        y_true: Array-like of target values.
        y_pred: Array-like of predictions, broadcastable against ``y_true``.

    Returns:
        Tuple ``(grad, hess)`` of arrays.
    """
    residual = np.asarray(y_pred) - np.asarray(y_true)
    grad = 2 * residual
    hess = np.full_like(residual, 2)
    return grad, hess
    

In [None]:
def accuracy_noise_level_custom_robust(N, X_train, y_train, X_test, y_test):
    """Score an XGBoost pipeline that uses ``custom_loss`` on flipped labels.

    Every training label is swapped independently with probability N/100:
    'B' becomes 'M' and any other value becomes 'B'. A StandardScaler +
    XGBClassifier pipeline is fitted on the noisy labels and scored on the
    untouched test data. The classifier's objective is the module-level
    ``custom_loss`` callable instead of the multiclass 'multi:softmax'
    string, which does not fit this two-class task.

    Args:
        N: Flip probability, expressed as a percentage.
        X_train: Training features.
        y_train: Training labels; the caller's object is not modified.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        The result of ``Pipeline.score(X_test, y_test)``.
    """
    # One random() draw per label, in order; the draw decides whether to flip.
    noisy_labels = [
        ('M' if label == 'B' else 'B') if random() < N/100 else label
        for label in y_train
    ]

    # XGBClassifier accepts a callable objective(y_true, y_pred) -> (grad, hess).
    # NOTE(review): assumes custom_loss is the objective this experiment was
    # meant to evaluate -- confirm.
    model = Pipeline([
        ('scl', StandardScaler()),
        ('clf', XGBClassifier(objective=custom_loss)),
    ])

    # Train on the noisy labels, evaluate on the clean test split.
    model.fit(X_train, noisy_labels)
    return model.score(X_test, y_test)

In [None]:
# Test score of the custom-objective model for each noise percentage from 0 to 39.
accuracies_custom_robust = [
    accuracy_noise_level_custom_robust(noise_pct, X_train, y_train, X_test, y_test)
    for noise_pct in range(40)
]

In [None]:
# Evaluate custom_loss on a small hand-written example to inspect its output.
custom_loss(
    y_true=np.array([0, 1, 1, 2]),
    y_pred=np.array([0, 2, 2, 2]),
)

In [None]:
# Overlay all three score curves (default, gamma=5, custom objective)
# against the noise percentage (0-39).
fig, ax = plt.subplots()

ax.plot(range(40), accuracies);
ax.plot(range(40), accuracies_robust)
ax.plot(range(40), accuracies_custom_robust);