# Práctica 11

# Redes neuronales de Funciones de Base Radial (RBF) y Máquinas de Vectores Soporte (SVM)

## Santiago Blasco Arnaiz

In [1]:
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold
from scipy.stats import mode
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score
import warnings

In [2]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
#Descargamos el dataset
data = load_breast_cancer()
print(data.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

In [4]:
#Estandarizamos los datos
X = StandardScaler().fit_transform(data.data)
Y = data.target

In [5]:
#Número de carpetas
K = 10
skf = StratifiedKFold(n_splits = K, random_state = 0)

In [6]:
max_clusters = 30
beginRBF = 2
#Array que contiene la media de la tasa de aciertos con cada XV
hitsRBF = np.zeros(max_clusters - 1, dtype = 'float')

#Para cada número de clusters realizamos XV
for k in range (beginRBF, max_clusters + 1):
    kmeans = KMeans(n_clusters = k, random_state = 0)
    
    for train_index, test_index in skf.split(X, Y):
        #Separamos en train y test
        X_train = X[train_index]
        Y_train = Y[train_index]
        
        X_test = X[test_index]
        Y_test = Y[test_index]
        
        #Entrenamos
        kmeans.fit(X_train)
        
        modeArray = np.ones(k, dtype = 'int')
        
        #Hayamos la moda (clase) de cada cluster
        for i in range(k):
            modeArray[i] = mode(Y_train[np.argwhere(kmeans.labels_ == i)])[0]
            
        #Clase para cada instancia asignada en función de la moda de cada cluster
        Y_predict = modeArray[kmeans.predict(X_test)]
        hitsRBF[k - beginRBF] += accuracy_score(Y_test, Y_predict)
        
    hitsRBF[k - beginRBF] /= K

Mediante el algoritmo aglomerativo kmeans obtenemos el número de clusters que mejor tasa de acierto obtiene para poder utilizar este en la RBF.

In [7]:
dfArray = np.arange(beginRBF, max_clusters + 1)
dfArray = np.column_stack((dfArray, hitsRBF))

df = pd.DataFrame(data = dfArray, columns = ['Nº clusters', 'Tasa de acierto (XV-10P)'])
df = df.astype({'Nº clusters': int, 'Tasa de acierto (XV-10P)': float})
df = df.style.hide_index()
df

Nº clusters,Tasa de acierto (XV-10P)
2,0.912155
3,0.873465
4,0.884054
5,0.92265
6,0.924436
7,0.929699
8,0.920927
9,0.924405
10,0.910401
11,0.927976


In [8]:
sizeHidden = np.argmax(hitsRBF) + beginRBF
print('El mejor resultado (%.4f)' % hitsRBF[np.argmax(hitsRBF)],'se obtiene con %i' % sizeHidden,'clusters, es decir, tendremos %i neuronas en la capa oculta.' % sizeHidden)

El mejor resultado (0.9526) se obtiene con 21 clusters, es decir, tendremos 21 neuronas en la capa oculta.


In [9]:
from sklearn.svm import SVC
from sklearn.metrics.pairwise import euclidean_distances

In [10]:
#Número de carpetas
K = 10
skf = StratifiedKFold(n_splits = K , random_state = 0)

In [11]:
max_degree = 5
beginSVM = 1
#Array que contiene la media de la tasa de aciertos con cada XV
hitsSVM = np.zeros(max_degree, dtype = 'float')

#Para cada grado de polinomio realizamos XV
for degree in range (beginSVM, max_degree + 1):

    for train_index, test_index in skf.split(X, Y):
        #Separamos en train y test
        X_train = X[train_index]
        Y_train = Y[train_index]
        
        X_test = X[test_index]
        Y_test = Y[test_index]
        
        #Cálculo de los parámetros de la RBF
        kmeans = KMeans(n_clusters = sizeHidden).fit(X_train)
        
        centers = kmeans.cluster_centers_

        distances = euclidean_distances(centers)

        sigma = np.sum(distances, axis = 0) / (centers.shape[0]-1)

        
        X_SVM_train = np.zeros((X_train.shape[0], centers.shape[0]), dtype = 'float')
        
        #Cálculamos los valores de la RBF para los datos de entrenamiento
        for i in range(X_train.shape[0]):
            for j in range(centers.shape[0]):
                X_SVM_train[i][j] = np.exp(-np.sum((X_train[i] - centers[j])**2)/(2.0 * (sigma[j]**2)))
        
        
        X_SVM_test = np.zeros((X_test.shape[0], centers.shape[0]), dtype = 'float')
        
        #Cálculamos los valores de la RBF para los datos de test
        for i in range(X_test.shape[0]):
            for j in range(centers.shape[0]):
                X_SVM_test[i][j] = np.exp(-np.sum((X_test[i] - centers[j])**2)/(2.0 * (sigma[j]**2)))
        
        
        #Creamos el clasificador
        svc = SVC(kernel = 'poly',degree = degree, random_state = 0)
        #Entrenamos el clasificador
        svc.fit(X_SVM_train,Y_train)        

        hitsSVM[degree - beginSVM] += svc.score(X_SVM_test,Y_test)
        
    hitsSVM[degree - beginSVM] /= K

Entrenamos y testeamos el clasificador que hemos creado. Los conjuntos de train y test son generados con la RBF y realizamos XV con cada valor que puede tomar el grado del polinomio para ver cual da mejores resultados.

In [12]:
dfArray = np.arange(beginSVM, max_degree + 1)
dfArray = np.column_stack((dfArray, hitsSVM))

df = pd.DataFrame(data = dfArray, columns = ['Grado', 'Tasa de acierto (XV-10P)'])
df = df.astype({'Grado': int, 'Tasa de acierto (XV-10P)': float})
df = df.style.hide_index()
df

Grado,Tasa de acierto (XV-10P)
1,0.961341
2,0.963095
3,0.968358
4,0.975407
5,0.957801


In [13]:
bestDegree = np.argmax(hitsSVM) + beginSVM
print('El mejor resultado (%.4f)' % hitsSVM[np.argmax(hitsSVM)], 'se obtiene con grado %i para el polinomio del núcleo.' % bestDegree)

El mejor resultado (0.9754) se obtiene con grado 4 para el polinomio del núcleo.
