## **_PRÁCTICA 8 - TAA_**

_Pablo Martín de Benito_

***


In [1]:
import sklearn as sk 
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.preprocessing import OneHotEncoder
import scipy
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
optical_recognition_of_handwritten_digits = fetch_ucirepo(id=80) 
  
# data (as pandas dataframes) 
X = optical_recognition_of_handwritten_digits.data.features 
y = optical_recognition_of_handwritten_digits.data.targets 



In [29]:
class rlm(object):
    def __init__(self):
        self.modelos = []
        self.predicciones = []
        self.probabilidades = []
    
    def fit(self, X, y):
        self.nclases = np.unique(y).shape[0]        # No es un vector, es un número de clases
        onehot = OneHotEncoder(sparse_output=False)

        # Obtenemos la salida deseado de cada clasificador individual
        y_onehot = onehot.fit_transform(y.reshape(-1,1))     # El encoder exige que la entrada tenga dos dimensiones
        print("\nSalida Deseada:    \n",y_onehot)

        for i in range(self.nclases):              # Se crea un objeto regresión para cada clase y se predice para cada una si es de esa clase o no
            self.modelos.append(LinearRegression())
            self.modelos[-1].fit(X,y_onehot[:,i])
        
    def predict(self,test):
        predicciones_codificadas = []
        
        for modelo in self.modelos:
            pred = modelo.predict(test)
            self.predicciones.append(pred)
        
        self.predicciones = np.array(self.predicciones).T
        
        mat_pred = self.predicciones
        for i in mat_pred:
            pred_max = np.argmax(i)

            pred_bin = np.zeros_like(i)
            pred_bin[pred_max] = 1
            predicciones_codificadas.append(pred_bin)
        
        predicciones_codificadas = np.array(predicciones_codificadas)
        return(predicciones_codificadas)
        
    def predict_proba(self):
        for elem in self.predicciones:
            self.probabilidades.append(self.softmax(elem))
        
        return(np.array(self.probabilidades))

    def score(self,X,y):
        onehot = OneHotEncoder(sparse_output=False)
        y_deseada = onehot.fit_transform(y.reshape(-1,1)) 

        self.predicciones = []
        preds = self.predict(X)

        # No utilizamos fit porque el fit lo haces con el entrenamiento y el score con el test
        
        return(accuracy_score(y_deseada,preds))


    
    def softmax(self,z):
        z = z - np.max(z)  # Restar el máximo valor para evitar el desbordamiento numérico
        exp_z = np.exp(z)   # Calcular el exponente de cada elemento en z
        softmax_output = exp_z / np.sum(exp_z)  # Aplicar softmax
        
        return softmax_output



Prueba método fit()

In [30]:
o = rlm()
X_array = np.array(X)
y_array = np.array(y)
o.fit(X_array,y_array)




Salida Deseada:    
 [[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 1. 0.]]


In [5]:
o.modelos[0].predict(X_array)

array([ 0.76386117,  1.07540944,  0.05982067, ...,  0.04871539,
        0.26843784, -0.10737006])

Prueba método predict()

In [6]:
o.predict(X_array)


array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.]])

Prueba método predict_proba()

In [7]:
o.predict_proba()


array([[0.1874531 , 0.07047201, 0.08566112, ..., 0.07907059, 0.09577188,
        0.0962087 ],
       [0.24576241, 0.06985995, 0.08556162, ..., 0.07810133, 0.08269697,
        0.08684725],
       [0.09066307, 0.0831911 , 0.09572556, ..., 0.22682851, 0.09115263,
        0.07267071],
       ...,
       [0.09147612, 0.10740303, 0.08890767, ..., 0.07702298, 0.18388516,
        0.08770021],
       [0.11641502, 0.10648952, 0.09032758, ..., 0.08322797, 0.10007171,
        0.15075447],
       [0.0796674 , 0.08484972, 0.09268402, ..., 0.07996826, 0.14770502,
        0.11290596]])

In [8]:
o.score(X_array,y_array)

0.9382562277580071

Método de Hold-Out sobre el conjunto de datos


In [32]:
hold_out = rlm()

X_train = X_array[0:3747,:]
y_train = y_array[0:3747]

X_test = X_array[3747:5620,:]
y_test = y_array[3747:5620]

hold_out.fit(X_train,y_train)
hold_out.score(X_test,y_test)


Salida Deseada:    
 [[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


0.9263214095034704

Validación Cruzada dos particiones

In [72]:
skf = StratifiedKFold(n_splits=5)


scores_rlm = []
scores_clf = []
for i, (train_index,test_index) in enumerate(skf.split(X_array,y_array)):
    print("\nFold ",i,": ")
    
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    for elem_train, elem_test in zip(train_index,test_index):
        X_train.append(X_array[elem_train,:])
        y_train.append(y_array[elem_train])

        X_test.append(X_array[elem_test,:])
        y_test.append(y_array[elem_test])
    
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_test)
    y_test = np.array(y_test)

    reg = rlm()
    clf = LogisticRegression(max_iter=10000)        

    reg.fit(X_train,y_train)
    clf.fit(X_train,y_train.reshape(-1))

    print("\nFunción rlm: ")
    score_rlm = reg.score(X_test,y_test)
    scores_rlm.append(score_rlm)
    print("Score:   ",score_rlm)

    print("\nRegresión Logística sklearn:")
    score_clf = clf.score(X_test,y_test.reshape(-1))
    scores_clf.append(score_clf)
    print("Score:   ",score_clf)

print("\nTasa de acierto media rlm:     ",np.mean(scores_rlm))
print("\nTasa de acierto media clf:     ",np.mean(scores_clf))


Fold  0 : 

Salida Deseada:    
 [[0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]]

Función rlm: 
Score:    0.9261565836298933

Regresión Logística sklearn:
Score:    0.9519572953736655

Fold  1 : 

Salida Deseada:    
 [[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

Función rlm: 
Score:    0.8905693950177936

Regresión Logística sklearn:
Score:    0.9457295373665481

Fold  2 : 

Salida Deseada:    
 [[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]]

Función rlm: 
Score:    0.9279359430604982

Regresión Logística sklearn:
Score:    0.949288256227758

Fold  3 : 

Salida Deseada:    
 [[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [0. 0. 