## REGRESIÓN LINEAL MÚLTIPLE - IRIS

In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

### Carga de Datos, Escalado y muestra de etiquetas diferentes

In [2]:
# Load the Iris dataset object (features in .data, integer labels in .target).
iris = load_iris()

# Fit a min-max scaler on the complete feature matrix, then rescale that matrix.
scaler = MinMaxScaler()
scaler.fit(iris.data)
x = scaler.transform(iris.data)

# Show the distinct class labels present in the target vector.
np.unique(iris.target)

array([0, 1, 2])

In [3]:
# Print the textual description that ships with the loaded dataset object.
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [4]:
# Sanity check: per-feature minimum and maximum of the scaled matrix.
x.min(axis=0), x.max(axis=0)

(array([0., 0., 0., 0.]), array([1., 1., 1., 1.]))

### Binarización de las Etiquetas

In [5]:
# One-hot encode the integer class labels: one indicator column per class.
label_binarizer = LabelBinarizer()
d = label_binarizer.fit_transform(iris.target)

In [6]:
# Display the binarized label matrix (one row per sample, one column per class).
# NOTE(review): this dumps the whole array; a slice such as d[:5] would keep the output short.
d

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0,

### Método de HOLD-OUT (2/3 - 1/3)

In [7]:
# Hold-out split: roughly one third of the samples go to the test set,
# stratified on the integer class labels.  The raw measurements are split
# (not the pre-scaled matrix) so that scaling can be learned from the training
# portion alone; the partition itself is driven by random_state and stratify.
X_train_raw, X_test_raw, d_train, d_test = train_test_split(
    iris.data, d, test_size=0.33, random_state=20, stratify=iris.target
)

# Fit the min-max scaler on the training portion only and reuse it for the
# test portion, so test-set minima/maxima never influence preprocessing.
# Test features may therefore fall slightly outside [0, 1].
holdout_scaler = MinMaxScaler().fit(X_train_raw)
X_train = holdout_scaler.transform(X_train_raw)
X_test = holdout_scaler.transform(X_test_raw)

In [8]:
# Shapes of the four arrays produced by the hold-out split.
tuple(part.shape for part in (X_train, X_test, d_train, d_test))

((100, 4), (50, 4), (100, 3), (50, 3))

### REGRESIÓN LINEAL DE RESPUESTA MÚLTIPLE

In [12]:
# Shape of the one-hot test targets: (test samples, classes).
d_test.shape

(50, 3)

In [11]:
# Multi-response linear regression on the indicator matrix.  LinearRegression
# accepts a 2-D target, so all per-class least-squares fits are done in a
# single call instead of a manual loop over the columns of d_train.
regresion = LinearRegression().fit(X_train, d_train)

# One real-valued score per (test sample, class); same shape as d_test.
y_predict = regresion.predict(X_test)

# Predicted class = index of the column with the highest score.
y_predict_test = np.argmax(y_predict, axis=1)

In [17]:
# Display the one-hot test targets produced by the hold-out split.
d_test

array([[0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1]])

### RESULTADOS RLM

In [10]:
# Convert one-hot targets and regression scores to integer class labels once,
# then reuse them for both metrics.
true_labels_test = np.argmax(d_test, axis=1)
predicted_labels_test = np.argmax(y_predict, axis=1)

# Fraction of test samples whose predicted class matches the true class.
print("Tasa de aciertos =", accuracy_score(true_labels_test, predicted_labels_test))

# Rows are true classes, columns are predicted classes.
print("\nMatriz de Confusión:\n", confusion_matrix(true_labels_test, predicted_labels_test))

Tasa de aciertos = 0.86

Matriz de Confusión:
 [[16  0  0]
 [ 0 10  7]
 [ 0  0 17]]


In [15]:
# Recover the integer class labels from the one-hot test targets.
d_test.argmax(axis=1)

array([1, 0, 2, 1, 1, 1, 1, 1, 0, 2, 1, 0, 0, 0, 2, 2, 1, 2, 1, 0, 0, 0,
       1, 2, 0, 2, 2, 0, 1, 2, 2, 0, 1, 0, 1, 2, 2, 1, 0, 2, 0, 1, 1, 2,
       0, 1, 2, 2, 0, 2])