# Obtención y preparación de datos

In [None]:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

boston = load_boston()

df = pd.DataFrame(boston.data, columns = boston.feature_names)
df['MEDV'] = boston.target[df.index]
df.head()

# Volvamos a revisar el modelo obtenido usando la regresión lineal

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

# Single predictor (LSTAT) and response (MEDV), both reshaped to column vectors.
x = df["LSTAT"].values.reshape(-1, 1)
y = df["MEDV"].values.reshape(-1, 1)

# 80/20 split with a fixed seed so the partition is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=100
)

print("Tamaño del conjunto de datos: ", x.size)
print("Tamaño del conjunto de entrenamiento: ", x_train.size)
print("Tamaño del conjunto de prueba: ", x_test.size)

# Fit the straight-line baseline on the training split only.
reg = LinearRegression()
reg.fit(x_train, y_train)
y_train_predict = reg.predict(x_train)
y_test_predict = reg.predict(x_test)

# Evenly spaced LSTAT values over [0, 40] used to draw the fitted line.
x_plot = np.linspace(0, 40).reshape(-1, 1)
y_plot = reg.predict(x_plot)

plt.scatter(x_train, y_train, alpha=0.2)
plt.scatter(x_test, y_test, alpha=0.8)
plt.plot(x_plot, y_plot, "r--", lw=4)
plt.xlabel("LSTAT")
plt.ylabel("MEDV")
plt.show()

# R^2 on each split.
print("Entrenamiento", r2_score(y_train, y_train_predict))
print("Prueba", r2_score(y_test, y_test_predict))

# Primer acercamiento a la Regresión Polinomial

## Regresión Cuadrática

In [None]:
# Rebind x (LSTAT) and y (MEDV) to the full dataset, each as a single-column 2-D array.
x = df["LSTAT"].values.reshape(-1, 1)
y = df["MEDV"].values.reshape(-1, 1)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Expand x into degree-2 polynomial features, then fit an ordinary linear
# regression on the expanded matrix using the whole dataset.
poly = PolynomialFeatures(degree=2)
x_poly = poly.fit_transform(x)

reg = LinearRegression()
reg.fit(x_poly, y)

# In-sample predictions, scored further below.
y_predict = reg.predict(x_poly)

In [None]:
# Evenly spaced LSTAT values over [0, 40] as a column vector, and the fitted
# curve evaluated on them (x_model is already 2-D, so it goes in as is).
x_model = np.linspace(0, 40).reshape(-1, 1)
y_model = reg.predict(poly.transform(x_model))

In [None]:
# Observations as points, fitted quadratic as a dashed red line.
plt.scatter(x, y, alpha=0.6)
plt.plot(x_model, y_model, "--r", linewidth=4)

# Fixed viewport: LSTAT in [0, 40], MEDV in [0, 50].
plt.xlim(0, 40)
plt.ylim(0, 50)
plt.xlabel("LSTAT")
plt.ylabel("MEDV")
plt.show()

In [None]:
# R^2 of the quadratic fit, computed on the same data it was trained on.
print("R2_score:", r2_score(y, y_predict))
print()

$ F(x) = a x^2 + b x + c $

In [None]:
# Display the fitted coefficients; the following cells index them as reg.coef_[0][i].
reg.coef_

In [None]:
# a: coefficient used for the x^2 term of F(x) = a x^2 + b x + c.
a = reg.coef_[0][2]
a

In [None]:
# b: coefficient used for the linear term of F(x) = a x^2 + b x + c.
b = reg.coef_[0][1]
b

In [None]:
# Display the fitted intercept; the next cell takes its first element.
reg.intercept_

In [None]:
# c: constant term of F(x) = a x^2 + b x + c, taken from the intercept.
c = reg.intercept_[0]
c

In [None]:
# Evaluate the quadratic by hand at LSTAT = 30 using the extracted a, b and c.
x_propuesta = 30
y_obtenida = a * x_propuesta * x_propuesta + b * x_propuesta + c
print(y_obtenida)

$ F(x) = 0.043546889358 x^2 -2.33282109828x + 42.862007328 $

In [None]:
# Same prediction as the hand-written formula, now through the fitted objects.
# The value is wrapped as [[value]] so it is a 2-D array (one sample, one feature).
# NOTE(review): `poly` was already fitted above, so poly.transform(...) would be
# enough here; this line also rebinds x_poly, replacing the training features.
x_poly = poly.fit_transform([[x_propuesta]])
y_obtenida = reg.predict(x_poly)
print(y_obtenida)

<b>Actualización 2022:</b>
<br>
<br>Se modifica el parámetro que recibe la función <b>poly.fit_transform(x_propuesta)</b>: ahora se pasa el mismo valor, pero dentro de dobles corchetes, puesto que la función necesita recibir un arreglo de dos dimensiones.
<br>
<br>Quedando de la siguiente manera: <b>poly.fit_transform([[x_propuesta]])</b>

## Regresión Cúbica

In [None]:
# Rebind x (LSTAT) and y (MEDV) to the full dataset, each as a single-column 2-D array.
x = df["LSTAT"].values.reshape(-1, 1)
y = df["MEDV"].values.reshape(-1, 1)

In [None]:
# Expand x into degree-3 polynomial features, then fit an ordinary linear
# regression on the expanded matrix using the whole dataset.
poly = PolynomialFeatures(degree=3)
x_poly = poly.fit_transform(x)

reg = LinearRegression()
reg.fit(x_poly, y)

# In-sample predictions, scored further below.
y_predict = reg.predict(x_poly)

In [None]:
# Evenly spaced LSTAT values over [0, 40] as a column vector, and the fitted
# curve evaluated on them (x_model is already 2-D, so it goes in as is).
x_model = np.linspace(0, 40).reshape(-1, 1)
y_model = reg.predict(poly.transform(x_model))

In [None]:
# Observations as points, fitted cubic as a dashed red line.
plt.scatter(x, y, alpha=0.6)
plt.plot(x_model, y_model, "--r", linewidth=4)

# Fixed viewport: LSTAT in [0, 40], MEDV in [0, 50].
plt.xlim(0, 40)
plt.ylim(0, 50)
plt.xlabel("LSTAT")
plt.ylabel("MEDV")
plt.show()

In [None]:
# R^2 of the cubic fit, computed on the same data it was trained on.
print("R2_score:", r2_score(y, y_predict))
print()

$ F(x) = ax^3+bx^2+cx+d $

In [None]:
# Display the fitted coefficients; the following cells index them as reg.coef_[0][i].
reg.coef_

In [None]:
# a: coefficient used for the x^3 term of F(x) = a x^3 + b x^2 + c x + d.
a = reg.coef_[0][3]
a

In [None]:
# b: coefficient used for the x^2 term of F(x) = a x^3 + b x^2 + c x + d.
b = reg.coef_[0][2]
b

In [None]:
# c: coefficient used for the linear term of F(x) = a x^3 + b x^2 + c x + d.
c = reg.coef_[0][1]
c

In [None]:
# d: constant term of F(x) = a x^3 + b x^2 + c x + d, taken from the intercept.
d = reg.intercept_[0]
d

In [None]:
# Evaluate the cubic by hand at LSTAT = 30 using the extracted a, b, c and d.
x_ = 30
y_obtenida = a * x_ * x_ * x_ + b * x_ * x_  + c * x_  + d
print(y_obtenida)

$ F(x) = -0.0020038676661x^3+0.148738476x^2-3.86559277881x+48.6496253415 $

In [None]:
# Same prediction as the hand-written formula, now through the fitted objects.
# The value is wrapped as [[value]] so it is a 2-D array (one sample, one feature).
# NOTE(review): `poly` was already fitted above, so poly.transform(...) would be
# enough here; this line also rebinds x_poly, replacing the training features.
x_poly = poly.fit_transform([[x_]])
y_obtenida = reg.predict(x_poly)
print(y_obtenida)

<b>Actualización 2022:</b>
<br>
<br>Se modifica el parámetro que recibe la función <b>poly.fit_transform(x_)</b>: ahora se pasa el mismo valor, pero dentro de dobles corchetes, puesto que la función necesita recibir un arreglo de dos dimensiones.
<br>
<br>Quedando de la siguiente manera: <b>poly.fit_transform([[x_]])</b>

## Un buen hábito como programador es reutilizar código y crear funciones.

In [None]:
def plotModel(degree = 1):
    """Fit a polynomial regression of the given degree on the notebook data and plot it.

    Reads the notebook-level arrays ``x`` and ``y``, fits on all of them, draws
    the data with the fitted curve over LSTAT in [0, 40], and prints the
    in-sample R^2.

    Parameters
    ----------
    degree : int, default 1
        Degree handed to ``PolynomialFeatures``.

    Notes
    -----
    Rebinds the notebook globals ``poly``, ``reg``, ``y_predict`` and ``x_model``.
    """
    global y_predict, poly, reg, x_model, x, y

    # Polynomial expansion followed by an ordinary least-squares fit.
    poly = PolynomialFeatures(degree=degree)
    design = poly.fit_transform(x)
    reg = LinearRegression()
    reg.fit(design, y)
    y_predict = reg.predict(design)

    # Grid of LSTAT values used to draw the fitted curve.
    x_model = np.linspace(0, 40).reshape(-1, 1)
    curve = reg.predict(poly.transform(x_model))

    plt.scatter(x, y, alpha=0.6)
    plt.plot(x_model, curve, "--r", lw=4)

    plt.axis([0, 40, 0, 50])
    plt.xlabel("LSTAT")
    plt.ylabel("MEDV")
    plt.show()

    print("R2_score:", r2_score(y, y_predict))
    print()

In [None]:
# plotModel reads the notebook-level x and y, so bind them before calling it.
x = df["LSTAT"].values.reshape(-1, 1)
y = df["MEDV"].values.reshape(-1, 1)

In [None]:
# Compare increasingly flexible fits on the full dataset.
for degree in (5, 10, 15, 20):
    plotModel(degree)

# Hagamos nuestro modelo de Regresión Polinomial usando ipywidgets

In [None]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

# plotModel reads the notebook-level x and y, so bind them before wiring the widget.
x = df["LSTAT"].values.reshape(-1, 1)
y = df["MEDV"].values.reshape(-1, 1)
    
# Interactive control for `degree` over the range 1..20; the trailing semicolon
# suppresses the cell's return-value output.
interact(plotModel, degree = (1, 20));

# Validando nuestros modelos usando: Train-Test Split

In [None]:
def plotTrainTestModel(degree = 1, test_size = 0.20):
    """Fit a polynomial regression on a train split and report train/test R^2.

    The notebook-level arrays ``x`` and ``y`` are split with a fixed seed. The
    polynomial expansion and the regression are fitted on the training part
    only, and the test part is transformed with the already-fitted expansion.
    Both splits are plotted with the fitted curve, then the R^2 on each split
    is printed.

    Parameters
    ----------
    degree : int, default 1
        Degree handed to ``PolynomialFeatures``.
    test_size : float, default 0.20
        Forwarded to ``train_test_split`` as ``test_size``.

    Notes
    -----
    Rebinds the notebook globals ``poly``, ``reg`` and ``x_model``.
    """
    global y_predict, poly, reg, x_model, x, y

    # random_state is fixed so every call sees the same partition.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=100)

    poly = PolynomialFeatures(degree=degree)
    x_train_poly = poly.fit_transform(x_train)
    # Only transform held-out data: the transformer must be fitted on the
    # training split alone, never re-fitted on the test split.
    x_test_poly = poly.transform(x_test)
    reg = LinearRegression().fit(x_train_poly, y_train)

    y_train_predict = reg.predict(x_train_poly)
    y_test_predict = reg.predict(x_test_poly)

    plt.scatter(x_train, y_train, alpha=0.2)
    plt.scatter(x_test, y_test, alpha=0.8)

    # Grid of LSTAT values used to draw the fitted curve (already a column vector).
    x_model = np.linspace(0, 40).reshape(-1, 1)
    y_model = reg.predict(poly.transform(x_model))
    plt.plot(x_model, y_model, "--r", lw=4)

    plt.axis([0, 40, 0, 50])
    plt.xlabel("LSTAT")
    plt.ylabel("MEDV")

    plt.show()

    print("Porcentaje de Entrenamiento: ", 1 - test_size)
    print("Porcentaje de Prueba: ", test_size)
    print("Entrenamiento", r2_score(y_train, y_train_predict))
    print("Prueba", r2_score(y_test, y_test_predict))
    print()

In [None]:
# plotTrainTestModel reads the notebook-level x and y, so bind them before calling it.
x = df["LSTAT"].values.reshape(-1, 1)
y = df["MEDV"].values.reshape(-1, 1)

In [None]:
# Train/test comparison for several degrees with the default 20% test split.
for degree in (5, 10, 15, 20):
    plotTrainTestModel(degree)

In [None]:
# Same degrees again, this time holding out 40% of the data for testing.
for degree in (5, 10, 15, 20):
    plotTrainTestModel(degree, 0.40)

In [None]:
# Interactive controls: degree over 1..20 and test_size over 0.20..0.90.
interact(plotTrainTestModel, degree = (1, 20), test_size = (0.20, 0.90));

# Validando nuestros modelos usando: Cross Validation

In [None]:
from sklearn.model_selection import KFold

def plotCrossValidationModel(degree = 1, showGraph = True, k_ = 4):
    """Evaluate a polynomial regression of MEDV on LSTAT with shuffled K-fold CV.

    For every fold the polynomial expansion and the regression are fitted on
    the training rows and scored (R^2) on both the training and held-out rows.
    The per-fold scores are averaged and printed at the end.

    Parameters
    ----------
    degree : int, default 1
        Degree handed to ``PolynomialFeatures``.
    showGraph : bool, default True
        When true, plot each fold with its fitted curve and print its scores.
    k_ : int, default 4
        Number of folds.

    Notes
    -----
    Reads the notebook-level ``df`` and rebinds the notebook globals ``poly``,
    ``reg`` and ``x_model``.
    """
    global y_predict, poly, reg, x_model, x, y
    data = df[["LSTAT", "MEDV"]]

    k = k_
    # shuffle and random_state must be passed by keyword: current scikit-learn
    # declares them keyword-only and rejects positional use with a TypeError.
    kfold = KFold(n_splits=k, shuffle=True, random_state=103)
    r2_train_avg, r2_test_avg = 0, 0

    # The plotting grid does not depend on the fold, so build it once.
    x_model = np.linspace(0, 40).reshape(-1, 1)

    for train, test in kfold.split(data):

        x_train = data.iloc[train]["LSTAT"].values.reshape(-1, 1)
        y_train = data.iloc[train]["MEDV"]
        x_test = data.iloc[test]["LSTAT"].values.reshape(-1, 1)
        y_test = data.iloc[test]["MEDV"]

        poly = PolynomialFeatures(degree=degree)
        x_train_poly = poly.fit_transform(x_train)
        # Only transform held-out rows: the transformer is fitted on the
        # training fold alone, never re-fitted on the test fold.
        x_test_poly = poly.transform(x_test)
        reg = LinearRegression().fit(x_train_poly, y_train)

        y_train_predict = reg.predict(x_train_poly)
        y_test_predict = reg.predict(x_test_poly)

        r2_train = r2_score(y_train, y_train_predict)
        r2_test = r2_score(y_test, y_test_predict)

        r2_train_avg += r2_train
        r2_test_avg += r2_test

        if showGraph:
            # The fitted curve is only needed for the figure.
            y_model = reg.predict(poly.transform(x_model))

            plt.scatter(x_train, y_train, alpha=0.2)
            plt.scatter(x_test, y_test, alpha=0.8)
            plt.plot(x_model, y_model, "--r", lw=4)

            plt.axis([0, 40, 0, 50])
            plt.xlabel("LSTAT")
            plt.ylabel("MEDV")

            plt.show()

            print("Entrenamiento = ", r2_train)
            print("Prueba = ", r2_test)

    print()
    print("Entrenamiento promedio =", r2_train_avg/k)
    print("Prueba promedio =", r2_test_avg/k)
    print()

In [None]:
# Degree-5 model with the default 4 folds, plotting every fold.
plotCrossValidationModel(5)

In [None]:
# Averages only (no per-fold figures) for several degrees.
for degree in (5, 10, 15, 20):
    plotCrossValidationModel(degree, False)

In [None]:
# Interactive controls: degree over 1..20 and number of folds k_ over 4..8.
interact(plotCrossValidationModel, degree = (1, 20), k_ = (4,8));