# Construcción de un modelo de regresión lineal para un conjunto de puntos aleatorios

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid")

Vamos a construir un conjunto de puntos aleatorios que estén cercanos a la recta:
$$y = 2 x + 3$$

In [None]:
# Seeded generator so the synthetic sample is identical on every run.
rng = np.random.RandomState(100)
# 100 abscissas scaled from rand()'s unit interval up to [0, 10).
x = rng.rand(100) * 10
# Points on the line y = 2x + 3, perturbed with randn() noise.
y = 3 + 2 * x + rng.randn(100)

In [None]:
# Visual sanity check of the synthetic sample before fitting: the points
# should line up around y = 2x + 3.
plt.scatter(x, y, c='red')
# Label the axes so the figure can be read on its own.
plt.xlabel("x")
plt.ylabel("y")
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# fit() is given a 2-D feature matrix, so the 1-D sample is turned into a
# single column.
x_column = x.reshape(-1, 1)
model = LinearRegression(fit_intercept=True)
model.fit(x_column, y)

In [None]:
# Dense, evenly spaced grid over [0, 10] used to draw the fitted line.
xfit = np.linspace(0, 10, num=1000)
# predict() receives the grid as a single-column 2-D array, like fit() did.
yfit = model.predict(xfit.reshape(-1, 1))

In [None]:
# Overlay the fitted line on the (faded) sample to judge the fit visually.
plt.scatter(x, y, c='red', alpha=0.2, label="Datos")
plt.plot(xfit, yfit, lw=3, label="Recta ajustada")

# Axis labels and a legend make the figure self-explanatory.
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

In [None]:
# Report the fitted parameters; the data was generated around y = 2x + 3, so
# they can be compared against slope 2 and intercept 3.
# (Fixes the misspelling "módelo" -> "modelo" in the printed text.)
print("Pendiente del modelo (m): ", model.coef_[0])
print("Intersección del modelo (b):", model.intercept_)

# Obtención y preparación de datos

In [None]:
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# importing it raises ImportError. Use it when it is still available and
# otherwise rebuild the same arrays from the original source, following the
# recipe given in scikit-learn's own removal notice.
try:
    from sklearn.datasets import load_boston
    boston_dataset = load_boston()
except ImportError:
    from sklearn.utils import Bunch

    # The raw file stores every record on two physical lines after a
    # 22-line header: 11 values on the first line, 3 on the second.
    _raw = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston",
                       sep=r"\s+", skiprows=22, header=None)
    boston_dataset = Bunch(
        data=np.hstack([_raw.values[::2, :], _raw.values[1::2, :2]]),
        target=_raw.values[1::2, 2],
        feature_names=np.array([
            "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
            "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
        ]),
    )

df = pd.DataFrame(boston_dataset.data, columns = boston_dataset.feature_names)
# The target is already aligned row-by-row with the feature matrix, so it can
# be assigned directly (no need to index it with df.index).
df['MEDV'] = boston_dataset.target
df.head()

In [None]:
# Pairwise correlation between every column of the frame (features and MEDV).
correlation = df.corr()

# Seaborn theme is switched before the figure is created, as in the original order.
sns.set(style="ticks", color_codes=True)

# Large canvas so the annotated cells of the matrix stay legible.
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(correlation, annot=True, linewidths=.5, cmap="YlGnBu", ax=ax)

In [None]:
# Strength of the correlation with the target, ignoring its sign.
cor_target = correlation["MEDV"].abs()
# Keep only the columns whose absolute correlation with MEDV exceeds 0.5.
relevant_features = cor_target.loc[cor_target > 0.5]
relevant_features

In [None]:
# Signed correlation of every column with MEDV, in ascending order.
medv_correlations = df.corr()["MEDV"]
medv_correlations.sort_values()

In [None]:
# Single predictor (LSTAT) and target (MEDV), each as a one-column 2-D array.
x = df["LSTAT"].to_numpy().reshape(-1, 1)
y = df["MEDV"].to_numpy().reshape(-1, 1)

In [None]:
# Preview only the first rows instead of dumping the whole array.
x[:5]

In [None]:
# Preview only the first rows instead of dumping the whole array.
y[:5]

# Train Test Split

Documentación de `train_test_split`: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed makes the partition
# reproducible.
split = train_test_split(x, y, test_size=0.20, random_state=100)
x_train, x_test, y_train, y_test = split

<center><img src="img/TrainTestSplit.png" width = "80%"></center>

In [None]:
# Element counts of the full predictor array and of each partition.
for label, subset in (
    ("Tamaño del conjunto de datos: ", x),
    ("Tamaño del conjunto de entrenamiento: ", x_train),
    ("Tamaño del conjunto de prueba: ", x_test),
):
    print(label, subset.size)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on the training partition only.
reg = LinearRegression()
reg.fit(x_train, y_train)

# Predictions for both partitions, used later for plotting and scoring.
y_train_predict, y_test_predict = (
    reg.predict(partition) for partition in (x_train, x_test)
)

In [None]:
# Training points (faded) and test points (solid) with the fitted line on top.
plt.scatter(x_train, y_train, c = "blue", alpha = 0.2, label = "Entrenamiento")
plt.scatter(x_test, y_test, c = "orange", alpha = 0.8, label = "Prueba")

# Evaluate the model on an evenly spaced grid over [0, 40] to draw the line.
x_plot = np.linspace(0,40).reshape(-1, 1)
y_plot = reg.predict(x_plot)

plt.plot(x_plot, y_plot,"r--", lw = 4, label = "Recta ajustada")
# x holds the LSTAT column and y the MEDV column, so label the axes accordingly.
plt.xlabel("LSTAT")
plt.ylabel("MEDV")
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import r2_score

# R² on the training partition and on the held-out partition.
for label, actual, predicted in (
    ("Entrenamiento", y_train, y_train_predict),
    ("Prueba", y_test, y_test_predict),
):
    print(label, r2_score(actual, predicted))

## Ahora hagamos una función usando lo anterior.

In [None]:
def graficaRegresion(size, features=None, target=None, random_state=100,
                     x_range=(0, 40)):
    """Fit a one-feature linear regression on a train/test split and plot it.

    Prints the sample counts of each partition, draws the training and test
    points together with the fitted line, and prints the R² score on both
    partitions.

    Parameters
    ----------
    size : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    features : array-like of shape (n_samples, 1), optional
        Predictor values. Defaults to the notebook-level ``x`` so existing
        ``graficaRegresion(size)`` calls keep working; pass it explicitly to
        avoid depending on whatever ``x`` currently holds.
    target : array-like of shape (n_samples, 1), optional
        Target values. Defaults to the notebook-level ``y``.
    random_state : int, optional
        Seed for the split (default 100, the value previously hard-coded).
    x_range : tuple of (float, float), optional
        Interval over which the fitted line is drawn (default ``(0, 40)``,
        the range previously hard-coded).

    Returns
    -------
    None
        Kept so that calling the function as the last line of a cell does not
        add an output repr.
    """
    # Resolve the defaults at call time; the globals are rebound later in the
    # notebook, so explicit arguments are the safer choice.
    features = x if features is None else features
    target = y if target is None else target

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size = size, random_state = random_state)

    # len() counts samples (rows); for a single-column array this equals the
    # element count that was printed before.
    print("Tamaño del conjunto de datos: ", len(features))
    print("Tamaño del conjunto de entrenamiento: ", len(x_train))
    print("Tamaño del conjunto de prueba: ", len(x_test))

    reg = LinearRegression().fit(x_train, y_train)

    y_train_predict = reg.predict(x_train)
    y_test_predict = reg.predict(x_test)

    plt.scatter(x_train, y_train, c = 'blue', alpha = 0.2, label = "Entrenamiento")
    plt.scatter(x_test, y_test, c = 'orange', alpha = 0.8, label = "Prueba")

    # Evenly spaced grid over the requested range to draw the fitted line.
    x_plot = np.linspace(x_range[0], x_range[1]).reshape(-1, 1)
    y_plot = reg.predict(x_plot)

    plt.plot(x_plot, y_plot, "r--", lw = 4, label = "Recta ajustada")
    plt.legend()
    plt.show()

    print("Entrenamiento", r2_score(y_train, y_train_predict))
    print("Prueba", r2_score(y_test, y_test_predict))

In [None]:
# Repeat the experiment with 0.30 passed as the test_size of the split.
graficaRegresion(0.30)

In [None]:
# Repeat the experiment with 0.40 passed as the test_size of the split.
graficaRegresion(0.40)

In [None]:
# Repeat the experiment with 0.50 passed as the test_size of the split.
graficaRegresion(0.50)

# Regresión Múltiple

In [None]:
# Two predictors (RM, AGE) against the same target (MEDV).
# Note: this rebinds the notebook-level x and y used by earlier cells.
x = df.loc[:, ["RM", "AGE"]]
y = df["MEDV"].to_numpy().reshape(-1, 1)

# Same 80/20 split and seed as in the single-feature experiment.
split = train_test_split(x, y, test_size=0.20, random_state=100)
x_train, x_test, y_train, y_test = split

reg = LinearRegression()
reg.fit(x_train, y_train)

y_train_predict = reg.predict(x_train)
y_test_predict = reg.predict(x_test)

# R² on the training partition and on the held-out partition.
for label, actual, predicted in (
    ("Entrenamiento", y_train, y_train_predict),
    ("Prueba", y_test, y_test_predict),
):
    print(label, r2_score(actual, predicted))

In [None]:
# Select Matplotlib's `notebook` backend for the figures created from here on.
# NOTE(review): presumably this backend only works in the classic Notebook
# front end; confirm the target environment (JupyterLab typically needs
# `%matplotlib widget` with ipympl instead).
%matplotlib notebook

In [None]:
# Not referenced by name below. Kept because older Matplotlib releases need
# this import to register the '3d' projection used by add_subplot.
from mpl_toolkits.mplot3d import Axes3D

x_ = df["RM"]
y_ = df["AGE"]
z_ = df["MEDV"]

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x_, y_, z_, c='r', marker='.')
# Name each axis after the column it shows so the 3-D plot can be read.
ax.set_xlabel("RM")
ax.set_ylabel("AGE")
ax.set_zlabel("MEDV")
plt.show()

In [None]:
def regresionMultiple(features=None, target=None, test_size=0.20,
                      random_state=100):
    """Fit a linear regression on a train/test split and print both R² scores.

    Parameters
    ----------
    features : array-like or DataFrame of shape (n_samples, n_features), optional
        Predictor columns. Defaults to the notebook-level ``x`` so existing
        ``regresionMultiple()`` calls keep working; pass it explicitly to
        avoid depending on whatever ``x`` currently holds.
    target : array-like of shape (n_samples, 1), optional
        Target values. Defaults to the notebook-level ``y``.
    test_size : float or int, optional
        Forwarded to ``train_test_split`` (default 0.20, the value previously
        hard-coded).
    random_state : int, optional
        Seed for the split (default 100, the value previously hard-coded).

    Returns
    -------
    None
        Kept so that calling the function as the last line of a cell does not
        add an output repr.
    """
    # Resolve the defaults at call time so the zero-argument call still uses
    # the globals assigned just before it.
    features = x if features is None else features
    target = y if target is None else target

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state)

    reg = LinearRegression().fit(x_train, y_train)

    y_train_predict = reg.predict(x_train)
    y_test_predict = reg.predict(x_test)

    print("Entrenamiento", r2_score(y_train, y_train_predict))
    print("Prueba", r2_score(y_test, y_test_predict))

In [None]:
# Feature set: RM and LSTAT. regresionMultiple() reads x and y from the
# notebook namespace, so they are rebound before the call.
x = df.loc[:, ["RM", "LSTAT"]]
y = df["MEDV"].to_numpy().reshape(-1, 1)

regresionMultiple()

In [None]:
# Feature set: RM, LSTAT and AGE. regresionMultiple() reads x and y from the
# notebook namespace, so they are rebound before the call.
x = df.loc[:, ["RM", "LSTAT", "AGE"]]
y = df["MEDV"].to_numpy().reshape(-1, 1)

regresionMultiple()

In [None]:
# Feature set: RM, LSTAT, AGE and PTRATIO. regresionMultiple() reads x and y
# from the notebook namespace, so they are rebound before the call.
x = df.loc[:, ["RM", "LSTAT", "AGE", "PTRATIO"]]
y = df["MEDV"].to_numpy().reshape(-1, 1)

regresionMultiple()