# Obtención y preparación de los datos.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the real-estate dataset from the CSV file under data/ into a DataFrame.
df = pd.read_csv('data/Real_estate.csv')
# Preview the first rows (shown by the notebook as the cell's last expression).
df.head()

In [None]:
# Discard the 'No' and 'X1 transaction date' columns in a single call.
df = df.drop(columns=['No', 'X1 transaction date'])

# Replace the original "Xn ..." / "Y ..." headers with short Spanish names,
# using one mapping instead of one rename call per column.
df.rename(
    columns={
        'X2 house age': 'edad',
        'X3 distance to the nearest MRT station': 'distancia estacion',
        'X4 number of convenience stores': 'tiendas cercanas',
        'X5 latitude': 'latitud',
        'X6 longitude': 'longitud',
        'Y house price of unit area': 'precio',
    },
    inplace=True,
)

# Preview the cleaned frame.
df.head()

In [None]:
# Feature (distance to the station) and target (price) as 2-D column arrays
# of shape (n_rows, 1); selecting with a list of one column keeps the 2-D shape.
X = df[['distancia estacion']].to_numpy()
y = df[['precio']].to_numpy()

In [None]:
# Check the feature array's shape; the reshape(-1, 1) above makes it (n_rows, 1).
print(X.shape)

# Importando train_test_split
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split X and y into train/test parts with test_size=0.20; random_state is fixed
# so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 100)

In [None]:
# Compare the number of rows that ended up in the training and test parts.
print(X_train.shape)
print(X_test.shape)

In [None]:
# Scatter plot of the whole dataset: X on the horizontal axis, y on the vertical.
fig, ax = plt.subplots()
ax.scatter(X, y)
plt.show()

In [None]:
# Overlay the training and test points on the same axes; matplotlib gives
# each scatter call its own default colour.
fig, ax = plt.subplots()
ax.scatter(X_train, y_train)
ax.scatter(X_test, y_test)
plt.show()

### Ejemplos de subplots
https://matplotlib.org/devdocs/gallery/subplots_axes_and_figures/subplots_demo.html

In [None]:
# One row with two panels: training points on the left, test points on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

ax1.set_title('Train')
ax1.scatter(X_train, y_train, c='blue')

ax2.set_title('Test')
# The trailing semicolon keeps the notebook from echoing the returned artist.
ax2.scatter(X_test, y_test, c='orange');

# Visualicemos lo anterior usando una gráfica interactiva.

In [None]:
def split(size = 0.20, state = 100):
    """Draw one train/test partition of the notebook-level X and y.

    Args:
        size: Forwarded to train_test_split as ``test_size``.
        state: Forwarded to train_test_split as ``random_state``.

    Returns:
        None. A new figure is created with the training points in blue on
        the left panel and the test points in orange on the right panel.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=size, random_state=state
    )
    _, (left, right) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

    # (axes, x values, y values, colour, title) for each panel.
    panels = (
        (left, X_tr, y_tr, 'blue', 'Train'),
        (right, X_te, y_te, 'orange', 'Test'),
    )
    for axes, xs, ys, colour, title in panels:
        axes.scatter(xs, ys, c=colour)
        axes.set_title(title)

In [None]:
# Draw the partition with the defaults (size=0.20, state=100).
split()

In [None]:
# Same test size, different random_state: a different set of rows lands in each part.
split(state = 20)

In [None]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [None]:
# Let ipywidgets call split() interactively. The tuples are passed as the
# allowed values for each argument (presumably rendered as sliders over
# 0.1-0.5 and 1-100 -- confirm against the ipywidgets documentation).
# The trailing semicolon suppresses the echo of interact's return value.
interact(split, size = (0.1, 0.5), state = (1, 100));

# Probando nuestro modelo de regresión con el split de datos.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [None]:
# Re-split for the regression experiment: test_size=0.30 with a fixed
# random_state of 50. This overwrites the earlier X_train/X_test/y_train/y_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 50)

### Construyendo el modelo.

In [None]:
# Fit an ordinary least-squares line on the training part only.
model = LinearRegression()
model.fit(X_train, y_train)

# Evenly spaced x values from 0 to 7000 (50 points, numpy's default), shaped
# as a column, and the model's predictions on them -- used to draw the line.
x_plot = np.linspace(0, 7000, num=50).reshape(-1, 1)
y_plot = model.predict(x_plot)

### Probando el modelo con datos conocidos.

In [None]:
# Use the model to predict on the training set (the rows it was fitted on).
y_train_prediccion = model.predict(X_train)

fig, ax = plt.subplots()
# Fix the viewport before plotting so the train and test figures are comparable.
ax.set_xlim(0, 7000)
ax.set_ylim(0, 120)
ax.scatter(X_train, y_train, c='blue', s=40)               # observed values
ax.scatter(X_train, y_train_prediccion, c='orange', s=60)  # predicted values
ax.plot(x_plot, y_plot, "r--")                             # fitted line
plt.show()

# R^2 on the training data; kept as the last expression so the notebook shows it.
r2_score(y_train, y_train_prediccion)

### Probando el modelo con datos desconocidos.

In [None]:
# Use the model to predict on the TEST set, i.e. rows held out from fitting.
# (The original comment was copied from the training cell and said
# "training set", which contradicted the code below.)
y_test_prediccion = model.predict(X_test)

fig, ax = plt.subplots()
# Same fixed viewport as the training figure so both plots are comparable.
ax.set_xlim(0, 7000)
ax.set_ylim(0, 120)
ax.scatter(X_test, y_test, c='blue', s=40)               # observed values
ax.scatter(X_test, y_test_prediccion, c='orange', s=60)  # predicted values
ax.plot(x_plot, y_plot, "r--")                           # fitted line
plt.show()

# R^2 on the held-out data; kept as the last expression so the notebook shows it.
r2_score(y_test, y_test_prediccion)

# Validación cruzada.
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Four folds without shuffling: rows are assigned to folds in their original order.
kfold = KFold(n_splits=4, shuffle=False)

In [None]:
# Print the row indices that each fold uses for training and for testing.
# Each print emits its items on separate lines; the empty string yields the
# blank line that follows every section.
for train, test in kfold.split(X):
    print("Conjunto de Entrenamiento", train, "", sep="\n")
    print("Conjunto de Prueba", test, "", sep="\n")
    print("------------------------------", "", sep="\n")

In [None]:
# For every fold, plot its training rows and its test rows on one figure.
for train, test in kfold.split(X):

    # Index the full arrays with the fold's row indices.
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]

    # One figure per fold: training points first, test points second.
    fig, ax = plt.subplots()
    ax.scatter(X_train, y_train)
    ax.scatter(X_test, y_test)
    plt.show()

    print()

In [None]:
# Shuffled 4-fold cross-validation of the linear model, reporting R^2 per fold
# and the mean R^2 over all folds.
divisiones = 4

kfold = KFold(n_splits=divisiones, shuffle=True, random_state=100)

# x grid for drawing the fitted line. It does not depend on the fold, so it is
# built once here instead of on every iteration.
x_plot = np.linspace(0, 7000).reshape(-1, 1)

# NOTE: despite the "_avg" suffix these hold the running SUM of the per-fold
# scores; they are only turned into averages in the final print statements.
r2_train_avg = 0
r2_test_avg = 0

for train, test in kfold.split(X):

    # Rows for this fold.
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]

    # Fit on this fold's training rows only.
    model = LinearRegression().fit(X_train, y_train)
    y_plot = model.predict(x_plot)

    y_train_prediccion = model.predict(X_train)
    y_test_prediccion = model.predict(X_test)

    # Training/test points plus the fitted line for this fold.
    fig, ax = plt.subplots()
    ax.scatter(X_train, y_train, label='Entrenamiento')
    ax.scatter(X_test, y_test, label='Prueba')
    ax.legend()
    ax.plot(x_plot, y_plot, "r--")
    plt.show()

    r2_train = r2_score(y_train, y_train_prediccion)
    r2_test = r2_score(y_test, y_test_prediccion)

    r2_train_avg += r2_train
    r2_test_avg += r2_test

    print(r2_train)
    print(r2_test)
    print()
    print("-------------------------------------------------------------")
    print()

# Mean R^2 across folds: accumulated sum divided by the number of folds.
print(r2_train_avg / divisiones)
print(r2_test_avg / divisiones)