In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    root_mean_squared_error,
)
from sklearn.model_selection import train_test_split

In [None]:
# Load the California housing dataset. With as_frame=True the returned
# bunch carries a pandas DataFrame holding the features and the
# MedHouseVal target together.
data = fetch_california_housing(as_frame=True)
df = data["frame"]

# Show the column names, then preview the first five rows.
print(df.columns)
df.head(n=5)


Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'MedHouseVal'],
      dtype='object')


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [3]:
# Print the dataset description to check what each column represents.
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [4]:
# Separate the predictors from the target.
x = df.drop(columns='MedHouseVal')  # every column except the one to predict
y = df['MedHouseVal']               # column to predict

# Two-stage split into training (75%), validation (15%) and test (10%).
# Stage 1: hold out 10% of all rows as the test set.
x_temp, x_test, y_temp, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=42,
)

# Stage 2: carve the validation set out of the remaining 90%.
# 0.1667 * 0.9 = 0.15 of the full dataset, which leaves about 75% for training.
x_train, x_val, y_train, y_val = train_test_split(
    x_temp,
    y_temp,
    test_size=0.1667,
    random_state=42,
)




In [None]:
# Check the size of each split (number of rows / samples).
for etiqueta, conjunto in (
    ('entrenamiento', x_train),
    ('validación', x_val),
    ('prueba', x_test),
):
    print(f'Tamaño del conjunto de {etiqueta}: {len(conjunto)} muestras')

Tamaño del conjunto de entrenamiento: 15479 muestras
Tamaño del conjunto de validación: 3097 muestras
Tamaño del conjunto de prueba: 2064 muestras


In [6]:
# Re-attach each target Series to its feature frame (column-wise) so that
# every split is available as a single DataFrame.
train, val, test = (
    pd.concat([features, target], axis=1)
    for features, target in (
        (x_train, y_train),
        (x_val, y_val),
        (x_test, y_test),
    )
)

# Preview the first rows of each split, in train / validation / test order.
for split in (train, val, test):
    display(split.head())

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
10334,6.9544,4.0,9.28988,1.406518,1753.0,3.006861,33.85,-117.74,3.14
1557,7.9168,17.0,8.606299,1.097113,1222.0,3.207349,37.8,-121.97,3.56
12777,1.8611,42.0,4.751381,1.055249,1069.0,2.953039,38.64,-121.42,0.605
4263,1.3157,43.0,1.911826,1.151854,3049.0,2.13366,34.1,-118.33,3.333
17373,4.8155,24.0,6.631961,1.002421,1171.0,2.835351,34.96,-120.43,1.629


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
9721,5.2729,13.0,6.559055,1.023622,405.0,3.188976,36.9,-121.68,3.229
13239,6.3842,17.0,7.933661,0.995086,1289.0,3.167076,34.13,-117.66,3.071
2461,2.8125,29.0,6.078571,1.075,914.0,3.264286,36.46,-119.69,0.792
8593,4.4211,44.0,5.237154,0.968379,669.0,2.644269,33.88,-118.37,3.24
6457,2.7656,36.0,5.286408,1.177184,914.0,2.218447,34.11,-118.06,2.395


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
20046,1.6812,25.0,4.192201,1.022284,1392.0,3.877437,36.06,-119.01,0.477
3024,2.5313,30.0,5.039384,1.193493,1565.0,2.679795,35.14,-119.46,0.458
15663,3.4801,52.0,3.977155,1.185877,1310.0,1.360332,37.8,-122.44,5.00001
20484,5.7376,17.0,6.163636,1.020202,1705.0,3.444444,34.28,-118.72,2.186
9814,3.725,34.0,5.492991,1.028037,1063.0,2.483645,36.62,-121.93,2.78


# Entrenamiento, métricas y comparación de rendimiento: entrenamiento vs. validación

In [7]:
# Baseline model: ordinary linear regression on the original features,
# fitted on the training split and evaluated on the validation split.
# All metric functions come from the notebook's top import cell.

# Predictor columns used by the model (everything except the target).
features_orig = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']

# Rebuild the feature matrices and target vectors from the combined frames.
x_train = train[features_orig]
y_train = train['MedHouseVal']

x_val = val[features_orig]
y_val = val['MedHouseVal']

# Fit on the training split only; validation data is never seen during fitting.
modelo_lineal = LinearRegression()
modelo_lineal.fit(x_train, y_train)

# Validation metrics.
y_val_pred = modelo_lineal.predict(x_val)
mse = mean_squared_error(y_val, y_val_pred)
rmse = root_mean_squared_error(y_val, y_val_pred)
mae = mean_absolute_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)

print('Métricas en validación:')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'MAE: {mae:.2f}')
print(f'R²: {r2:.3f}')

# R² on the training split, used as the reference point for the comparison.
y_train_pred = modelo_lineal.predict(x_train)
r2train = r2_score(y_train, y_train_pred)

# Training vs. validation R²: a large gap would point to overfitting.
print("R² entrenamiento vs R² validacion: ")
print(f"R² entrenamiento: {r2train:.3f}")
print(f"R² validacion: {r2:.3f}")



Métricas en validación:
MSE: 0.53
RMSE: 0.73
MAE: 0.53
R²: 0.586
R² entrenamiento vs R² validacion: 
R² entrenamiento: 0.613
R² validacion: 0.586


# La diferencia de R² entre entrenamiento (0.613) y validación (0.586) es relativamente baja, lo que sugiere que no hay sobreajuste severo.
## El modelo de regresión lineal presenta un R² de 0.613 en entrenamiento y de 0.586 en validación; no es un mal valor, pero tampoco está dentro de lo deseado. Se puede concluir que el modelo posiblemente no alcanza un mejor rendimiento por la presencia de ruido, relaciones no lineales o outliers, aunque aun así su rendimiento no es mediocre.