In [66]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline


In [67]:
# Load the housing dataset from the local data folder.
housing = pd.read_csv(filepath_or_buffer="./data/housing.csv")
# Summarize column dtypes and non-null counts to spot missing values early.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [68]:
# Preview the first five rows of the raw data.
housing.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [69]:
# Summary statistics of the numeric columns, transposed so each feature is one row.
housing.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
longitude,20640.0,-119.569704,2.003532,-124.35,-121.8,-118.49,-118.01,-114.31
latitude,20640.0,35.631861,2.135952,32.54,33.93,34.26,37.71,41.95
housing_median_age,20640.0,28.639486,12.585558,1.0,18.0,29.0,37.0,52.0
total_rooms,20640.0,2635.763081,2181.615252,2.0,1447.75,2127.0,3148.0,39320.0
total_bedrooms,20433.0,537.870553,421.38507,1.0,296.0,435.0,647.0,6445.0
population,20640.0,1425.476744,1132.462122,3.0,787.0,1166.0,1725.0,35682.0
households,20640.0,499.53968,382.329753,1.0,280.0,409.0,605.0,6082.0
median_income,20640.0,3.870671,1.899822,0.4999,2.5634,3.5348,4.74325,15.0001
median_house_value,20640.0,206855.816909,115395.615874,14999.0,119600.0,179700.0,264725.0,500001.0


In [70]:
# Count the missing values in each column.
housing.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [71]:
# Bucket median_income into five labelled bins; the split below is stratified on these bins.
income_strata = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5],
)

# Hold out 20% of the rows as the test set; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    housing.drop(columns="median_house_value"),  # features
    housing["median_house_value"],  # target
    test_size=0.2,
    random_state=42,
    stratify=income_strata,
)


# Boolean mask (one entry per training row) flagging rows with at least one missing value.
null_rows_idx = X_train.isnull().any(axis=1)
X_train.loc[null_rows_idx].head()  # preview the first rows that contain missing values

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
1606,-122.08,37.88,26.0,2947.0,,825.0,626.0,2.933,NEAR BAY
10915,-117.87,33.73,45.0,2264.0,,1970.0,499.0,3.4193,<1H OCEAN
19150,-122.7,38.35,14.0,2313.0,,954.0,397.0,3.7813,<1H OCEAN
4186,-118.23,34.13,48.0,1308.0,,835.0,294.0,4.2891,<1H OCEAN
16885,-122.4,37.58,26.0,3281.0,,1145.0,480.0,6.358,NEAR OCEAN


In [72]:
# Number of neighbours for the KNN imputer: square root of the training-set size,
# truncated to an integer.
k_value = np.sqrt(len(X_train)).astype(int)
k_value

128

In [73]:
# Standardize the numeric columns only; the categorical column is encoded in a separate cell.
X_train_num = X_train.select_dtypes(include="number")
scaler = StandardScaler()
scaler.set_output(transform="pandas")  # return a DataFrame instead of a NumPy array
X_train_num_scaled = scaler.fit_transform(X_train_num)
X_train_num_scaled

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
12655,-0.941350,1.347438,0.027564,0.584777,0.635123,0.732602,0.556286,-0.893647
15502,1.171782,-1.192440,-1.722018,1.261467,0.775677,0.533612,0.721318,1.292168
2908,0.267581,-0.125972,1.220460,-0.469773,-0.545045,-0.674675,-0.524407,-0.525434
14053,1.221738,-1.351474,-0.370069,-0.348652,-0.038567,-0.467617,-0.037297,-0.865929
20496,0.437431,-0.635818,-0.131489,0.427179,0.269198,0.374060,0.220898,0.325752
...,...,...,...,...,...,...,...,...
15174,1.251711,-1.220505,-1.165333,1.890456,1.686854,0.543471,1.341519,0.637374
12661,-0.921368,1.342761,-1.085806,2.468471,2.149712,3.002174,2.451492,-0.557509
19263,-1.570794,1.310018,1.538566,-0.895802,-0.894007,-0.862013,-0.865118,-0.365475
19140,-1.560803,1.249211,-1.165333,0.249005,0.109257,-0.189747,0.010616,0.168261


In [74]:
# One-hot encode ocean_proximity. Dense output combined with pandas output yields a
# DataFrame with one indicator column per category.
cat_encoder = OneHotEncoder(sparse_output=False)
cat_encoder.set_output(transform="pandas")
housing_cat_ohe = cat_encoder.fit_transform(X_train.loc[:, ["ocean_proximity"]])
housing_cat_ohe

Unnamed: 0,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
12655,0.0,1.0,0.0,0.0,0.0
15502,0.0,0.0,0.0,0.0,1.0
2908,0.0,1.0,0.0,0.0,0.0
14053,0.0,0.0,0.0,0.0,1.0
20496,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
15174,1.0,0.0,0.0,0.0,0.0
12661,0.0,1.0,0.0,0.0,0.0
19263,1.0,0.0,0.0,0.0,0.0
19140,1.0,0.0,0.0,0.0,0.0


In [75]:
# Put the scaled numeric features and the one-hot columns side by side (aligned on the row index).
X_train_tr1 = pd.concat(objs=[X_train_num_scaled, housing_cat_ohe], axis="columns")
X_train_tr1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
12655,-0.941350,1.347438,0.027564,0.584777,0.635123,0.732602,0.556286,-0.893647,0.0,1.0,0.0,0.0,0.0
15502,1.171782,-1.192440,-1.722018,1.261467,0.775677,0.533612,0.721318,1.292168,0.0,0.0,0.0,0.0,1.0
2908,0.267581,-0.125972,1.220460,-0.469773,-0.545045,-0.674675,-0.524407,-0.525434,0.0,1.0,0.0,0.0,0.0
14053,1.221738,-1.351474,-0.370069,-0.348652,-0.038567,-0.467617,-0.037297,-0.865929,0.0,0.0,0.0,0.0,1.0
20496,0.437431,-0.635818,-0.131489,0.427179,0.269198,0.374060,0.220898,0.325752,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15174,1.251711,-1.220505,-1.165333,1.890456,1.686854,0.543471,1.341519,0.637374,1.0,0.0,0.0,0.0,0.0
12661,-0.921368,1.342761,-1.085806,2.468471,2.149712,3.002174,2.451492,-0.557509,0.0,1.0,0.0,0.0,0.0
19263,-1.570794,1.310018,1.538566,-0.895802,-0.894007,-0.862013,-0.865118,-0.365475,1.0,0.0,0.0,0.0,0.0
19140,-1.560803,1.249211,-1.165333,0.249005,0.109257,-0.189747,0.010616,0.168261,1.0,0.0,0.0,0.0,0.0


In [76]:
# Impute the missing values with a KNN imputer using k_value neighbours,
# keeping the result as a DataFrame.
X_train_imputed_a = (
    KNNImputer(n_neighbors=k_value)
    .set_output(transform="pandas")
    .fit_transform(X_train_tr1)
)

print(X_train_imputed_a.isna().values.any())  # sanity check: False means no missing values remain
X_train_imputed_a

False


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
12655,-0.941350,1.347438,0.027564,0.584777,0.635123,0.732602,0.556286,-0.893647,0.0,1.0,0.0,0.0,0.0
15502,1.171782,-1.192440,-1.722018,1.261467,0.775677,0.533612,0.721318,1.292168,0.0,0.0,0.0,0.0,1.0
2908,0.267581,-0.125972,1.220460,-0.469773,-0.545045,-0.674675,-0.524407,-0.525434,0.0,1.0,0.0,0.0,0.0
14053,1.221738,-1.351474,-0.370069,-0.348652,-0.038567,-0.467617,-0.037297,-0.865929,0.0,0.0,0.0,0.0,1.0
20496,0.437431,-0.635818,-0.131489,0.427179,0.269198,0.374060,0.220898,0.325752,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15174,1.251711,-1.220505,-1.165333,1.890456,1.686854,0.543471,1.341519,0.637374,1.0,0.0,0.0,0.0,0.0
12661,-0.921368,1.342761,-1.085806,2.468471,2.149712,3.002174,2.451492,-0.557509,0.0,1.0,0.0,0.0,0.0
19263,-1.570794,1.310018,1.538566,-0.895802,-0.894007,-0.862013,-0.865118,-0.365475,1.0,0.0,0.0,0.0,0.0
19140,-1.560803,1.249211,-1.165333,0.249005,0.109257,-0.189747,0.010616,0.168261,1.0,0.0,0.0,0.0,0.0


In [77]:
# Show the rows that originally contained missing values, now with imputed entries.
X_train_imputed_a.loc[null_rows_idx, :].head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
1606,-1.251077,1.048079,-0.211016,0.151734,0.080575,-0.533051,0.343342,-0.494985,0.0,0.0,0.0,1.0,0.0
10915,0.852065,-0.89308,1.299986,-0.167671,-0.046215,0.493276,0.005292,-0.239693,1.0,0.0,0.0,0.0,0.0
19150,-1.560803,1.267921,-1.165333,-0.144756,-0.238985,-0.417421,-0.266212,-0.049654,1.0,0.0,0.0,0.0,0.0
4186,0.672224,-0.705981,1.538566,-0.614744,-0.577098,-0.524088,-0.540378,0.216926,1.0,0.0,0.0,0.0,0.0
16885,-1.410935,0.907754,-0.211016,0.307929,-0.168272,-0.246217,-0.045282,1.303035,0.0,0.0,0.0,0.0,1.0


## Creando un pipeline

In [78]:
# Same preprocessing as the step-by-step cells, expressed as a single pipeline:
#   1. standardize the numeric columns and one-hot encode the object (categorical) columns,
#   2. impute the remaining missing values with a KNN imputer.
#
# sparse_output=False keeps the encoder output dense, matching the step-by-step encoder.
# With the default (sparse) encoder the ColumnTransformer only returns a dense array when
# the stacked output's density exceeds its sparse_threshold; KNNImputer does not accept
# sparse input, so relying on that heuristic would break with more or wider categoricals.
transformer1 = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=object)),
)
# Chain the column-wise transformations with the imputation step.
pipeline = make_pipeline(transformer1, KNNImputer(n_neighbors=k_value))

# Fit the whole pipeline on the training data and transform it in one call.
X_train_imputed_b = pipeline.fit_transform(X_train)

## Comprobación de la solución

Podemos también convertir de nuevo a DataFrame y comparar los resultados procesados por los dos métodos.

In [79]:
# Wrap the pipeline result in a DataFrame again, labelling the columns with the names
# generated by the column transformer and restoring the training-set row index.
X_train_imputed_b = pd.DataFrame(
    X_train_imputed_b,
    columns=transformer1.get_feature_names_out(),
    index=X_train.index,
)

# Sanity check: no missing values should remain after imputation.
print("¿Hay valores nulos?", X_train_imputed_b.isna().any().any())

# Check that the step-by-step result and the pipeline result agree. The values are floats
# produced by two separate computation paths, so compare with a tolerance (np.allclose)
# rather than exact equality.
print(
    "¿Son los resultados iguales paso a paso y con el pipeline?",
    np.allclose(X_train_imputed_a.values, X_train_imputed_b.values),
)

# Transposed preview: one row per feature, one column per sample.
X_train_imputed_b.head().T

¿Hay valores nulos? False
¿Son los resultados iguales paso a paso y con el pipeline? True


Unnamed: 0,12655,15502,2908,14053,20496
standardscaler__longitude,-0.94135,1.171782,0.267581,1.221738,0.437431
standardscaler__latitude,1.347438,-1.19244,-0.125972,-1.351474,-0.635818
standardscaler__housing_median_age,0.027564,-1.722018,1.22046,-0.370069,-0.131489
standardscaler__total_rooms,0.584777,1.261467,-0.469773,-0.348652,0.427179
standardscaler__total_bedrooms,0.635123,0.775677,-0.545045,-0.038567,0.269198
standardscaler__population,0.732602,0.533612,-0.674675,-0.467617,0.37406
standardscaler__households,0.556286,0.721318,-0.524407,-0.037297,0.220898
standardscaler__median_income,-0.893647,1.292168,-0.525434,-0.865929,0.325752
onehotencoder__ocean_proximity_<1H OCEAN,0.0,0.0,0.0,0.0,1.0
onehotencoder__ocean_proximity_INLAND,1.0,0.0,1.0,0.0,0.0
