In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
data = pd.read_csv("housing.csv")

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
data["income_cat"] = pd.cut(data.median_income, bins = np.array([0, 1.5, 3, 4.5, 6, np.inf]),
                           labels = np.arange(1,6))
#creando los indicadores según el ingreso medio

In [6]:
from sklearn.model_selection import StratifiedShuffleSplit

In [7]:
#creando los conjuntos de entrenamiento y prueba
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
for train_index, test_index in split.split(data, data.income_cat):
    train_set = data.iloc[train_index]
    test_set = data.iloc[test_index]

In [8]:
#borrando el indicador creado en los conjuntos ya definidos
for set_ in (train_set, test_set): 
    set_.drop("income_cat", axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [9]:
#importando las clases que se van a neceitar para la preparación de los datos
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [10]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 17606 to 15775
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16354 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   median_house_value  16512 non-null  float64
 9   ocean_proximity     16512 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.4+ MB


rooms_per_household 
population_per_household
bedrooms_per_room

In [11]:
#creando la clase para el transformador personalizado

class Custom(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y = None):
        return self
    def transform(self, X):
        rooms_per_household = X[:, 3] / X[:, 6]
        population_per_household = X[:, 5] / X[:, 6]
        bedrooms_per_room = X[:, 4] / X[:, 3]
        return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]

In [12]:
#creando el pipeline para los atributos numericos
numpipeline = Pipeline([
    ("imputer", SimpleImputer(strategy = "median")), 
    ("new_attrs", Custom()),
    ("scale", StandardScaler())
])

In [13]:
#quitando los valores de la variable respuesta 
housing = train_set.drop(columns = ["median_house_value"])
housing_labels = train_set.median_house_value.copy()

In [14]:
#creando el transformador en simultaneo 
from sklearn.compose import ColumnTransformer

#extrayendo atributos de cada clase
num = list(housing)[:-1]
cat = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num", numpipeline, num), 
    ("cat", OneHotEncoder(), cat)
])

In [15]:
housing_prepared = full_pipeline.fit_transform(housing)

## Ejercicio 1

In [16]:
#creando los objetos de la clase SVR para ajustarlos con diferentes hiperparametros
from sklearn.svm import SVR
sv1 = SVR(kernel = "linear", C = 0.5)
sv2 = SVR(kernel = "rbf", gamma = "auto")
sv3 = SVR(kernel = "rbf", C = 1.25)
sv4 = SVR(kernel = "linear", gamma = "auto")

In [17]:
#ajustando los predictores con el metodo fit de la clase SVR
sv1.fit(housing_prepared, housing_labels)
sv2.fit(housing_prepared, housing_labels)
sv3.fit(housing_prepared, housing_labels)
sv4.fit(housing_prepared, housing_labels)

SVR(gamma='auto', kernel='linear')

In [18]:
from sklearn.metrics import mean_squared_error as mse

In [19]:
#errores sobre el mismo conjunto de datos
errors = [mse(sv1.predict(housing_prepared), housing_labels),
mse(sv2.predict(housing_prepared), housing_labels),
mse(sv3.predict(housing_prepared), housing_labels),
mse(sv4.predict(housing_prepared), housing_labels)]

In [20]:
counter = 1
for error in errors:
    print(f"the Mean Root Error of the SVR number {counter} is {np.sqrt(error)}")
    counter += 1

the Mean Root Error of the SVR number 1 is 114783.24481875032
the Mean Root Error of the SVR number 2 is 118577.43356412371
the Mean Root Error of the SVR number 3 is 118490.9969603456
the Mean Root Error of the SVR number 4 is 111094.6308539982


In [21]:
#usando k-fold cross validation
from sklearn.model_selection import cross_val_score
def display_scores(scores):
    scores = np.sqrt(-scores)
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [22]:
svrs = [
SVR(kernel = "linear", C = 0.5),
SVR(kernel = "rbf", gamma = "auto"),
SVR(kernel = "rbf", C = 1.25),
SVR(kernel = "linear", gamma = "auto")
]

In [23]:
for machine in svrs:
    score = cross_val_score(machine, housing_prepared,  housing_labels, 
                           cv = 10, scoring = "neg_mean_squared_error")
    display_scores(score)

Scores: [108367.74151582 115959.28059807 113501.53452598 116899.04847892
 114105.7235789  118996.07243651 114127.99044335 117944.67740229
 117020.37482429 114779.59582688]
Mean: 115170.20396310164
Standard deviation: 2842.525377445233
Scores: [111393.33263237 119546.71049753 116961.00489445 120449.0155974
 117622.20149716 122303.76986818 117640.09907103 121459.63518806
 120348.51364519 118025.61954959]
Mean: 118574.99024409598
Standard deviation: 2934.1329433145675
Scores: [111318.59887512 119448.73860597 116879.83568467 120368.36415403
 117530.0985779  122214.45389338 117555.52653298 121396.78405107
 120268.87957107 117933.66786749]
Mean: 118491.4947813692
Standard deviation: 2933.5737961815034
Scores: [105342.09141998 112489.24624123 110092.35042753 113403.22892482
 110638.90119657 115675.8320024  110703.56887243 114476.89008206
 113756.17971227 111520.1120808 ]
Mean: 111809.84009600841
Standard deviation: 2762.393664321567


# Ejercicio 2

In [24]:
from sklearn.ensemble import RandomForestRegressor

In [25]:
from sklearn.model_selection import RandomizedSearchCV

In [26]:
grid = [
{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

rf = RandomForestRegressor()

grid_search = RandomizedSearchCV(rf, grid, cv = 5, scoring = "neg_mean_squared_error",
                                return_train_score = True)
grid_search.fit(housing_prepared, housing_labels)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(),
                   param_distributions=[{'max_features': [2, 4, 6, 8],
                                         'n_estimators': [3, 10, 30]},
                                        {'bootstrap': [False],
                                         'max_features': [2, 3, 4],
                                         'n_estimators': [3, 10]}],
                   return_train_score=True, scoring='neg_mean_squared_error')

In [27]:
grid_search.best_estimator_

RandomForestRegressor(max_features=6, n_estimators=30)

# Ejercicio 3

In [79]:
from sklearn.model_selection import GridSearchCV

In [102]:
rf3 = RandomForestRegressor()

numerical_pipe = Pipeline([
    ("Impute", SimpleImputer(strategy = "median")),
    ("Add", Custom()),
    ("Standard", StandardScaler())
])

numerical_variables = list(housing)[:-1]
categorical_variables = [list(housing)[-1]]
added_varibles = ["rooms_per_household", "population_per_household", "bedrooms_per_room"]

all_pipe = ColumnTransformer([
    ("Numerical", numerical_pipe, numerical_variables),
    ("Categorical", OneHotEncoder(), categorical_variables)
])

class Most(BaseEstimator, TransformerMixin):
    def __init__(self, numerical_attributes = None ,factor_categories = None, y = None, model = None):
        self.numerical_attributes = numerical_attributes
        self.factor_categories = factor_categories
        self.y = y
        self.model = model
    def fit(self, X):
        return self
    def transform(self, X):
        grid = [
                {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
                {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
                ]
        grid_search = GridSearchCV(self.model, grid, cv = 5, scoring = "neg_mean_squared_error")
        grid_search.fit(X, self.y)
        feature_importances = grid_search.best_estimator_.feature_importances_
        attributes = self.numerical_attributes + self.factor_categories.tolist()
        chosen = sorted(zip(feature_importances, attributes), reverse = True)[:5] 
        return [tup[1] for tup in chosen] 

features_pipe = Pipeline([
    ("Transform", all_pipe),
    ("Getting_attrs", Most(numerical_attributes = numerical_variables + added_variables,
                           factor_categories = housing.ocean_proximity.unique(),
                           y = housing_labels, model = rf3))
])

In [103]:
features_pipe.fit_transform(housing)

['median_income',
 'NEAR OCEAN',
 'population_per_household',
 'bedrooms_per_room',
 'longitude']

# Ejercicio 4

In [30]:
#creacion de un pipeline para los atributos numericos

numerical_pipeline =  Pipeline([
    ("impute", SimpleImputer(strategy = "median")),
    ("adding_attrs", Custom()),
    ("Standarizing", StandardScaler())
])

In [45]:
#creando un columtransformer para procesar los datos categoricos junto con los numericos

num_vars = list(housing)[:-1]
cat_vars = [list(housing)[-1]]
mixed_transformer = ColumnTransformer([
    ("Numerical", numerical_pipeline, num_vars),
    ("Categorical", OneHotEncoder(), cat_vars)
])

In [73]:
#creacion de una clase que prediga sobre un conjunto de datos

class Predictor(BaseEstimator, TransformerMixin):
    def __init__(self, model = None, y = None):
        self.model = model
        self.y = y
    def fit(self,X):
        return self
    def transform(self, X):
        self.model.fit(X, self.y)
        return self.model.predict(X)

In [74]:
rf4 = RandomForestRegressor()

pipeline_predictor = Pipeline([
    ("transform", mixed_transformer),
    ("predict", Predictor(model = rf4, y = housing_labels))
])

In [75]:
pipeline_predictor.fit_transform(housing)

array([266378.  , 328824.  , 224084.  , ..., 100937.  , 210656.  ,
       466197.74])

In [124]:
#forma eficiente
pipeline_predictor2 = Pipeline([
    ("transform", mixed_transformer),
    ("Random_Forest", RandomForestRegressor())
])

In [125]:
pipeline_predictor2.fit(housing, housing_labels)

Pipeline(steps=[('transform',
                 ColumnTransformer(transformers=[('Numerical',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('adding_attrs',
                                                                   Custom()),
                                                                  ('Standarizing',
                                                                   StandardScaler())]),
                                                  ['longitude', 'latitude',
                                                   'housing_median_age',
                                                   'total_rooms',
                                                   'total_bedrooms',
                                                   'population', 'households',
                                 

In [126]:
pipeline_predictor2.predict(housing)

array([272979.  , 327320.04, 230563.  , ..., 105095.  , 210775.  ,
       435950.57])