In [1]:
import pandas as pd
df = pd.read_pickle('df_spain.pkl')

In [2]:
df

Unnamed: 0,Comunidades y Ciudades Autónomas,Provincias,Periodo,NumCompraVentas,TotalPoblación,PrecioVivienda
31,01 Andalucía,04 Almería,2022M01,1220.0,723899.0,11.800358
32,01 Andalucía,11 Cádiz,2022M01,1417.0,1259339.0,11.800358
33,01 Andalucía,14 Córdoba,2022M01,675.0,777414.0,11.800358
34,01 Andalucía,18 Granada,2022M01,1204.0,929968.0,11.800358
35,01 Andalucía,21 Huelva,2022M01,688.0,532865.0,11.800358
...,...,...,...,...,...,...
1638,19 Melilla,52 Melilla,2009M01,33.0,73361.0,11.816417
1639,19 Melilla,52 Melilla,2008M07,71.0,72213.0,11.801842
1640,19 Melilla,52 Melilla,2008M01,79.0,71244.0,11.818585
1641,19 Melilla,52 Melilla,2007M07,83.0,70080.0,11.792366


### Pre-processing Pipeline
##### En esta sección, llevamos a cabo los pasos de preprocesamiento de datos para preparar los conjuntos de datos para la implementación del algoritmo de aprendizaje automático.
### Encoding
##### Los algoritmos de aprendizaje automático normalmente sólo pueden tener valores numéricos como variables de predicción. Por lo tanto, la codificación de etiquetas es necesaria, ya que codifican las etiquetas categóricas con valores numéricos. Para evitar la introducción de la importancia de las características categóricas con un gran número de valores únicos, utilizaremos tanto la codificación de etiquetas como la codificación de un solo valor, como se muestra a continuación.




In [6]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Create a label encoder object
le = LabelEncoder()

In [7]:
print(df.dtypes)
print(df.shape)
df.head()

Comunidades y Ciudades Autónomas     object
Provincias                           object
Periodo                              object
NumCompraVentas                     float64
TotalPoblación                      float64
PrecioVivienda                      float64
dtype: object
(1612, 6)


Unnamed: 0,Comunidades y Ciudades Autónomas,Provincias,Periodo,NumCompraVentas,TotalPoblación,PrecioVivienda
31,01 Andalucía,04 Almería,2022M01,1220.0,723899.0,11.800358
32,01 Andalucía,11 Cádiz,2022M01,1417.0,1259339.0,11.800358
33,01 Andalucía,14 Córdoba,2022M01,675.0,777414.0,11.800358
34,01 Andalucía,18 Granada,2022M01,1204.0,929968.0,11.800358
35,01 Andalucía,21 Huelva,2022M01,688.0,532865.0,11.800358


In [8]:
# La codificación de etiquetas se utilizará para las columnas con 5 o menos valores únicos
le_count = 0
for col in df.columns[1:]:
    if df[col].dtype == 'object':
        if len(list(df[col].unique())) <= 10:
            le.fit(df[col])
            df[col] = le.transform(df[col])
            le_count += 1
print('{} columns were label encoded.'.format(le_count))

0 columns were label encoded.


In [9]:
# convertir el resto de la variable categórica en dummy
df = pd.get_dummies(df, drop_first=True)

In [10]:
print(df.dtypes)
print(df.shape)
df.head()

NumCompraVentas                                                float64
TotalPoblación                                                 float64
PrecioVivienda                                                 float64
Comunidades y Ciudades Autónomas_02 Aragón                       uint8
Comunidades y Ciudades Autónomas_03 Asturias, Principado de      uint8
                                                                ...   
Periodo_2020M01                                                  uint8
Periodo_2020M07                                                  uint8
Periodo_2021M01                                                  uint8
Periodo_2021M07                                                  uint8
Periodo_2022M01                                                  uint8
Length: 102, dtype: object
(1612, 102)


Unnamed: 0,NumCompraVentas,TotalPoblación,PrecioVivienda,Comunidades y Ciudades Autónomas_02 Aragón,"Comunidades y Ciudades Autónomas_03 Asturias, Principado de","Comunidades y Ciudades Autónomas_04 Balears, Illes",Comunidades y Ciudades Autónomas_05 Canarias,Comunidades y Ciudades Autónomas_06 Cantabria,Comunidades y Ciudades Autónomas_07 Castilla y León,Comunidades y Ciudades Autónomas_08 Castilla - La Mancha,...,Periodo_2017M07,Periodo_2018M01,Periodo_2018M07,Periodo_2019M01,Periodo_2019M07,Periodo_2020M01,Periodo_2020M07,Periodo_2021M01,Periodo_2021M07,Periodo_2022M01
31,1220.0,723899.0,11.800358,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
32,1417.0,1259339.0,11.800358,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
33,675.0,777414.0,11.800358,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
34,1204.0,929968.0,11.800358,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
35,688.0,532865.0,11.800358,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


### Feature Scaling
#El escalado de características utilizando MinMaxScaler esencialmente reduce el rango de tal manera que el rango está ahora entre 0 y n. Los algoritmos de aprendizaje automático funcionan mejor cuando las variables numéricas de entrada caen dentro de una escala similar. En este caso, estamos escalando entre 0 y 5.

In [11]:
# import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 5))
df_col = list(df.columns)
df_col.remove('PrecioVivienda')
for col in df_col:
    df[col] = df[col].astype(float)
    df[[col]] = scaler.fit_transform(df[[col]])
df['PrecioVivienda'] = pd.to_numeric(df['PrecioVivienda'], downcast='float')
df.head()

Unnamed: 0,NumCompraVentas,TotalPoblación,PrecioVivienda,Comunidades y Ciudades Autónomas_02 Aragón,"Comunidades y Ciudades Autónomas_03 Asturias, Principado de","Comunidades y Ciudades Autónomas_04 Balears, Illes",Comunidades y Ciudades Autónomas_05 Canarias,Comunidades y Ciudades Autónomas_06 Cantabria,Comunidades y Ciudades Autónomas_07 Castilla y León,Comunidades y Ciudades Autónomas_08 Castilla - La Mancha,...,Periodo_2017M07,Periodo_2018M01,Periodo_2018M07,Periodo_2019M01,Periodo_2019M07,Periodo_2020M01,Periodo_2020M07,Periodo_2021M01,Periodo_2021M07,Periodo_2022M01
31,0.6989,0.488744,11.800358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
32,0.811755,0.888317,11.800358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
33,0.386687,0.52868,11.800358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
34,0.689734,0.642523,11.800358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
35,0.394134,0.346184,11.800358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [12]:
print(df.dtypes)
print('Size of Full Encoded Dataset: {}'. format(df.shape))

NumCompraVentas                                                float64
TotalPoblación                                                 float64
PrecioVivienda                                                 float32
Comunidades y Ciudades Autónomas_02 Aragón                     float64
Comunidades y Ciudades Autónomas_03 Asturias, Principado de    float64
                                                                ...   
Periodo_2020M01                                                float64
Periodo_2020M07                                                float64
Periodo_2021M01                                                float64
Periodo_2021M07                                                float64
Periodo_2022M01                                                float64
Length: 102, dtype: object
Size of Full Encoded Dataset: (1612, 102)


In [13]:
### Splitting data into training and testing sets

In [14]:
#Antes de implementar o aplicar cualquier algoritmo de aprendizaje automático, debemos desacoplar los conjuntos de datos de entrenamiento y de prueba de nuestro marco de datos principal.

In [15]:
# assign the target to a new dataframe and convert it to a numerical feature
target = df['PrecioVivienda'].copy()

In [16]:
type(target)

pandas.core.series.Series

In [17]:
# let's remove the target feature and redundant features from the dataset
df.drop(['PrecioVivienda'], axis=1, inplace=True)
print('Size of Full dataset is: {}'.format(df.shape))

Size of Full dataset is: (1612, 101)


In [18]:
from sklearn.model_selection import train_test_split
#Splitting the data
X_train, X_test, y_train, y_test = train_test_split(df,
                                                    target,
                                                    test_size=0.25,
                                                    random_state=7)
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

Number transactions X_train dataset:  (1209, 101)
Number transactions y_train dataset:  (1209,)
Number transactions X_test dataset:  (403, 101)
Number transactions y_test dataset:  (403,)


# REGRESION LINEAL

In [19]:
from sklearn.linear_model import LinearRegression
#Model training
modelLinearRegression = LinearRegression()
modelLinearRegression.fit(X_train, y_train)

LinearRegression()

In [20]:
#Prediction of model
y_pred_lr = modelLinearRegression.predict(X_test)

In [21]:
#MODEL EVALUATION

In [22]:
from sklearn import metrics
MAE= metrics.mean_absolute_error(y_test, y_pred_lr)

In [23]:
MSE= metrics.mean_squared_error(y_test, y_pred_lr)

In [24]:
import numpy as np
RMSE= np.sqrt(MSE)

In [25]:
print (MAE, MSE, RMSE)

0.03863639926200469 0.002543274226026331 0.05043088563595063


In [26]:
#y_pred = model.predict(X_test)
R_squared_value = modelLinearRegression.score(X_test, y_test)

print(R_squared_value)

0.886492492052116


# RANDOM FOREST

In [27]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(bootstrap=False, max_features=30, n_estimators=300,
                      random_state=42)
#Antes de la mejora
#forest_reg = RandomForestRegressor(max_features=10, n_estimators=60, random_state=42)

forest_reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=False, max_features=30, n_estimators=300,
                      random_state=42)

In [28]:
y_pred_rf = forest_reg.predict(X_test)

In [29]:
#MODEL EVALUATION

In [30]:
MAE_RF= metrics.mean_absolute_error(y_test, y_pred_rf)

In [31]:
MSE_RF= metrics.mean_squared_error(y_test, y_pred_rf)

In [32]:
RMSE_RF= np.sqrt(MSE_RF)

In [33]:
print (MAE_RF, MSE_RF, RMSE_RF)

0.02970551483684376 0.0024011377070017915 0.04900140515334016


In [34]:
R_squared_value = forest_reg.score(X_test, y_test)

print(R_squared_value)

0.8928361108006415


In [35]:
#Fine tune for your model -> BUSCAMOS MEJORES PARÁMETROS
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30, 60, 90, 150, 200, 250, 300, 350, 400], 'max_features': ['auto', 'sqrt', 'log2', 1, 2, 3, 4, 5, 10, 20, 30, 35]},
    ]

forest_reg = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_features': ['auto', 'sqrt', 'log2', 1, 2, 3, 4,
                                           5, 10, 20, 30, 35],
                          'n_estimators': [3, 10, 30, 60, 90, 150, 200, 250,
                                           300, 350, 400]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [36]:
grid_search.best_params_

{'max_features': 30, 'n_estimators': 300}

In [37]:
grid_search.best_estimator_

RandomForestRegressor(max_features=30, n_estimators=300, random_state=42)

# SVR

In [38]:
from sklearn.svm import SVR
svr_regressor = SVR(kernel='linear')
svr_regressor.fit(X_train,y_train)

SVR(kernel='linear')

In [39]:
y_pred_svr = svr_regressor.predict(X_test)

In [40]:
#MODEL EVALUATION

In [41]:
MAE_SVR= metrics.mean_absolute_error(y_test, y_pred_svr)

In [42]:
MSE_SVR= metrics.mean_squared_error(y_test, y_pred_svr)

In [43]:
RMSE_SVR= np.sqrt(MSE_SVR)

In [44]:
print (MAE_SVR, MSE_SVR, RMSE_SVR)

0.049977196614524946 0.0037728902385113713 0.06142385724220982


In [45]:
R_squared_value = svr_regressor.score(X_test, y_test)

print(R_squared_value)

0.8316141592786739


# SGD regressor

In [46]:
from sklearn.linear_model import SGDRegressor

sgdr = SGDRegressor()
sgdr.fit(X_train,y_train)

SGDRegressor()

In [47]:
y_pred_sgdr = sgdr.predict(X_test)

In [48]:
#MODEL EVALUATION

In [49]:
MAE_SGDR= metrics.mean_absolute_error(y_test, y_pred_sgdr)

In [50]:
MSE_SGDR= metrics.mean_squared_error(y_test, y_pred_sgdr)

In [51]:
RMSE_SGDR= np.sqrt(MSE_SGDR)

In [52]:
print (MAE_SGDR, MSE_SGDR, RMSE_SGDR)

0.14342384505741473 0.08369626197104608 0.28930306249856064


In [53]:
R_squared_value = sgdr.score(X_test, y_test)

print(R_squared_value)

-2.7354029792257024


# XGBoost regressor

In [54]:
import xgboost as xgb

xgb_model = xgb.XGBRegressor(objective="reg:linear", random_state=42)
xgb_model.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, objective='reg:linear', predictor='auto',
             random_state=42, reg_alpha=0, ...)

In [55]:
y_pred_xgb = xgb_model.predict(X_test)

In [56]:
#MODEL EVALUATION

In [57]:
MAE_xgb= metrics.mean_absolute_error(y_test, y_pred_xgb)

In [58]:
MSE_xgb= metrics.mean_squared_error(y_test, y_pred_xgb)

In [59]:
RMSE_xgb= np.sqrt(MSE_xgb)

In [60]:
print (MAE_xgb, MSE_xgb, RMSE_xgb)

0.028959684 0.0019375305 0.04401739


In [61]:
R_squared_value = xgb_model.score(X_test, y_test)

print(R_squared_value)

0.9135271209014462
