In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd

data_ori = pd.read_csv('../../data/train.csv')
data = data_ori.iloc[:10000]

In [2]:
#data.count()

In [3]:
features = data.columns.drop(['precio','id', 'descripcion', 'titulo', 'fecha', 'direccion'])
data = data.fillna(0)
for col in ['ciudad', 'provincia', 'tipodepropiedad']:
    data[col] = data[col].astype('category')
    data[col] = data[col].cat.codes
X = pd.DataFrame(data, columns = features)
y = data.precio
np.random.seed(seed = 42) #Sirve para darme los mismos sets de entrenamiento y de testeo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8)

In [4]:
from sklearn.ensemble import RandomForestRegressor

In [5]:
rfr = RandomForestRegressor(n_estimators = 100,
                           n_jobs = -1,
                           oob_score = True,
                           bootstrap = True,
                           random_state = 42)
rfr.fit(X_train, y_train)
print('R^2 Training Score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'.format(rfr.score(X_train, y_train),  
                                                                                             rfr.oob_score_,
                                                                                             rfr.score(X_test, y_test)))

R^2 Training Score: 0.95 
OOB Score: 0.61 
R^2 Validation Score: 0.58


# Tuneo de hyperparametros

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


In [7]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'log2']
# Maximum number of levels in tree
max_depth = [2,3,4, None]
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [8]:
rf = RandomForestRegressor()
random_search = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 100, 
                               cv = 5,
                               verbose=2, 
                               random_state=42, 
                               n_jobs = -1)

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   39.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 13.5min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_sta...


In [9]:
best_random = random_search.best_estimator_
best_random

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,
                      max_features='log2', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=3,
                      min_weight_fraction_leaf=0.0, n_estimators=800,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [10]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [3, 4, 5, None],
    'max_features': [2, 3, 'log2'],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 4, 8, 10],
    'n_estimators': [50, 100, 200, 300, 500, 800, 1000, 1300, 1500]
}# Create a based model
rf = RandomForestRegressor()# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf,
                           param_grid = param_grid, 
                           cv = 3, 
                           n_jobs = -1, 
                           verbose = 2)

In [11]:
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 1296 candidates, totalling 3888 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed: 14.7min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 19.4min
[Parallel(n_jobs=-1)]: Done 3273 tasks      | elapsed: 25.9min
[Parallel(n_jobs=-1)]: Done 3888 out of 3888 | elapsed: 33.2min finished


{'bootstrap': True,
 'max_depth': None,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 1500}

In [12]:
best_grid = grid_search.best_estimator_
best_grid

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='log2', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1500,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [13]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [14]:
grid_accuracy = evaluate(best_grid, X_test, y_test)
grid_accuracy

Model Performance
Average Error: 833403.8584 degrees.
Accuracy = 56.47%.


56.46681384260736

In [15]:
random_accuracy = evaluate(best_random, X_test, y_test)
random_accuracy

Model Performance
Average Error: 822765.3927 degrees.
Accuracy = 58.19%.


58.18617854797514

In [18]:
if(grid_accuracy >= random_accuracy): final_params = grid_search.best_params_
else: final_params = random_search.best_params_
final_params

{'n_estimators': 800,
 'min_samples_split': 3,
 'min_samples_leaf': 2,
 'max_features': 'log2',
 'max_depth': None,
 'bootstrap': False}

In [21]:
rf_final = RandomForestRegressor(n_estimators = 800,
                                 min_samples_split = 3,
                                 min_samples_leaf = 2,
                                 max_features = 'log2',
                                 max_depth = None,
                                 bootstrap = False)
rf_final.fit(X_train, y_train)

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,
                      max_features='log2', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=3,
                      min_weight_fraction_leaf=0.0, n_estimators=800,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [22]:
feature_importances = pd.DataFrame(rf_final.feature_importances_, 
                                   index = X_train.columns, 
                                   columns = ['importance']).sort_values('importance', ascending=False)

In [23]:
feature_importances

Unnamed: 0,importance
metroscubiertos,0.323971
banos,0.11031
metrostotales,0.109964
idzona,0.062831
ciudad,0.061967
provincia,0.061694
tipodepropiedad,0.059874
garages,0.047263
habitaciones,0.036147
antiguedad,0.034008
