In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer

import matplotlib.pyplot as plt

from utils import preprocess

In [2]:
# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [3]:
# Directory layout; every path below is relative to the current working directory.
DATA_DIR = './data/'
IMG_DIR = './img/'
PROCESSED_DATA_DIR = './processed_data/'

# Raw input files. TEST_FILE previously pointed at 'train.csv' as well, which
# made it an alias of TRAIN_FILE.
# NOTE(review): assumes the held-out file is named 'test.csv' — confirm against ./data/.
TRAIN_FILE = DATA_DIR + 'train.csv'
TEST_FILE = DATA_DIR + 'test.csv'

# Preprocessed training table that the model-selection cells read.
PROCESSED_TRAIN_FILE = PROCESSED_DATA_DIR + 'processed_train.csv'

# Auxiliary data shipped alongside the raw files.
AUX_DATA_DIR = DATA_DIR + 'auxiliary-data/'
SUBZONE_FILE = AUX_DATA_DIR + 'sg-subzones.csv'

# Model Selection

In [4]:
# Load the preprocessed training table from PROCESSED_TRAIN_FILE into a DataFrame.
df_train = pd.read_csv(PROCESSED_TRAIN_FILE)

In [5]:
# Positional split of the processed frame: the first eight columns are used as
# features and the column at index 8 as the target.
df_X = df_train.iloc[:, :8]
df_y = df_train.iloc[:, 8]
X = df_X.to_numpy()
y = df_y.to_numpy()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable
# across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

[[0.00000000e+00 1.00000000e+00 3.00000000e+00 ... 1.41439935e+00
  1.03837196e+02 5.68125814e+02]
 [0.00000000e+00 2.00000000e+00 4.00000000e+00 ... 1.37259680e+00
  1.03875625e+02 6.09170390e+02]
 [2.00000000e+00 5.00000000e+00 4.00000000e+00 ... 1.29877260e+00
  1.03895798e+02 2.66465939e+03]
 ...
 [2.00000000e+00 5.00000000e+00 4.00000000e+00 ... 1.31596110e+00
  1.03836848e+02 2.61727856e+03]
 [0.00000000e+00 4.00000000e+00 3.00000000e+00 ... 1.44075330e+00
  1.03806671e+02 5.40439333e+02]
 [2.00000000e+00 5.00000000e+00 4.00000000e+00 ... 1.31596110e+00
  1.03836848e+02 2.61727856e+03]]
[ 514500.  995400. 8485000. ... 4193700.  754800. 4178000.]


In [6]:
def exhaustive_search(name, estimator, parameters, n_jobs=-1):
    """Grid-search ``estimator`` over ``parameters`` and report train/test RMSE.

    The search is fitted on the notebook-level ``X_train``/``y_train`` arrays;
    the refitted best model is then scored on ``X_train`` and ``X_test``.

    Parameters
    ----------
    name
        Label for the model family, used only in the printed report.
    estimator
        Unfitted regressor passed to ``GridSearchCV``.
    parameters
        Parameter grid passed to ``GridSearchCV``.
    n_jobs : int, default=-1
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search object, so callers can inspect ``best_params_`` or
        reuse the best model instead of only reading the printed summary.
    """
    # GridSearchCV always *maximises* its score. The previous scorer,
    # make_scorer(mean_squared_error, squared=False), reported plain RMSE as
    # "greater is better", so the search selected the configuration with the
    # largest error. The built-in negated-RMSE scorer ranks candidates correctly
    # (the verbose log therefore shows negative scores).
    search = GridSearchCV(
        estimator,
        parameters,
        scoring='neg_root_mean_squared_error',
        verbose=4,
        n_jobs=n_jobs,
    )
    search.fit(X_train, y_train)

    # Store the parameters of the best model
    best_params = search.best_params_

    # Predict target values with the best found parameters
    y_pred = search.predict(X_test)
    y_pred_train = search.predict(X_train)

    # RMSE is computed as sqrt(MSE) so the code does not depend on the `squared`
    # keyword of mean_squared_error, which newer scikit-learn releases dropped.
    best_loss = np.sqrt(mean_squared_error(y_test, y_pred))
    best_loss_train = np.sqrt(mean_squared_error(y_train, y_pred_train))

    print('Best {} regressor: {}'.format(name, best_params))
    print('RMSE Loss: Train - {:.3f}, Test - {:.3f}'.format(best_loss_train, best_loss))

    return search

In [7]:
# Name of the AdaBoost constructor argument that holds the weak learner. It was
# renamed from `base_estimator` to `estimator` in newer scikit-learn releases
# (and the old name was later removed), so use whichever one the installed
# version exposes instead of hard-coding either.
ADABOOST_BASE_PARAM = (
    'estimator'
    if 'estimator' in AdaBoostRegressor().get_params(deep=False)
    else 'base_estimator'
)

# Search space for model selection: one entry per model family, each holding the
# unfitted estimator and the parameter grid handed to `exhaustive_search`.
model_selection_dict = {
    'AdaBoost': {
        'estimator': AdaBoostRegressor(random_state=42),
        'parameters': {
            # Weak learners of increasing depth.
            ADABOOST_BASE_PARAM: [DecisionTreeRegressor(max_depth=3), DecisionTreeRegressor(max_depth=4), DecisionTreeRegressor(max_depth=5)],
            'n_estimators': [400, 600, 800, 1000],
            'learning_rate': [1e-1, 1e-2, 1e-3, 1e-4]
        }
    },
    'RandomForest': {
        'estimator': RandomForestRegressor(random_state=42, n_jobs=-1),
        'parameters': {
            'n_estimators': [100, 200, 300, 400, 1000],
            'max_depth': [10, 20, 30],
            'min_samples_split': [2, 4, 6],
            # Float values are fractions of the features / training rows.
            'max_features': [0.2, 0.6, 1.0],
            'max_samples': [0.2, 0.6, 1.0]
        }
    },
    'ExtraTrees': {
        # bootstrap=True because `max_samples` only applies to bootstrapped forests.
        'estimator': ExtraTreesRegressor(bootstrap=True, random_state=42, n_jobs=-1),
        'parameters': {
            'n_estimators': [100, 200, 300, 400, 1000],
            'max_depth': [10, 20, 30],
            'min_samples_split': [2, 4, 6],
            'max_features': [0.2, 0.6, 1.0],
            'max_samples': [0.2, 0.6, 1.0]
        }
    },
    'GradientBoosting': {
        'estimator': GradientBoostingRegressor(random_state=42),
        'parameters': {
            'n_estimators': [100, 200, 300, 400, 1000],
            'learning_rate': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
            'max_depth': [3, 4, 5],
            'max_features': [0.2, 0.6, 1.0]
        }
    },
    'DecisionTree': {
        # Single-tree baseline; only the split criterion is varied.
        'estimator': DecisionTreeRegressor(random_state=42),
        'parameters': {
            'criterion': ['squared_error', 'friedman_mse']
        }
    }
}

In [8]:
%%time

# NOTE(review): the search loop below is commented out, so this cell currently
# times an empty body. Presumably it was disabled because running every grid in
# `model_selection_dict` is expensive — confirm before re-enabling.
# for name, data in model_selection_dict.items():
#     exhaustive_search(name, data['estimator'], data['parameters'])

CPU times: total: 0 ns
Wall time: 0 ns
