In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from joblib import dump

In [2]:
X_train_all_indices = np.load('../data/X_train_all_indices.npy')
y_train = np.load('../data/y_train.npy')

# Model Training

In [9]:
model_all_indices = ExtraTreesRegressor()

param_grid = {
    'n_estimators': [250, 500, 750],
    'max_depth': [10, 20, None],
    'min_samples_split': [10, 20, 30],
    'min_samples_leaf': [5, 10, 20],
    'max_features': [0.5, 0.75, 1.0]
}

grid_search = GridSearchCV(model_all_indices, param_grid, cv=3, n_jobs=-1, verbose=1, scoring='neg_mean_absolute_error')
grid_search.fit(X_train_all_indices, y_train)

Fitting 3 folds for each of 243 candidates, totalling 729 fits


In [10]:
print(f"Best parameters: {grid_search.best_params_}")

Best parameters: {'max_depth': 20, 'max_features': 1.0, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 500}


['../models/random_forest.joblib']

In [3]:
model_all_indices = ExtraTreesRegressor()

param_grid = {
    'n_estimators': [500],
    'max_depth': [20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5],
    'max_features': [0.9, 1.0]
}

grid_search = GridSearchCV(model_all_indices, param_grid, cv=3, n_jobs=-1, verbose=1, scoring='neg_mean_absolute_error')
grid_search.fit(X_train_all_indices, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits




In [4]:
print(f"Best parameters: {grid_search.best_params_}")
model_all_indices = grid_search.best_estimator_
model_all_indices.fit(X_train_all_indices, y_train)
dump(model_all_indices, '../models/extra_trees.joblib')

Best parameters: {'max_depth': 20, 'max_features': 1.0, 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 500}


['../models/extra_trees.joblib']

In [6]:
X_test_all_indices = np.load('../data/X_test_all_indices.npy')
y_test = np.load('../data/y_test.npy')

In [7]:
y_pred_all_indices = model_all_indices.predict(X_test_all_indices)

In [8]:
results = pd.DataFrame(columns=['Model', 'MAE'])
results.loc[0] = ['RF', mean_absolute_error(y_test, y_pred_all_indices)]
results

Unnamed: 0,Model,MAE
0,RF,4.686991
