In [20]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
dataframe = pd.read_csv("csv/82000278_Toamnei_PM10_2021_12.csv")
dataframe.head()

Unnamed: 0,time,latitude,longitude,altitude,pm10
0,1638309618,45.651464,25.615426,538,9
1,1638309678,45.651464,25.615426,538,9
2,1638309738,45.651464,25.615426,538,10
3,1638309798,45.651464,25.615426,538,9
4,1638309858,45.651464,25.615426,538,9


In [8]:
X = dataframe.select_dtypes(include="number").drop("pm10", axis=1)
y = dataframe.pm10

X_test = dataframe.select_dtypes(include="number")

X_test.head()

Unnamed: 0,time,latitude,longitude,altitude,pm10
0,1638309618,45.651464,25.615426,538,9
1,1638309678,45.651464,25.615426,538,9
2,1638309738,45.651464,25.615426,538,10
3,1638309798,45.651464,25.615426,538,9
4,1638309858,45.651464,25.615426,538,9


In [13]:
from sklearn.impute import SimpleImputer

# Impute both train and test sets
imputer = SimpleImputer(strategy="mean")
X = imputer.fit_transform(X)
X_test = imputer.fit_transform(X_test)

In [14]:
%%time

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3)

# Fit a base model
forest = RandomForestRegressor()

_ = forest.fit(X_train, y_train)

print(f"R2 for training set: {forest.score(X_train, y_train)}")
print(f"R2 for validation set: {forest.score(X_valid, y_valid)}\n")


R2 for training set: 0.9955765080737962
R2 for validation set: 0.9865479139310879

CPU times: total: 4.88 s
Wall time: 5.26 s


In [17]:
forest.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [21]:
n_estimators = np.arange(100, 2000, step=100)
max_features = ["auto", "sqrt", "log2"]
max_depth = list(np.arange(10, 100, step=10)) + [None]
min_samples_split = np.arange(2, 10, step=2)
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

param_grid = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "bootstrap": bootstrap,
}

param_grid

{'n_estimators': array([ 100,  200,  300,  400,  500,  600,  700,  800,  900, 1000, 1100,
        1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900]),
 'max_features': ['auto', 'sqrt', 'log2'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, None],
 'min_samples_split': array([2, 4, 6, 8]),
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [22]:
from sklearn.model_selection import RandomizedSearchCV

forest = RandomForestRegressor()

random_cv = RandomizedSearchCV(
    forest, param_grid, n_iter=100, cv=3, scoring="r2", n_jobs=-1
)

In [23]:
%%time

_ = random_cv.fit(X, y)

print("Best params:\n")
print(random_cv.best_params_)

random_cv.best_score_

Best params:

{'n_estimators': 1400, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 20, 'bootstrap': True}
CPU times: total: 1min 4s
Wall time: 46min 6s


-2.0277033209013915

In [24]:
n_iterations = 1

for value in param_grid.values():
    n_iterations *= len(value)
    
n_iterations

13680

In [1]:
new_params = {
    "n_estimators": [1300, 1400, 1500],
    "max_features": ['auto'],
    "max_depth": [15, 20, 25],
    "min_samples_split": [4, 6, 8],
    "min_samples_leaf": [5, 6, 7],
    "bootstrap": [True],
}

new_params

{'n_estimators': [1300, 1400, 1500],
 'max_features': ['auto'],
 'max_depth': [15, 20, 25],
 'min_samples_split': [4, 6, 8],
 'min_samples_leaf': [5, 6, 7],
 'bootstrap': [True]}

In [35]:
n_iterations = 1

for value in new_params.values():
    n_iterations *= len(value)
    
n_iterations

81

In [36]:
from sklearn.model_selection import GridSearchCV

forest = RandomForestRegressor()

grid_cv = GridSearchCV(forest, new_params, n_jobs=-1)

In [37]:
%%time

_ = grid_cv.fit(X, y)

print('Best params:\n')
print(grid_cv.best_params_, '\n')

Best params:

{'bootstrap': True, 'max_depth': 25, 'max_features': 'auto', 'min_samples_leaf': 7, 'min_samples_split': 4, 'n_estimators': 1500} 

CPU times: total: 1min 6s
Wall time: 1h 20min 1s


In [38]:
grid_cv.best_score_

-0.5925383200748817