In [3]:
import pandas as pd
import numpy as np
from concat_stations import concat_files
from data_prep import get_param_dist
from rf_optimize import evaluate

import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.model_selection import RandomizedSearchCV


In [6]:
# Load and prepare the data

# concat_files() is a project helper that supplies the combined frame.
full_data = concat_files()

# Keep only rows that have a value in every column. This replaces the
# original per-column `notna()` loop with one pass and no leftover loop
# variable in the kernel namespace.
full_data = full_data.dropna()

# One-hot encode the non-numeric columns (the original note mentions weekday).
full_data = pd.get_dummies(full_data)

# Every column except the target is a feature.
feature_cols = list(full_data.drop('bikes', axis=1).columns)

y = np.array(full_data['bikes'])       # target variable
X = np.array(full_data[feature_cols])  # feature matrix as a numpy array

print(X.shape)
print(y.shape)

# Hold out 25% of the rows for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


(42600, 30)
(42600,)


In [9]:
# Hyperparameter distributions to sample from (provided by the project helper).
random_grid = get_param_dist()

# Base estimator handed to the search. Any hyperparameter that also appears in
# random_grid is replaced by the sampled value for each candidate, so
# n_estimators=1000 only applies if the grid does not include n_estimators.
rf = RandomForestRegressor(n_estimators=1000, random_state=42)

# Randomized search over the distributions:
#   n_iter - number of parameter combinations to sample
#   cv     - number of cross-validation folds per combination
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Run the search on the training split.
rf_random.fit(X_train, y_train)

print(rf_random.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 49.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 124.9min finished
{'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}


In [10]:
# Take the best estimator found by the search and score it on the held-out
# test split; evaluate() is a project helper whose result is unpacked as
# (mae, r2_score).
best_random = rf_random.best_estimator_
mae, r2_score = evaluate(best_random, X_test, y_test)

r2_report = "r2 error with best paramters {}"
print(r2_report.format(r2_score))

r2 error with best paramters 0.8126948026732219


In [11]:
# Show the winning hyperparameters and append them to the metrics log.
best_params = rf_random.best_params_
print(best_params)

# The file is opened in append mode, so each record is written on its own line
# (JSON Lines). Without the trailing newline, successive records fuse into
# `{...}{...}` and can no longer be parsed back.
with open('output_metrics.txt', 'a') as metrics_file:
    metrics_file.write(json.dumps(best_params) + "\n")  # read back with `json.loads` per line

{'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}


In [12]:

# Append the test-set scores of the tuned model to the metrics log.
data = {"model_type": "rf_best_params", "r2_score": r2_score, "mae": mae}

# One JSON record per line: the file is opened in append mode, and without a
# trailing newline this record would fuse with its neighbours and the file
# could not be parsed back.
with open('output_metrics.txt', 'a') as metrics_file:
    metrics_file.write(json.dumps(data) + "\n")  # read back with `json.loads` per line

In [13]:
# Append the best hyperparameters to the metrics log, one JSON record per line
# so the appended records stay individually parseable.
# NOTE(review): an earlier cell already appends `best_params` to this same
# file, so running both stores the record twice - confirm this is intended.
with open('output_metrics.txt', 'a') as metrics_file:
    metrics_file.write(json.dumps(best_params) + "\n")  # read back with `json.loads` per line


In [14]:
# Print the selected estimator to see the hyperparameters it was built with.
print(best_random)

RandomForestRegressor(bootstrap=False, max_features='sqrt', n_estimators=400,
                      random_state=42)
