In [1]:
import json

import joblib
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

import data_prep
from concat_stations import concat_files
from rf_optimize import evaluate


In [2]:

# Load data and concatenate stations
full_data = concat_files()

# Replace NaNs with medians
full_data = data_prep.nan_to_median(full_data)

# Get a dataframe with optimum features
full_data, df_features = data_prep.get_feature_df(full_data)

# Coerce every column to a numeric dtype before removing the outliers.
# DataFrame.apply returns a new frame instead of modifying in place, so the
# result must be assigned back; previously it was discarded and this step
# had no effect.
# NOTE(review): errors='coerce' turns unparseable values into NaN, and this
# runs after the median imputation above -- confirm that every column kept by
# get_feature_df is numeric-like.
full_data = full_data.apply(pd.to_numeric, errors='coerce')
new_full_data = data_prep.remove_outliers(full_data)

# Make target and feature arrays
y = np.array(new_full_data['bikes'])  # array for target variable
# Every entry of df_features except the last one is used as a feature
# (presumably the last entry is 'bikes', the target -- TODO confirm against
# data_prep.get_feature_df).
X = np.array(new_full_data[df_features[0:-1]])

# Hold out 25% of the rows for testing; the fixed seed keeps the split stable
# across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

"""
Now train the model
"""

# Instantiate model with optimised hyperparamters decision trees
rf = RandomForestRegressor(n_estimators = 400, min_samples_split = 2, min_samples_leaf =        1, 
max_features = 'sqrt', max_depth = None, bootstrap = False)
# Train the model on training data
rf.fit(X_train, y_train)


"""
Evaluation
"""
y_pred = rf.predict(X_test) # Use the forest's predict method on the test data
print("mae {}".format(metrics.mean_absolute_error(y_test, y_pred)))
print("r2_error {}".format(metrics.r2_score(y_test, y_pred)))


mae 1.8558846838104481
r2_error 0.8047954939930295


In [5]:
# Save the trained model to local disk.
# joblib is imported with the other dependencies in the notebook's first cell.
MODEL_PATH = "rf_model.joblib"
joblib.dump(rf, MODEL_PATH)

# To restore the model later (joblib files are pickle-based, so only load
# files from a trusted source):
# loaded_rf = joblib.load(MODEL_PATH)

['rf_model.joblib']