In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


# Read data

In [None]:
# Load the training and testing CSV files from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('./training.csv', './testing.csv')
)

# Columns at positions 2..29 are the model inputs; 'Appliances' is the target.
x_train, y_train = train_data.values[:, 2:30], train_data['Appliances'].values
x_test, y_test = test_data.values[:, 2:30], test_data['Appliances'].values

# Use BorutaPy to do feature selection

In [None]:
# Base estimator that Boruta will refit repeatedly; n_jobs=-1 requests all CPU cores.
rf = RandomForestRegressor(
    max_depth=100,
    n_jobs=-1,
)

In [None]:
# Wrap the forest in Boruta. n_estimators='auto' leaves the tree count to
# Boruta, and random_state=1 seeds the selection procedure.
feat_selector = BorutaPy(
    rf,
    random_state=1,
    verbose=1,
    n_estimators='auto',
)

In [None]:
# Run the Boruta selection on the training split only.
feat_selector.fit(X=x_train, y=y_train)

In [None]:
# View the ranking of all features (a ranking of 1 marks a selected feature).
feat_selector.ranking_

In [None]:
# Number of selected features, determined automatically by the selector.
feat_selector.n_features_

In [None]:
# Reduce both splits to the columns the fitted selector kept.
x_transformed, test_transformed = (
    feat_selector.transform(X=features) for features in (x_train, x_test)
)

# Use transformed input X to train a new model

In [None]:
# Final model, trained on the Boruta-selected columns only.
# random_state is fixed so that a fresh Restart & Run All rebuilds the same
# forest and therefore reports the same predictions and error metrics.
rf_train = RandomForestRegressor(n_jobs=-1, max_depth=100, random_state=1)
rf_train.fit(x_transformed, y_train)

In [None]:
# Predict the target for the test split, using the same selected columns.
y_pred = rf_train.predict(X=test_transformed)

In [None]:
# Compare the value distributions of predictions and ground truth.
# Each series is sorted independently, so the curves show how well the ranges
# of values agree; they do not show per-sample accuracy.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(np.arange(len(y_pred)), np.sort(y_pred),
        color='blue', linewidth=1, label='Predicted (sorted)')
ax.plot(np.arange(len(y_test)), np.sort(y_test),
        color='red', linewidth=1, label='Actual (sorted)')

# Ticks stay hidden as before; the legend and title identify the two curves.
ax.set_xticks(())
ax.set_yticks(())
ax.set_title('Sorted predicted vs. sorted actual Appliances values (test set)')
ax.legend()

plt.show()

# Calculate MAE, MAPE, R2 and RMS

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error between two sequences.

    Both inputs are converted to NumPy arrays. The result is the mean of
    ``|(y_true - y_pred) / y_true|`` multiplied by 100. Zero entries in
    ``y_true`` are not treated specially.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return np.mean(relative_error) * 100

def cal_errors(y_test, y_pred):
    """Print MAE, RMS, R2 and MAPE for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_test``.

    Returns
    -------
    None
        The metrics are only printed.
    """
    mae = mean_absolute_error(y_test, y_pred)
    # The value is reported as "RMS", so take the square root of the mean
    # squared error; without it the printed number would be the MSE.
    rms = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print('MAE = {}, RMS = {}, R2 = {}, MAPE = {}'.format(mae, rms, r2, mape))

# Report the error metrics of the final model on the test split.
cal_errors(y_test=y_test, y_pred=y_pred)