<a href="https://colab.research.google.com/github/Pushkarp26/Machine-Learning-of-energy-use-of-appliances-in-alow-energy-house/blob/main/Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import math
from google.colab import files
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

#**Data**
From the previous notebooks I have saved the predictor and target feature datasets.
So, will use them directly here instead of doing all the preliminary steps.

In [None]:
X = pd.read_csv("predictor.csv")
y = pd.read_csv("target.csv")
X.drop("Unnamed: 0",axis=1,inplace=True)
y.drop("Unnamed: 0",axis=1,inplace=True)

print("Predictor features:\n {}\n\nTarget Features:\n {}".format(X.head(),y.head()))

Predictor features:
    Kitchen_Temp  Kitchen_Humidity  Living_room_Temp  ...        rv2  Weekday    NSM
0         19.89         47.596667              19.2  ...  13.275433        0  61200
1         19.89         46.693333              19.2  ...  18.606195        0  61800
2         19.89         46.300000              19.2  ...  28.642668        0  62400
3         19.89         46.066667              19.2  ...  45.410389        0  63000
4         19.89         46.333333              19.2  ...  10.084097        0  63600

[5 rows x 28 columns]

Target Features:
    Total_Energy_Consumption
0                        90
1                        90
2                        80
3                        90
4                       100


Splitting the data into Training and Testing sets.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

y_array = np.asarray(y)
y_array = y_array.ravel()

X_train,X_test,y_train,y_test = train_test_split(X,y_array,test_size=0.3)
print("X_train:",X_train.shape,"\t","y_train:",y_train.shape,"\n",'X_test:',
       X_test.shape,"\t","y_test:",y_test.shape)

X_train: (13814, 28) 	 y_train: (13814,) 
 X_test: (5921, 28) 	 y_test: (5921,)


#**Base Model**



In [None]:
rfc = RandomForestRegressor(n_estimators=10,random_state=42)
rfc.fit(X_train,y_train)                                                         #training the model

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

Checking the *Accuracy* score of our model

In [None]:
print("Accuracy Score for Train data: {}\n Accuracy Score for Test data: {}".
      format(rfc.score(X_train,y_train),rfc.score(X_test,y_test)))                                                      #

Accuracy Score for Train data: 0.914981091806844
 Accuracy Score for Test data: 0.5188051358559009


Evaluating errors

In [None]:
from sklearn import metrics
pred = rfc.predict(X_test)
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, pred))   #mean absolute error
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, pred))     #mean squared error
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(
    y_test, pred)))                                                              #Root Mean Squared Error 
# mape = np.mean(np.abs((y_test - pred) / np.abs(y_test)))
# print('Mean Absolute Percentage Error (MAPE):', round(mape * 100, 2))
# print('Accuracy:', round(100*(1 - mape), 2))

Mean Absolute Error (MAE): 32.10103023137984
Mean Squared Error (MSE): 4261.902837358555
Root Mean Squared Error (RMSE): 65.28325081794377


#**RandomSearchCV**

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 20)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 110, num = 22)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = 
                               random_grid, n_iter = 100, cv = 3, verbose=2,
                               random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 26.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 51.5min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [None]:
rf_random.best_params_

{'bootstrap': False,
 'max_depth': 95,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 90}

In [None]:
rf_random.cv_results_

#**Evaluation Function**

In [None]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

#####**Evaluate the Default Model**


In [None]:
base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

Model Performance
Average Error: 34.5386 degrees.
Accuracy = 65.66%.


#####**Evaluate the Best Random Search Model**


In [None]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

Model Performance
Average Error: 29.2471 degrees.
Accuracy = 71.56%.


In [None]:
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Improvement of 8.98%.


#**Grid Search**


We can now perform grid search building on the result from the random search. We will test a range of hyperparameters around the best values returned by random search.

In [None]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [False],
    'max_depth': [100,110,120,130,140,150],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [1, 2, 4, 6],
    'n_estimators': [80, 100, 110, 120]
}

# Create a base model
rf = RandomForestRegressor(random_state = 42)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, return_train_score=True)

In [None]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed: 21.9min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed: 39.1min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed: 53.0min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs

In [28]:
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 100,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 120}

#####**Evaluate the Best Model from Grid Search**


In [29]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)

Model Performance
Average Error: 29.0743 degrees.
Accuracy = 71.64%.


In [31]:
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) /
                                        base_accuracy))


Improvement of 9.10%.


#**Conclusion**
Chronologically we performed RandomSearchCV first on the model, where we the error was MAE was significantly reduced from 34.586 to 29.247 and ACCURACY increased dramtically from 65.66% to 71.56%.

Then we performed GridsearchCV from the around the best parameters obtained from RandomSearchCV.Then the MAE was slightly reduced from 29.247 to 29.073 and ACCURACY achieved was 71.64%.

So from the above results it is evident that furter tuning won't make much difference in the ACCURACY and thus we concluded our quest of obtainig the best model.