In [2]:
import warnings
warnings.filterwarnings("ignore")

import pymongo

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

%matplotlib inline

### Data Ingestion

In [3]:
myclient = pymongo.MongoClient("mongodb+srv://satya:Mypass@cluster1.9frgf.mongodb.net/?retryWrites=true&w=majority", tls=True, tlsAllowInvalidCertificates=True)

mydb = myclient["Household_Power_Consumption_Database"]

collection = mydb["Household_Power_Consumption_Collection"]

In [4]:
mongodb_extract = collection.find()

In [5]:
df_mongodb = pd.DataFrame(mongodb_extract)

df_mongodb.head()

Unnamed: 0,_id,index,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Year,Month,Day,Total_meter_reading,power_consumption
0,636de553d16cf2c632ee0031,0,4.216,0.418,234.84,18.4,0.0,1.0,17.0,2006.0,12,16,18.0,52.266667
1,636de553d16cf2c632ee0032,1,5.36,0.436,233.63,22.8,0.0,1.0,16.0,2006.0,12,16,17.0,62.533333
2,636de553d16cf2c632ee0033,2,5.374,0.498,233.29,22.8,0.0,2.0,17.0,2006.0,12,16,19.0,62.533333
3,636de553d16cf2c632ee0034,3,5.388,0.502,233.74,22.8,0.0,1.0,17.0,2006.0,12,16,18.0,62.533333
4,636de553d16cf2c632ee0035,4,3.666,0.505,235.68,15.8,0.0,1.0,17.0,2006.0,12,16,18.0,43.1


In [6]:
df_mongodb.drop(['_id', 'index'], axis = 1, inplace = True)

df_mongodb.head()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Year,Month,Day,Total_meter_reading,power_consumption
0,4.216,0.418,234.84,18.4,0.0,1.0,17.0,2006.0,12,16,18.0,52.266667
1,5.36,0.436,233.63,22.8,0.0,1.0,16.0,2006.0,12,16,17.0,62.533333
2,5.374,0.498,233.29,22.8,0.0,2.0,17.0,2006.0,12,16,19.0,62.533333
3,5.388,0.502,233.74,22.8,0.0,1.0,17.0,2006.0,12,16,18.0,62.533333
4,3.666,0.505,235.68,15.8,0.0,1.0,17.0,2006.0,12,16,18.0,43.1


##### Creating Dependent and Independent features

In [7]:
X = df_mongodb.drop("Total_meter_reading", axis =1)

y = df_mongodb["Total_meter_reading"]

In [8]:
X.head()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Year,Month,Day,power_consumption
0,4.216,0.418,234.84,18.4,0.0,1.0,17.0,2006.0,12,16,52.266667
1,5.36,0.436,233.63,22.8,0.0,1.0,16.0,2006.0,12,16,62.533333
2,5.374,0.498,233.29,22.8,0.0,2.0,17.0,2006.0,12,16,62.533333
3,5.388,0.502,233.74,22.8,0.0,1.0,17.0,2006.0,12,16,62.533333
4,3.666,0.505,235.68,15.8,0.0,1.0,17.0,2006.0,12,16,43.1


In [9]:
y.head()

0    18.0
1    17.0
2    19.0
3    18.0
4    18.0
Name: Total_meter_reading, dtype: float64

##### Train test split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [11]:
print("Independent train dataset contains {} rows and {} columns.".format(X_train.shape[0],X_train.shape[1]))
print("Independent test dataset contains {} rows and {} columns.".format(X_test.shape[0],X_test.shape[1]))
print("Dependent train dataset contains {} rows.".format(y_train.shape[0]))
print("Dependent test dataset contains {} rows.".format(y_test.shape[0]))

Independent train dataset contains 33496 rows and 11 columns.
Independent test dataset contains 16499 rows and 11 columns.
Dependent train dataset contains 33496 rows.
Dependent test dataset contains 16499 rows.


###### Feature scaling

In [12]:
scaler = StandardScaler()

In [13]:
X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)

### Model Training

In [14]:
decision_tree_regressor_model = DecisionTreeRegressor(criterion='mse')

decision_tree_regressor_model.fit(X_train,y_train)

In [None]:
from sklearn import tree

import matplotlib.pyplot as plt

fig = plt.figure(figsize=(30,15))

tree.plot_tree(decision_tree_regressor_model, filled = True, fontsize =10)

In [None]:
fig.savefig('Decision Tree Reg')

In [17]:
y_pred = decision_tree_regressor_model.predict(X_test)

In [18]:
def print_metrics(y_test,Prediction):
    print('Mean Squared Error is: ', mean_squared_error(y_test,y_pred))
    print('Mean Absolute Error is: ', mean_absolute_error(y_test,y_pred))
    print('Root Mean Squared Error is: ',np.sqrt(mean_squared_error(y_test,y_pred)))
    print('Accuracy is: {} %'.format(round((r2_score(y_test, y_pred))*100,3)))

In [19]:
print_metrics(y_test,y_pred)

Mean Squared Error is:  1.6515546396751317
Mean Absolute Error is:  0.15570640644887568
Root Mean Squared Error is:  1.2851282580642025
Accuracy is: 98.871 %


###### Hyperparameter Tuning 

In [18]:
param_grid={
    'max_depth':[2,4,6,8,10],
    'criterion':['mse', 'mae'],
#    'min_samples_split':[0.25, 0.5, 1.0],
#    'max_features':[0.25, 0.5, 1.0]
}

In [19]:
decision_tree_regressor_best_model = GridSearchCV(DecisionTreeRegressor(), param_grid=param_grid)

decision_tree_regressor_best_model.fit(X_train,y_train)

In [22]:
decision_tree_regressor_best_model.best_params_

{'criterion': 'mse', 'max_depth': 10}

In [20]:
Decision_Tree_Regressor_Best_Model = DecisionTreeRegressor(criterion='mse', max_depth=10)

In [21]:
Decision_Tree_Regressor_Best_Model.fit(X_train,y_train)

In [22]:
y_pred_tunned = Decision_Tree_Regressor_Best_Model.predict(X_test)

In [23]:
print_metrics(y_test,y_pred_tunned)

Mean Squared Error is:  1.6515546396751317
Mean Absolute Error is:  0.15570640644887568
Root Mean Squared Error is:  1.2851282580642025
Accuracy is: 98.871 %


# Thank You!