# Gathering data

In [None]:
import zipfile
from io import BytesIO
from urllib.request import urlopen
import glob
import os

In [None]:
def reading_file_data_from_zip_url(URL, FILENAME):
  """Download a zip archive and return the raw bytes of one of its members.

  The whole archive is fetched into memory (nothing is written to disk) and
  opened with ``zipfile``; only the requested member is decompressed.

  Args:
    URL: Location of the zip archive; anything ``urllib.request.urlopen``
      accepts.
    FILENAME: Name of the member inside the archive to extract.

  Returns:
    bytes: The uncompressed content of ``FILENAME``.

  Raises:
    urllib.error.URLError: If the archive cannot be fetched.
    zipfile.BadZipFile: If the response body is not a valid zip archive.
    KeyError: If ``FILENAME`` is not present in the archive.
  """
  with urlopen(URL) as response:
    # ZipFile needs a seekable file object, so buffer the full response first.
    archive_bytes = BytesIO(response.read())
  with zipfile.ZipFile(archive_bytes, 'r') as zip_ref:
    return zip_ref.read(FILENAME)

In [None]:
ZIP_FILE_PATH = "https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip"
RAW_FILENAME = "household_power_consumption.txt"
rawdata = reading_file_data_from_zip_url(ZIP_FILE_PATH, RAW_FILENAME)

FILENAME = 'temp.csv'
# Write the downloaded bytes straight to disk. Decoding to str and calling
# writelines() would iterate the text one character at a time, and a text-mode
# file would re-encode with the platform default encoding and newline rules.
with open(FILENAME, 'wb') as fptr:
  fptr.write(rawdata)

# Data Ingestion

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Parse the semicolon-delimited dump; '?' marks a missing reading.
df = pd.read_csv(
    FILENAME,
    sep=';',
    na_values=['?'],
)

In [None]:
# Remove the temporary CSV now that its contents are loaded into df.

os.remove(FILENAME)

In [None]:
# Row and column counts of the full table as loaded, before any sampling.
df.shape

(2075259, 9)

In [None]:
# Work on a random 5% of the rows to keep the notebook fast. The seed is fixed
# so every statistic and model score below can be reproduced on a re-run.
df = df.sample(frac=0.05, random_state=0)

# EDA

In [None]:
# Column dtypes and non-null counts for the sampled frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103763 entries, 1454623 to 1978057
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Date                   103763 non-null  object 
 1   Time                   103763 non-null  object 
 2   Global_active_power    102500 non-null  float64
 3   Global_reactive_power  102500 non-null  float64
 4   Voltage                102500 non-null  float64
 5   Global_intensity       102500 non-null  float64
 6   Sub_metering_1         102500 non-null  float64
 7   Sub_metering_2         102500 non-null  float64
 8   Sub_metering_3         102500 non-null  float64
dtypes: float64(7), object(2)
memory usage: 7.9+ MB


In [None]:
# Count missing values per column.
df.isna().sum()

Date                        0
Time                        0
Global_active_power      1263
Global_reactive_power    1263
Voltage                  1263
Global_intensity         1263
Sub_metering_1           1263
Sub_metering_2           1263
Sub_metering_3           1263
dtype: int64

In [None]:
# Discard every row that has at least one missing measurement.
df = df.dropna()

In [None]:
# Shape after dropping the rows that contained missing values.
df.shape

(102500, 9)

In [None]:
# Regression target: the three sub-meter readings added together, row by row.
df['Total_Sub_metering'] = (
    df['Sub_metering_1']
    .add(df['Sub_metering_2'])
    .add(df['Sub_metering_3'])
)

In [None]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Total_Sub_metering
count,102500.0,102500.0,102500.0,102500.0,102500.0,102500.0,102500.0,102500.0
mean,1.085921,0.123673,240.843245,4.604425,1.091766,1.274302,6.422498,8.788566
std,1.050701,0.112544,3.2465,4.416879,6.086809,5.719327,8.419594,12.784714
min,0.078,0.0,224.24,0.2,0.0,0.0,0.0,0.0
25%,0.308,0.048,238.99,1.4,0.0,0.0,0.0,0.0
50%,0.596,0.1,241.01,2.6,0.0,0.0,1.0,1.0
75%,1.526,0.194,242.89,6.4,0.0,1.0,17.0,18.0
max,9.482,1.13,254.15,41.2,84.0,76.0,31.0,127.0


In [None]:
# Pairwise correlations between the numeric columns only. Date and Time are
# still strings at this point, and recent pandas releases raise on them instead
# of silently dropping non-numeric columns inside DataFrame.corr().
df.select_dtypes(include='number').corr()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Total_Sub_metering
Global_active_power,1.0,0.249362,-0.398465,0.99887,0.483174,0.433916,0.64104,0.846323
Global_reactive_power,0.249362,1.0,-0.113019,0.268565,0.130598,0.139466,0.087428,0.182146
Voltage,-0.398465,-0.113019,1.0,-0.410149,-0.195222,-0.163648,-0.270423,-0.344246
Global_intensity,0.99887,0.268565,-0.410149,1.0,0.488328,0.439572,0.628857,0.843283
Sub_metering_1,0.483174,0.130598,-0.195222,0.488328,1.0,0.057042,0.105098,0.570833
Sub_metering_2,0.433916,0.139466,-0.163648,0.439572,0.057042,1.0,0.083644,0.5296
Sub_metering_3,0.64104,0.087428,-0.270423,0.628857,0.105098,0.083644,1.0,0.746023
Total_Sub_metering,0.846323,0.182146,-0.344246,0.843283,0.570833,0.5296,0.746023,1.0


In [None]:
# The UCI dataset description gives the Date column as dd/mm/yyyy. Without an
# explicit format pandas reads ambiguous values such as 1/2/2007 month-first,
# which silently corrupts the Month feature derived from this column.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

In [None]:
# Calendar month taken from the parsed Date column.
df['Month'] = df['Date'].dt.month

In [None]:
# Calendar year taken from the parsed Date column.
df['Year'] = df['Date'].dt.year

In [None]:
# Re-inspect dtypes now that Date is parsed and the derived columns exist.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102500 entries, 1454623 to 1978057
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   Date                   102500 non-null  datetime64[ns]
 1   Time                   102500 non-null  object        
 2   Global_active_power    102500 non-null  float64       
 3   Global_reactive_power  102500 non-null  float64       
 4   Voltage                102500 non-null  float64       
 5   Global_intensity       102500 non-null  float64       
 6   Sub_metering_1         102500 non-null  float64       
 7   Sub_metering_2         102500 non-null  float64       
 8   Sub_metering_3         102500 non-null  float64       
 9   Total_Sub_metering     102500 non-null  float64       
 10  Year                   102500 non-null  int64         
 11  Month                  102500 non-null  int64         
dtypes: datetime64[ns](1), float64(8), int

In [None]:
# Re-count missing values per column after the derived columns were added.
df.isna().sum()

Date                     0
Time                     0
Global_active_power      0
Global_reactive_power    0
Voltage                  0
Global_intensity         0
Sub_metering_1           0
Sub_metering_2           0
Sub_metering_3           0
Total_Sub_metering       0
Year                     0
Month                    0
dtype: int64

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

# Feature engineering

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Model on a random 20% of the cleaned rows. The seed is fixed so the
# train/test split and every reported score are reproducible.
df_train_test = df.sample(frac=0.2, random_state=0)

In [None]:
# List the available columns before choosing the model features.
df_train_test.columns

Index(['Date', 'Time', 'Global_active_power', 'Global_reactive_power',
       'Voltage', 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2',
       'Sub_metering_3', 'Total_Sub_metering', 'Year', 'Month'],
      dtype='object')

In [None]:
# Predictor columns fed to every model.
independent_feature = [
    'Year',
    'Month',
    'Global_active_power',
    'Global_reactive_power',
    'Voltage',
    'Global_intensity',
]
# Target column, kept in a list so it can be used as a column selector.
dependent_feature = [
    'Total_Sub_metering',
]

In [None]:
# Feature matrix: just the predictor columns, in the order listed.
X = df_train_test[independent_feature]

In [None]:
# Select the target as a 1-D Series rather than a one-column DataFrame:
# scikit-learn regressors expect y of shape (n_samples,) and emit a
# DataConversionWarning on every fit() when handed a column vector.
y = df_train_test.loc[:, dependent_feature].squeeze(axis="columns")

In [None]:
# Hold out 20% of the rows for evaluation; seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Show the train and test shapes for the features and the target.
print("X:", X_train.shape, X_test.shape)
print("y:", y_train.shape, y_test.shape)

X: (16400, 6) (4100, 6)
y: (16400, 1) (4100, 1)


# Bagging Regression

In [None]:
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor

In [None]:
# Bagging ensemble of 10 SVR models. The wrapped estimator is passed
# positionally because its keyword was renamed from `base_estimator` to
# `estimator` in scikit-learn 1.2 and the old name was later removed; the
# positional form works on both old and new releases.
# NOTE(review): the recorded test score for this model is negative. SVR on
# unscaled inputs (Year and Voltage are far larger than the power columns) is
# a likely cause; consider wrapping SVR in a scaling pipeline.
b_g = BaggingRegressor(SVR(), n_estimators=10, random_state=0)

In [None]:
# Fit the bagging ensemble on the training split.
b_g.fit(X_train, y_train)

  return column_or_1d(y, warn=True)


BaggingRegressor(base_estimator=SVR(), random_state=0)

In [None]:
# Score the fitted bagging model on the held-out test split.
b_g.score(X_test, y_test)

-0.34368867942758996

In [None]:
# Test-split predictions, kept for the error metrics printed afterwards.
y_predict_b_g = b_g.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Mean squared and mean absolute error of the bagging predictions,
# printed to two decimal places.
print("MSE: %.2f" % mean_squared_error(y_test, y_predict_b_g))
print("MAE: %.2f" % mean_absolute_error(y_test, y_predict_b_g))

MSE: 232.42
MAE: 8.60


# Extra Trees Regression

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

In [None]:
# Extra-trees ensemble with 10 trees and a fixed seed.
e_t_r = ExtraTreesRegressor(n_estimators=10, random_state=0)

In [None]:
# Fit the extra-trees model on the training split.
e_t_r.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


ExtraTreesRegressor(n_estimators=10, random_state=0)

In [None]:
# Score the fitted extra-trees model on the held-out test split.
e_t_r.score(X_test, y_test)

0.7660526543428826

In [None]:
# Test-split predictions, kept for the error metrics printed afterwards.
y_predict_e_t_r = e_t_r.predict(X_test)

In [None]:
# Mean squared and mean absolute error of the extra-trees predictions,
# printed to two decimal places.
print("MSE: %.2f" % mean_squared_error(y_test, y_predict_e_t_r))
print("MAE: %.2f" % mean_absolute_error(y_test, y_predict_e_t_r))

MSE: 40.47
MAE: 3.10


# Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with 10 trees and a fixed seed.
r_f_r = RandomForestRegressor(n_estimators=10, random_state=0)

In [None]:
# Fit the random forest on the training split.
r_f_r.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(n_estimators=10, random_state=0)

In [None]:
# Score the fitted random forest on the held-out test split.
r_f_r.score(X_test, y_test)

0.7717904033524223

In [None]:
# Test-split predictions, kept for the error metrics printed afterwards.
y_predict_r_f_r = r_f_r.predict(X_test)

In [None]:
# Mean squared and mean absolute error of the random-forest predictions,
# printed to two decimal places.
print("MSE: %.2f" % mean_squared_error(y_test, y_predict_r_f_r))
print("MAE: %.2f" % mean_absolute_error(y_test, y_predict_r_f_r))

MSE: 39.47
MAE: 3.08


# Stacking Regressor

In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR

In [None]:
# Base learners for the stack, given as (label, estimator) pairs.
estimators = [
    ('lr', RidgeCV()),
    ('svr', LinearSVR(random_state=42))
]

In [None]:
# Stack the base learners and use a 10-tree random forest as the final
# estimator that combines their outputs.
s_r = StackingRegressor(
    estimators=estimators, 
    final_estimator=RandomForestRegressor(n_estimators=10, random_state=42)
)

In [None]:
# Fit the stacked model on the training split.
s_r.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


StackingRegressor(estimators=[('lr', RidgeCV(alphas=array([ 0.1,  1. , 10. ]))),
                              ('svr', LinearSVR(random_state=42))],
                  final_estimator=RandomForestRegressor(n_estimators=10,
                                                        random_state=42))

In [None]:
# Score the fitted stacked model on the held-out test split.
s_r.score(X_test, y_test)

0.6970710895803988

In [None]:
# Test-split predictions, kept for the error metrics printed afterwards.
y_predict_s_r = s_r.predict(X_test)

In [None]:
# Mean squared and mean absolute error of the stacked-model predictions,
# printed to two decimal places.
print("MSE: %.2f" % mean_squared_error(y_test, y_predict_s_r))
print("MAE: %.2f" % mean_absolute_error(y_test, y_predict_s_r))

MSE: 52.40
MAE: 4.06


# Voting Regression

In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import ElasticNet

In [None]:
# Members of the voting ensemble, given as (label, estimator) pairs. The SVR
# entry is labelled 'svr' (it was mislabelled 'lr') so that the fitted model's
# repr and named_estimators_ identify it correctly.
v_estimators = [
    ('er', ElasticNet()),
    ('svr', SVR()),
    ('rfr', RandomForestRegressor(n_estimators=10, random_state=1))
]

In [None]:
# Combine the regressors listed in v_estimators into one voting ensemble.
v_r = VotingRegressor(v_estimators)

In [None]:
# Fit the voting ensemble on the training split.
v_r.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


VotingRegressor(estimators=[('er', ElasticNet()), ('lr', SVR()),
                            ('rfr',
                             RandomForestRegressor(n_estimators=10,
                                                   random_state=1))])

In [None]:
# Score the fitted voting ensemble on the held-out test split.
v_r.score(X_test, y_test)

0.6360585718881523

In [None]:
# Test-split predictions, kept for the error metrics printed afterwards.
y_predict_v_r = v_r.predict(X_test)

In [None]:
# Mean squared and mean absolute error of the voting-ensemble predictions,
# printed to two decimal places.
print("MSE: %.2f" % mean_squared_error(y_test, y_predict_v_r))
print("MAE: %.2f" % mean_absolute_error(y_test, y_predict_v_r))

MSE: 62.95
MAE: 4.76


**==========THE END==========**