# <center><span style='color:green'> Household Electricity Consumption </span></center>

## Problem Statement

* Predict the household electricity consumption
* Regression problem

## 1 <span style='color:red'>|</span> Import Libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

#for modeling
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score , r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingRegressor , RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor

#database
import pymongo

#save the model
import pickle

#to read data
import zipfile

## 2 <span style='color:red'>|</span> Load Dataset

In [None]:
# Read the semicolon-delimited consumption file straight out of the zip archive.
zip_file_path = "cosumption_data.zip"
file_name = "household_power_consumption.txt"

with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
    file_list = zip_file.namelist()
    # Fail loudly when the expected member is absent; scanning the name list and
    # doing nothing on a miss would leave `df` undefined for every later cell.
    if file_name not in file_list:
        raise FileNotFoundError(f"{file_name!r} not found in archive {zip_file_path!r}")
    with zip_file.open(file_name) as csv_file:
        # low_memory=False parses each column in a single pass, so dtype
        # inference is not done chunk by chunk (avoids mixed types in a column).
        df = pd.read_csv(csv_file, delimiter=';', low_memory=False)

In [None]:
# Preview the first rows of the freshly loaded frame
df.head()

In [None]:
# (rows, columns) of the full dataset
df.shape

* Taking a random sample of 60000 rows because the full dataset is too large

In [None]:
# Draw a fixed-size random subset. The seed makes the sample, and every count
# derived from it further down, reproducible after a kernel restart.
df_sample = df.sample(60000, random_state=42)
df_sample.shape

## 3 <span style='color:red'>|</span> Exploratory Data Analysis

In [None]:
# Column labels of the sampled frame
df_sample.columns

In [None]:
# Discard the two timestamp columns and rebind the name to the reduced frame
df_sample = df_sample.drop(columns=['Date', 'Time'])

In [None]:
# Preview the sample after the Date/Time columns were dropped
df_sample.head()

In [None]:
# Show 10 randomly chosen rows (unseeded, so the rows differ on every run)
df_sample.sample(10)

In [None]:
# Data type of each column before any conversion
df_sample.dtypes

#### 3.1 <span style='color:red'>|</span> Check for special characters and handle them

In [None]:
# True when at least one entry is not a plain non-negative decimal number.
# A single '.' is removed before testing because str.isnumeric() rejects the
# decimal point and would otherwise flag every fractional reading; str() keeps
# the check from raising on entries that are not strings.
character_check = any(
    not str(value).replace('.', '', 1).isnumeric()
    for value in df_sample['Global_active_power']
)
character_check

In [None]:
# Gather every entry that is not a decimal number, then show the distinct ones
character = []
for entry in df_sample['Global_active_power']:
    digits_only = str(entry).replace('.','',1)
    if not digits_only.isnumeric():
        character.append(entry)
set(character)

In [None]:
#Check character in dataframe
special_char = df_sample[df_sample['Global_active_power']=='?']
special_char

* 737 of the 60000 sampled rows contain a special character, and it appears in all 7 columns

In [None]:
# Percentage of sampled rows that carry the special character, computed from
# the data instead of hard-coded counts so it stays correct for any sample.
(len(special_char) / len(df_sample)) * 100

In [None]:
#drop the data having special character
print("Data before removal of special character :", df_sample.shape)
df_sample.drop(special_char.index, axis=0, inplace=True)
print("Data after removal of special character :", df_sample.shape)

#### 3.2 <span style='color:red'>|</span> Check duplicates

In [None]:
# Number of rows that repeat an earlier row across all columns
df_sample.duplicated().sum()

* 265 duplicate records found

In [None]:
# Drop repeated rows and report the shape before and after.
print("Data with duplicate records :", df_sample.shape)
# Rebinding instead of inplace=True keeps the step explicit about its result.
df_sample = df_sample.drop_duplicates()
print("Data without duplicate records :", df_sample.shape)

#### 3.3 <span style='color:red'>|</span> Check null values

In [None]:
# Count of null entries in each column
df_sample.isnull().sum()

* No null values in dataset

#### 3.4 <span style='color:red'>|</span> Convert dtype of columns

In [None]:
# Cast every remaining column to float
df_sample = df_sample.astype(float)

In [None]:
# Confirm the column types after the cast
df_sample.dtypes

#### 3.5 <span style='color:red'>|</span> Combining Features

In [None]:
# Total sub-metered energy: the sum of all three sub-meter readings.
# (Sub_metering_3 must be included; adding Sub_metering_2 twice double-counts it.)
df_sample['meter'] = (
    df_sample['Sub_metering_1']
    + df_sample['Sub_metering_2']
    + df_sample['Sub_metering_3']
)

In [None]:
# The three sub-meter columns are now folded into 'meter'; remove them
sub_meter_columns = ['Sub_metering_1','Sub_metering_2','Sub_metering_3']
df_sample = df_sample.drop(columns=sub_meter_columns)

In [None]:
# Column labels after the sub-meter columns were replaced by 'meter'
df_sample.columns

In [None]:
# Preview the frame with the combined 'meter' column
df_sample.head()

In [None]:
# Summary statistics, transposed so each column becomes a row
df_sample.describe().T

#### 3.6 <span style='color:red'>|</span> Data Distribution

In [None]:
#Data Distribution in each column
plt.figure(figsize=(20,20), facecolor='white')
plotnum =1

for col in df_sample.columns:
    if plotnum <= 5:
        ax = plt.subplot(3,2,plotnum)
        sns.distplot(df_sample[col], bins=30)
        plt.xlabel(col, fontsize=16)
    plotnum += 1
plt.show()

#### 3.7 <span style='color:red'>|</span> Check & Handle outliers

In [None]:
# Boxplot of the first five columns, laid out on a 2x3 grid
plt.figure(figsize=(20,10))

for position, column in enumerate(df_sample.columns[:5], start=1):
    plt.subplot(2, 3, position)
    sns.boxplot(df_sample[column])
    plt.xlabel(column, fontsize=16)
plt.show()

In [None]:
#Handling Outlier

df1 = df_sample.copy()

for col in df1.columns:
    iqr = df1[col].quantile(0.75) - df1[col].quantile(0.25)
    lower_limit = df1[col].quantile(0.25) - (1.5*iqr)
    upper_limit = df1[col].quantile(0.75) + (1.5*iqr)
    
    df1[col] = np.where(df1[col]>upper_limit,upper_limit,np.where(df1[col]<lower_limit,lower_limit,df1[col]))

In [None]:
#compare the distribution across quantile

for col in df1.columns:
    
    plt.figure(figsize=(16,4))
    plt.subplot(121)
    sns.boxplot(df_sample[col])
    plt.xlabel(f"Boxplot of {col} before outliers handling", fontsize=14, fontweight='bold')
    
    plt.subplot(122)
    sns.boxplot(df1[col])
    plt.xlabel(f"Boxplot of {col} after outliers handling", fontsize=14, fontweight='bold')

## 4 <span style='color:red'>|</span> Model Building

In [None]:
# Target is the 'meter' column; every other column is a feature
y = df1['meter']
x = df1.drop(columns=['meter'])

In [None]:
# Shapes of the feature matrix and the target
x.shape,y.shape

#### 4.1 <span style='color:red'>|</span> Train Test Split Dataset

In [None]:
# Hold out 20% of the rows for testing; the seed fixes the partition
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=42,
)

#### 4.2 <span style='color:red'>|</span> Standardization

In [None]:
# Standardize the features using statistics learned from the training set only.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
# Reuse the training mean/std on the test set. Calling fit_transform here would
# re-fit the scaler on test data and scale the two sets inconsistently.
x_test = scaler.transform(x_test)

#### 4.3 <span style='color:red'>|</span> Modeling

In [None]:
# Accumulates one metrics dict per model; the evaluation loop appends to it
report = []

In [None]:
# Candidate regressors keyed by display name. The tree-based models are seeded
# so their metrics are reproducible from run to run (same seed as the split).
models = {
    "Linear Regression" : LinearRegression(),
    "Ridge Regression" : Ridge(),
    "Lasso Regression" : Lasso(),
    "Support Vector Regression" : SVR(),
    "Decision Tree Regressor" : DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor" : RandomForestRegressor(random_state=42)
}


In [None]:
def _adjusted_r2(r2, n_samples, n_features):
    """Return R^2 adjusted for the number of predictors.

    Computes 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1).
    """
    return 1-(((1-r2)*(n_samples-1))/(n_samples-n_features-1))


# Start from an empty list so re-running this cell does not append a second
# copy of every model's metrics.
report = []

# Fit each candidate on the training split and record train/test metrics.
for model_name, model in models.items():
    model.fit(x_train,y_train)

    #prediction
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    #Training data performance
    train_mse = mean_squared_error(y_train, y_train_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    train_adj_r2 = _adjusted_r2(train_r2, len(y_train), x_train.shape[1])

    #Test data performance
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    test_adj_r2 = _adjusted_r2(test_r2, len(y_test), x_test.shape[1])

    report.append({"Model": model_name,
                 "Train MSE" : train_mse,
                 "Test MSE" : test_mse,
                 "Train MAE" : train_mae,
                 "Test MAE" : test_mae,
                 "Train R2" : train_r2,
                 "Test R2" : test_r2,
                 "Train Adj R2" : train_adj_r2,
                 "Test Adj R2" : test_adj_r2})

In [None]:
# One row per model, one column per recorded metric
all_models = pd.DataFrame(report)
all_models

#### 4.4 <span style='color:red'>|</span> Hyperparameter Tuning

In [None]:
# Search space for the randomized hyperparameter search.
# max_features uses 1.0 (all features) in place of 'auto': for regressors 'auto'
# meant "all features", and it was deprecated in scikit-learn 1.1 and removed in
# 1.3, where it is rejected as an invalid parameter.
rand_params = {'max_depth':[5,8,10,15,None],
              'max_features':[3,1.0],
              'min_samples_split':[2,8,15,20],
              'n_estimators':[50,100,200,500]}