## We will work on our data in this section

In [14]:
# Import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gdown #to access the google drive file

In [15]:
# Import dataset
import os

# Location of the raw CSV. The default is the original author's local path;
# set the ELECTRICITY_BILL_CSV environment variable to point at your own copy
# so the notebook can run on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "ELECTRICITY_BILL_CSV",
    r"E:\Projects\ML Projects\Electrecity_Bill_Prediction_EndToEnd_Projects\data\electricity_bill_dataset.csv",
)

# Read the CSV into the working frame and preview the first rows
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Fan,Refrigerator,AirConditioner,Television,Monitor,MotorPump,Month,City,Company,MonthlyHours,TariffRate,ElectricityBill
0,16,23.0,2.0,6.0,1.0,0,10,Hyderabad,Tata Power Company Ltd.,384,8.4,3225.6
1,19,22.0,2.0,3.0,1.0,0,5,Vadodara,NHPC,488,7.8,3806.4
2,7,20.0,2.0,6.0,7.0,0,7,Shimla,Jyoti Structure,416,7.7,3203.2
3,7,22.0,3.0,21.0,1.0,0,6,Mumbai,Power Grid Corp,475,9.2,4370.0
4,11,23.0,2.0,11.0,1.0,0,2,Mumbai,Ratnagiri Gas and Power Pvt. Ltd. (RGPPL),457,9.2,4204.4


### Exploratory Data Analysis

In [16]:
# Summarise each column's dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45345 entries, 0 to 45344
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Fan              45345 non-null  int64  
 1   Refrigerator     45345 non-null  float64
 2   AirConditioner   45345 non-null  float64
 3   Television       45345 non-null  float64
 4   Monitor          45345 non-null  float64
 5   MotorPump        45345 non-null  int64  
 6   Month            45345 non-null  int64  
 7   City             45345 non-null  object 
 8   Company          45345 non-null  object 
 9   MonthlyHours     45345 non-null  int64  
 10  TariffRate       45345 non-null  float64
 11  ElectricityBill  45345 non-null  float64
dtypes: float64(6), int64(4), object(2)
memory usage: 4.2+ MB


In [17]:
# Human-readable labels for the columns whose names are run together
column_names = dict(
    [
        ('AirConditioner', 'Air Conditioner'),
        ('MotorPump', 'Motor Pump'),
        ('MonthlyHours', 'Monthly Hours'),
        ('TariffRate', 'Tariff Rate'),
        ('ElectricityBill', 'Electricity Bill'),
    ]
)

# Apply the mapping along the column axis; names not listed are left as they are
df = df.rename(column_names, axis='columns')

# Preview the relabelled frame
df.head()

Unnamed: 0,Fan,Refrigerator,Air Conditioner,Television,Monitor,Motor Pump,Month,City,Company,Monthly Hours,Tariff Rate,Electricity Bill
0,16,23.0,2.0,6.0,1.0,0,10,Hyderabad,Tata Power Company Ltd.,384,8.4,3225.6
1,19,22.0,2.0,3.0,1.0,0,5,Vadodara,NHPC,488,7.8,3806.4
2,7,20.0,2.0,6.0,7.0,0,7,Shimla,Jyoti Structure,416,7.7,3203.2
3,7,22.0,3.0,21.0,1.0,0,6,Mumbai,Power Grid Corp,475,9.2,4370.0
4,11,23.0,2.0,11.0,1.0,0,2,Mumbai,Ratnagiri Gas and Power Pvt. Ltd. (RGPPL),457,9.2,4204.4


In [18]:
# df.to_csv('Home Electricity Bill Prediction.csv', index=False)

In [19]:
# Report how often each value occurs in every object-dtype (text) column
for col in df.columns:
    if df[col].dtype != 'object':
        continue  # not an object column: nothing categorical to count
    print(f"Column '{col}' categorical values count:")
    print(df[col].value_counts())
    print('===========================================')

Column 'City' categorical values count:
City
Hyderabad      2914
Navi Mumbai    2914
Ratnagiri      2892
Faridabad      2891
Gurgaon        2878
Ahmedabad      2868
New Delhi      2848
Mumbai         2846
Chennai        2834
Dahej          2821
Nagpur         2816
Noida          2793
Pune           2788
Shimla         2780
Kolkata        2740
Vadodara       2722
Name: count, dtype: int64
Column 'Company' categorical values count:
Company
Ringfeder Power Transmission India Pvt. Ltd.                          1486
JSW Energy Ltd.                                                       1477
Guj Ind Power                                                         1476
SJVN Ltd.                                                             1471
Maha Transco – Maharashtra State Electricity Transmission Co, Ltd.    1470
NTPC Pvt. Ltd.                                                        1466
Optibelt Power Transmission India Private Limited                     1463
Kalpataru Power                  

The `City` and `Company` columns hold categorical (text) values, so we need to convert them to numbers with an encoding technique.

### One hot Encoding 

In [20]:
from sklearn.preprocessing import OneHotEncoder

# Text columns that must be converted to numeric indicator columns
categorical_columns = ['City', 'Company']

# Work on a copy so the encoding steps read from an untouched frame
data = df.copy()

# sparse_output=False produces a dense array; drop='first' drops the first
# category of each variable
encoder = OneHotEncoder(sparse_output=False, drop='first')

# Fit once on all categorical columns together, so this single fitted encoder
# covers every column (refitting per column would keep only the last fit)
one_hot_encoded = encoder.fit_transform(data[categorical_columns])

# Build the indicator frame on the SAME index as `data`; without index= the
# new frame gets a fresh 0..n-1 index and the concat below would misalign rows
# (introducing NaNs) whenever `data` does not have a default RangeIndex
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=data.index,
)

# Swap the original text columns for their indicator columns; the result keeps
# the remaining columns first, followed by the City_* and then Company_* columns
data = pd.concat([data.drop(columns=categorical_columns), one_hot_df], axis=1)

# Display the resulting DataFrame
df = data.copy()

df.head()

Unnamed: 0,Fan,Refrigerator,Air Conditioner,Television,Monitor,Motor Pump,Month,Monthly Hours,Tariff Rate,Electricity Bill,...,Company_Reliance Power,Company_Ringfeder Power Transmission India Pvt. Ltd.,Company_SJVN Ltd.,Company_Sterlite Power Transmission Ltd,Company_Sunil Hitech Eng,Company_Tata Power Company Ltd.,Company_Torrent Power Ltd.,Company_Toshiba Transmission & Distribution Systems (India) Pvt. Ltd.,Company_TransRail Lighting,Company_Unitech Power Transmission Ltd.
0,16,23.0,2.0,6.0,1.0,0,10,384,8.4,3225.6,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,19,22.0,2.0,3.0,1.0,0,5,488,7.8,3806.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,20.0,2.0,6.0,7.0,0,7,416,7.7,3203.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7,22.0,3.0,21.0,1.0,0,6,475,9.2,4370.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11,23.0,2.0,11.0,1.0,0,2,457,9.2,4204.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Re-inspect the frame after one-hot encoding: column list, dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45345 entries, 0 to 45344
Data columns (total 56 columns):
 #   Column                                                                      Non-Null Count  Dtype  
---  ------                                                                      --------------  -----  
 0   Fan                                                                         45345 non-null  int64  
 1   Refrigerator                                                                45345 non-null  float64
 2   Air Conditioner                                                             45345 non-null  float64
 3   Television                                                                  45345 non-null  float64
 4   Monitor                                                                     45345 non-null  float64
 5   Motor Pump                                                                  45345 non-null  int64  
 6   Month                                         

## Import Necessary libraries for building the model 

In [22]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from xgboost.sklearn import XGBRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Evaluation 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [23]:
# Separate the predictors from the response

# Discard any row that still contains a missing value
df = df.dropna()

# Response: "Electricity Bill" is the quantity we want to predict
y = df['Electricity Bill']

# Predictors: every other column acts as an independent variable
X = df.drop(columns=['Electricity Bill'])

In [24]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

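To make our models more robust and efficient, we standardize the training and testing features so that differing feature scales do not bias the model and so that convergence improves. (Tree-based models such as the ones fitted below are largely insensitive to feature scaling, so this step matters mainly for scale-sensitive estimators.)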

In [25]:
# Learn the scaling statistics from the training rows only, so that no
# information from the test rows leaks into the fit
scaler = StandardScaler().fit(X_train)

# Apply the training-set statistics to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Let's fit the models

### XGBRegressor

In [26]:

def model(X_train_scaled, y_train, X_test_scaled, y_test):
    """Train a default XGBRegressor and score it on the held-out data.

    Parameters
    ----------
    X_train_scaled, y_train
        Features and target used to fit the regressor.
    X_test_scaled, y_test
        Features and target used for evaluation.

    Returns
    -------
    tuple
        ``(MAE, R2, MAPE in percent, MSE, RMSE)`` computed on the test data.
    """
    regressor = XGBRegressor()
    regressor.fit(X_train_scaled, y_train)
    predictions = regressor.predict(X_test_scaled)

    # Absolute error of each prediction as a fraction of the true value
    relative_errors = np.abs((y_test - predictions) / y_test)
    squared_error_mean = mean_squared_error(y_test, predictions)

    return (
        mean_absolute_error(y_test, predictions),
        r2_score(y_test, predictions),
        np.mean(relative_errors) * 100,
        squared_error_mean,
        np.sqrt(squared_error_mean),
    )

# Fit and score using the scaled train/test splits prepared earlier
scores = model(X_train_scaled, y_train, X_test_scaled, y_test)
mae_test, r2_test, mape_test, mse_test, rmse_test = scores

# One "<name>: <value>" line per metric, in the order they are returned
for label, value in zip(("MAE", "R2", "MAPE", "MSE", "RMSE"), scores):
    print(f"{label}:", value)

MAE: 8.862415354353605
R2: 0.9996626012816993
MAPE: 0.24498406860118285
MSE: 384.43850651625604
RMSE: 19.607103470840766


#### Interpretations: 
* Mean Absolute Error (MAE): 8.862415354353605

    * MAE represents the average absolute difference between the actual and predicted values. In this case, the average absolute difference between the actual and predicted electricity bills is approximately $8.86.

* R-squared (R2): 0.9996626012816993

    * R2 score is a measure of how well the model explains the variability of the target variable. Its maximum is 1, which indicates a perfect fit; 0 corresponds to always predicting the mean, and on test data it can even be negative for a model that does worse than that. Here, the model has a very high R2 score of approximately 0.9997, indicating that it explains about 99.97% of the variance in the electricity bill.

* Mean Absolute Percentage Error (MAPE): 0.24498406860118285%

    * MAPE measures the average percentage difference between the actual and predicted values relative to the actual values. In this case, the average percentage difference is approximately 0.245%.

* Mean Squared Error (MSE): 384.43850651625604

    * MSE represents the average of the squares of the errors between the actual and predicted values. It is a measure of the average squared difference between the estimated values and the actual value.

* Root Mean Squared Error (RMSE): 19.607103470840766

    * RMSE is the square root of the MSE, which provides an absolute measure of fit in the same units as the target. It approximates the standard deviation of the residuals (prediction errors). Here, the RMSE is approximately 19.61, indicating that a typical prediction error is around $19.61; because errors are squared before averaging, RMSE weights large errors more heavily than MAE, which is why it is larger than the MAE of 8.86.

* Overall, these metrics suggest that the XGBoost model has performed exceptionally well on the testing data, demonstrating high accuracy and predictive power. However, it's essential to consider the context of your specific problem domain and any additional business requirements when interpreting these results.

### CatBoostRegressor

In [27]:
def model(X_train_scaled, y_train, X_test_scaled, y_test):
    """Fit a CatBoostRegressor and report its error metrics on the test data.

    Parameters
    ----------
    X_train_scaled, y_train
        Features and target used to fit the regressor.
    X_test_scaled, y_test
        Features and target used for evaluation.

    Returns
    -------
    tuple
        ``(MAE, R2, MAPE in percent, MSE, RMSE)`` computed on the test data.
    """
    # verbose=0 is passed through unchanged from the original configuration
    estimator = CatBoostRegressor(verbose=0)
    estimator.fit(X_train_scaled, y_train)

    y_hat = estimator.predict(X_test_scaled)

    # MSE is computed first because RMSE is derived from it
    mse = mean_squared_error(y_test, y_hat)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_hat)
    r2 = r2_score(y_test, y_hat)
    mape = np.mean(np.abs((y_test - y_hat) / y_test)) * 100

    return mae, r2, mape, mse, rmse

# Fit and score using the scaled train/test splits prepared earlier
results = model(X_train_scaled, y_train, X_test_scaled, y_test)
mae_test, r2_test, mape_test, mse_test, rmse_test = results

# One "<name>: <value>" line per metric, in the order they are returned
for metric_name, metric_value in zip(("MAE", "R2", "MAPE", "MSE", "RMSE"), results):
    print(f"{metric_name}:", metric_value)

MAE: 7.55330228523507
R2: 0.9997555699306967
MAPE: 0.20738048954744792
MSE: 278.5082624612946
RMSE: 16.688566818672435


#### Interpretations:

* Mean Absolute Error (MAE): 7.55330228523507

    * The MAE indicates that, on average, the model's predictions are approximately $7.55 away from the actual electricity bill values.

* R-squared (R2): 0.9997555699306967

    * The high R-squared value of approximately 0.9998 suggests that the model explains about 99.98% of the variance in the electricity bill, indicating an excellent fit to the data.

* Mean Absolute Percentage Error (MAPE): 0.20738048954744792%

    * The MAPE is low, indicating that, on average, the model's predictions deviate by only about 0.21% from the actual values.

* Mean Squared Error (MSE): 278.5082624612946

    * The MSE measures the average squared difference between the model's predictions and the actual values, with a value of approximately 278.51.

* Root Mean Squared Error (RMSE): 16.688566818672435

    * The RMSE, which is the square root of the MSE, is approximately 16.69. It represents the average magnitude of the errors in the model's predictions, suggesting that the typical error in predicting the electricity bill is around $16.69.

Overall, these metrics indicate that the CatBoostRegressor model performs exceptionally well in predicting electricity bills based on the provided features.

### RandomForestRegressor

In [28]:
def model(X_train_scaled, y_train, X_test_scaled, y_test, random_state=42):
    """Train a RandomForestRegressor and evaluate it on the test data.

    Parameters
    ----------
    X_train_scaled, y_train
        Features and target used to fit the forest.
    X_test_scaled, y_test
        Features and target used for evaluation.
    random_state : int, optional
        Seed forwarded to the forest. A random forest is a stochastic
        estimator, so without a fixed seed the metrics change from run to
        run and the figures quoted in the notebook cannot be reproduced.

    Returns
    -------
    tuple
        ``(MAE, R2, MAPE in percent, MSE, RMSE)`` computed on the test data.
    """
    # Seeded so that Restart & Run All yields the same scores every time
    rf_model = RandomForestRegressor(random_state=random_state)

    # Fit the model on the entire training data
    rf_model.fit(X_train_scaled, y_train)

    # Make predictions on the testing data
    y_pred = rf_model.predict(X_test_scaled)

    # Evaluate the model on the testing data
    mae_test = mean_absolute_error(y_test, y_pred)
    r2_test = r2_score(y_test, y_pred)
    # Mean absolute error relative to the true value, expressed in percent
    mape_test = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    mse_test = mean_squared_error(y_test, y_pred)
    rmse_test = np.sqrt(mse_test)

    # Return the evaluation metrics
    return mae_test, r2_test, mape_test, mse_test, rmse_test


# Fit and score using the scaled train/test splits prepared earlier
mae_test, r2_test, mape_test, mse_test, rmse_test = model(X_train_scaled, y_train, X_test_scaled, y_test)

print("MAE:", mae_test)
print("R2:", r2_test)
print("MAPE:", mape_test)
print("MSE:", mse_test)
print("RMSE:", rmse_test)

MAE: 1.3606772521785389
R2: 0.999973959988074
MAPE: 0.03933892607043768
MSE: 29.670484063733998
RMSE: 5.447061966210224


#### Interpretations:

The RandomForestRegressor model has demonstrated outstanding performance on the testing data:

* Mean Absolute Error (MAE): 1.3606772521785389

    * The MAE of approximately 1.36 indicates that, on average, the model's predictions are only about $1.36 away from the actual electricity bill values.

* R-squared (R2): 0.999973959988074

    * The exceptionally high R-squared value of approximately 0.99997 suggests that the model explains about 99.997% of the variance in the electricity bill, indicating an exceptional fit to the data.

* Mean Absolute Percentage Error (MAPE): 0.03933892607043768%

    * The MAPE is very low, indicating that, on average, the model's predictions deviate by only about 0.039% from the actual values.

* Mean Squared Error (MSE): 29.670484063733998

    * The MSE, which measures the average squared difference between the model's predictions and the actual values, is approximately 29.67.

* Root Mean Squared Error (RMSE): 5.447061966210224

    * The RMSE, the square root of the MSE, is approximately 5.45. This suggests that the typical error in predicting the electricity bill is around $5.45.

These metrics collectively indicate that the RandomForestRegressor model performs exceptionally well in predicting electricity bills based on the provided features. The model's predictions are very close to the actual values, and it explains the variance in the target variable almost perfectly.

### DecisionTreeRegressor

In [29]:

def model(X_train_scaled, y_train, X_test_scaled, y_test, random_state=42):
    """Train a DecisionTreeRegressor and evaluate it on the test data.

    Parameters
    ----------
    X_train_scaled, y_train
        Features and target used to fit the tree.
    X_test_scaled, y_test
        Features and target used for evaluation.
    random_state : int, optional
        Seed forwarded to the tree. Without a fixed seed the fitted tree, and
        therefore the reported metrics, can differ between runs, so the
        figures quoted in the notebook would not be reproducible.

    Returns
    -------
    tuple
        ``(MAE, R2, MAPE in percent, MSE, RMSE)`` computed on the test data.
    """
    # Seeded so that Restart & Run All yields the same scores every time
    dt_model = DecisionTreeRegressor(random_state=random_state)

    # Fit the model on the entire training data
    dt_model.fit(X_train_scaled, y_train)

    # Make predictions on the testing data
    y_pred = dt_model.predict(X_test_scaled)

    # Evaluate the model on the testing data
    mae_test = mean_absolute_error(y_test, y_pred)
    r2_test = r2_score(y_test, y_pred)
    # Mean absolute error relative to the true value, expressed in percent
    mape_test = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    mse_test = mean_squared_error(y_test, y_pred)
    rmse_test = np.sqrt(mse_test)

    # Return the evaluation metrics
    return mae_test, r2_test, mape_test, mse_test, rmse_test

# Fit and score using the scaled train/test splits prepared earlier
mae_test, r2_test, mape_test, mse_test, rmse_test = model(X_train_scaled, y_train, X_test_scaled, y_test)

print("MAE:", mae_test)
print("R2:", r2_test)
print("MAPE:", mape_test)
print("MSE:", mse_test)
print("RMSE:", rmse_test)

MAE: 1.8840445473592142
R2: 0.9999080093110466
MAPE: 0.05514304215579295
MSE: 104.8159378101226
RMSE: 10.237965511278235


#### Interpretations:
The DecisionTreeRegressor model has shown strong performance on the testing data:

* Mean Absolute Error (MAE): 1.8840445473592142

    * The MAE of approximately 1.88 indicates that, on average, the model's predictions are about $1.88 away from the actual electricity bill values.

* R-squared (R2): 0.9999080093110466

    * The high R-squared value of approximately 0.99991 suggests that the model explains about 99.991% of the variance in the electricity bill, indicating an excellent fit to the data.

* Mean Absolute Percentage Error (MAPE): 0.05514304215579295%

    * The MAPE is low, indicating that, on average, the model's predictions deviate by only about 0.055% from the actual values.

* Mean Squared Error (MSE): 104.8159378101226

    * The MSE, which measures the average squared difference between the model's predictions and the actual values, is approximately 104.82.

* Root Mean Squared Error (RMSE): 10.237965511278235

    * The RMSE, the square root of the MSE, is approximately 10.24. This suggests that the typical error in predicting the electricity bill is around $10.24.

Overall, these metrics indicate that the DecisionTreeRegressor model performs well in predicting electricity bills based on the provided features. The model's predictions are reasonably close to the actual values, and it explains the variance in the target variable effectively. However, compared to some other models like RandomForestRegressor, the DecisionTreeRegressor might have slightly higher errors.

### LGBMRegressor

In [30]:

def model(X_train_scaled, y_train, X_test_scaled, y_test):
    """Fit a default LGBMRegressor and measure its accuracy on the test data.

    Parameters
    ----------
    X_train_scaled, y_train
        Features and target used to fit the regressor.
    X_test_scaled, y_test
        Features and target used for evaluation.

    Returns
    -------
    tuple
        ``(MAE, R2, MAPE in percent, MSE, RMSE)`` computed on the test data.
    """
    booster = LGBMRegressor()
    booster.fit(X_train_scaled, y_train)
    forecast = booster.predict(X_test_scaled)

    # Absolute error of each prediction as a fraction of the true value
    fractional_errors = np.abs((y_test - forecast) / y_test)
    mean_sq_err = mean_squared_error(y_test, forecast)

    return (
        mean_absolute_error(y_test, forecast),
        r2_score(y_test, forecast),
        np.mean(fractional_errors) * 100,
        mean_sq_err,
        np.sqrt(mean_sq_err),
    )


# Fit and score using the scaled train/test splits prepared earlier
outcome = model(X_train_scaled, y_train, X_test_scaled, y_test)
mae_test, r2_test, mape_test, mse_test, rmse_test = outcome

# One "<name>: <value>" line per metric, in the order they are returned
for tag, number in zip(("MAE", "R2", "MAPE", "MSE", "RMSE"), outcome):
    print(f"{tag}:", number)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 481
[LightGBM] [Info] Number of data points in the train set: 36276, number of used features: 54
[LightGBM] [Info] Start training from score 4311.513883
MAE: 10.002900324896984
R2: 0.9997520174906965
MAPE: 0.2745591758882116
MSE: 282.55598005502765
RMSE: 16.80940153768205


#### Interpretations:
The LGBMRegressor model has performed well on the testing data:

* Mean Absolute Error (MAE): 10.002900324896984

    * The MAE of approximately 10.00 indicates that, on average, the model's predictions are about $10.00 away from the actual electricity bill values.

* R-squared (R2): 0.9997520174906965

    * The high R-squared value of approximately 0.99975 suggests that the model explains about 99.975% of the variance in the electricity bill, indicating an excellent fit to the data.

* Mean Absolute Percentage Error (MAPE): 0.2745591758882116%

    * The MAPE is relatively low, indicating that, on average, the model's predictions deviate by only about 0.27% from the actual values.

* Mean Squared Error (MSE): 282.55598005502765

    * The MSE, which measures the average squared difference between the model's predictions and the actual values, is approximately 282.56.

* Root Mean Squared Error (RMSE): 16.80940153768205

    * The RMSE, the square root of the MSE, is approximately 16.81. This suggests that the typical error in predicting the electricity bill is around $16.81.

Overall, these metrics indicate that the LGBMRegressor model performs well in predicting electricity bills based on the provided features. However, compared to some other models like RandomForestRegressor, the LGBMRegressor model may have slightly higher errors.

# Summary Interpretations

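In summary, RandomForestRegressor stands out as the top performer among the models (MAE ≈ 1.36, RMSE ≈ 5.45), followed by DecisionTreeRegressor (RMSE ≈ 10.24), then CatBoostRegressor (RMSE ≈ 16.69), LGBMRegressor (RMSE ≈ 16.81) and XGBRegressor (RMSE ≈ 19.61). All five models demonstrate high accuracy and explain more than 99.96% of the variance in the electricity bill. Note that in the sample rows shown above the bill equals Monthly Hours × Tariff Rate exactly (for example, 384 × 8.4 = 3225.6), so near-perfect scores are to be expected if that relationship holds across the whole dataset. When considering computational efficiency and ease of use, RandomForestRegressor may be preferred due to its excellent performance, although training times were not measured in this notebook and its computational cost should be confirmed before relying on that claim.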