## Importing Libraries
* **pandas**: For data manipulation and analysis
* **sklearn.cluster**: For clustering algorithms 
* **sklearn.model_selection**: For splitting the dataset into training and testing sets 
* **sklearn.tree**: For building decision-tree regression models (`DecisionTreeRegressor`)
* **scipy.spatial.distance**: For computing pairwise distances between data points (`cdist`)
* **matplotlib.pyplot**: For data visualization 
* **math**: For basic mathematical functions
* **os**: For file-system operations such as creating the output directory
* **random**: For generating random numbers 
* **numpy**: For numerical computing 

In [2]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from scipy.spatial.distance import cdist
import random as rd
import numpy as np
import math
import matplotlib.pyplot as plt
import os


## Reading in the input data

In [3]:
# Load the data
try:
    data = pd.read_csv("dataset.csv")
except FileNotFoundError as err:
    # Re-raise instead of print + exit(): in a Jupyter kernel exit() requests a
    # kernel shutdown, which discards all state and hides the real traceback.
    # Raising keeps the kernel alive, stops "Run All" at this cell, and chains
    # the original error so the missing path is still visible.
    raise FileNotFoundError(
        "Could not find the input file. Please check the file path and try again."
    ) from err

## Viewing the first few rows of the data

In [4]:
# Preview the first rows of the loaded table to confirm the CSV parsed as expected.
data.head()

Unnamed: 0,device_name,min_consumption,max_consumption,device_age
0,bulb,100.0,120.0,3
1,LED TV 42 INCH,58.0,60.0,5
2,3''belt sander,1000.0,1000.0,4
3,Air cooler,65.0,80.0,4
4,Air purifier,25.0,30.0,3


## Getting values for input_power, working_hours, and output_power

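This code cell simulates values for the `input_power`, `working_hours`, and `output_power` columns of the `data` DataFrame. `input_power` is drawn uniformly between `min_consumption` and `max_consumption`, `working_hours` is a random integer from 0 to 24 (inclusive), and `output_power` is drawn uniformly between 70% and 100% of `input_power`.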


In [5]:
# Per-device consumption bounds, read straight from the input table.
min_c = data['min_consumption'].values
max_c = data['max_consumption'].values

# Simulated input power: one uniform draw inside each device's [min, max] range.
random_consumption = [rd.uniform(low, high) for low, high in zip(min_c, max_c)]

# Simulated working hours: one integer draw from 0 to 24 (both ends inclusive) per device.
working_hours = [rd.randint(0, 24) for _ in min_c]

data['input_power'] = random_consumption
data['working_hours'] = working_hours

# Simulated output power: a uniform draw between 70% and 100% of the drawn input power.
output_power = [rd.uniform(drawn * 0.70, drawn) for drawn in random_consumption]
data['output_power'] = output_power
data.head()

Unnamed: 0,device_name,min_consumption,max_consumption,device_age,input_power,working_hours,output_power
0,bulb,100.0,120.0,3,100.310967,3,89.648287
1,LED TV 42 INCH,58.0,60.0,5,58.737271,10,56.690422
2,3''belt sander,1000.0,1000.0,4,1000.0,8,734.753019
3,Air cooler,65.0,80.0,4,70.680072,22,53.410833
4,Air purifier,25.0,30.0,3,29.24276,4,23.135054


## Calculating efficiency metrics

This code cell calculates two percentage columns of the `data` DataFrame: `Enhanced_efficiency = output_power / input_power * 100` and `efficiency = output_power / min_consumption * 100`.

In [6]:
# Enhanced efficiency: output power as a percentage of the simulated input power.
Enhanced_efficiency = [
    produced / consumed * 100
    for produced, consumed in zip(output_power, random_consumption)
]
data['Enhanced_efficiency'] = Enhanced_efficiency

# Baseline efficiency: output power as a percentage of the minimum consumption.
efficiency = data['output_power'].div(data['min_consumption']).mul(100)
data['efficiency'] = efficiency
data.head()


Unnamed: 0,device_name,min_consumption,max_consumption,device_age,input_power,working_hours,output_power,Enhanced_efficiency,efficiency
0,bulb,100.0,120.0,3,100.310967,3,89.648287,89.370374,89.648287
1,LED TV 42 INCH,58.0,60.0,5,58.737271,10,56.690422,96.515247,97.742107
2,3''belt sander,1000.0,1000.0,4,1000.0,8,734.753019,73.475302,73.475302
3,Air cooler,65.0,80.0,4,70.680072,22,53.410833,75.567032,82.170512
4,Air purifier,25.0,30.0,3,29.24276,4,23.135054,79.113784,92.540217


## Handling infinite and missing values

In [7]:
# Convert +inf / -inf to NaN, then drop every row that has a NaN in any column.
data = data.replace([np.inf, -np.inf], np.nan).dropna()
data.head()

Unnamed: 0,device_name,min_consumption,max_consumption,device_age,input_power,working_hours,output_power,Enhanced_efficiency,efficiency
0,bulb,100.0,120.0,3,100.310967,3,89.648287,89.370374,89.648287
1,LED TV 42 INCH,58.0,60.0,5,58.737271,10,56.690422,96.515247,97.742107
2,3''belt sander,1000.0,1000.0,4,1000.0,8,734.753019,73.475302,73.475302
3,Air cooler,65.0,80.0,4,70.680072,22,53.410833,75.567032,82.170512
4,Air purifier,25.0,30.0,3,29.24276,4,23.135054,79.113784,92.540217


## Adjusting efficiency values



In [8]:
# Snapshot both efficiency columns as NumPy arrays (positional, index-free).
List_efficiency = data['efficiency'].to_numpy()
List_enhan_efficiency = data['Enhanced_efficiency'].to_numpy()

# Push the baseline efficiency away from the enhanced efficiency by the size of
# the gap between them:
#   enhanced > baseline  ->  baseline - |enhanced - baseline|
#   enhanced < baseline  ->  baseline + |enhanced - baseline|
#   enhanced == baseline ->  unchanged
# All three cases reduce to baseline - (enhanced - baseline), so one vectorised
# expression replaces the per-row branches. It also builds a NEW array instead of
# writing into the array returned by `.values`, which is read-only when pandas
# Copy-on-Write is active.
List_efficiency = List_efficiency - (List_enhan_efficiency - List_efficiency)

# Clamp both percentage columns to the [0, 200] range.
data['efficiency'] = np.clip(List_efficiency, 0, 200)
data['Enhanced_efficiency'] = data['Enhanced_efficiency'].clip(lower=0, upper=200)
data.head()


Unnamed: 0,device_name,min_consumption,max_consumption,device_age,input_power,working_hours,output_power,Enhanced_efficiency,efficiency
0,bulb,100.0,120.0,3,100.310967,3,89.648287,89.370374,89.926199
1,LED TV 42 INCH,58.0,60.0,5,58.737271,10,56.690422,96.515247,98.968967
2,3''belt sander,1000.0,1000.0,4,1000.0,8,734.753019,73.475302,73.475302
3,Air cooler,65.0,80.0,4,70.680072,22,53.410833,75.567032,88.773992
4,Air purifier,25.0,30.0,3,29.24276,4,23.135054,79.113784,105.96665


## Training a DecisionTreeRegressor model

In [9]:
# Features: both consumption bounds plus the simulated input power.
X = data.loc[:, ['min_consumption', 'max_consumption', 'input_power']]
# Target: the adjusted baseline efficiency.
y = data.loc[:, 'efficiency']

# Fit a default decision tree on the full table (no train/test split is made here).
Normal = DecisionTreeRegressor().fit(X, y)
Normal.score(X, y)  # training-set score; computed but neither displayed nor stored
print(Normal.feature_importances_)


[0.37857567 0.430308   0.19111633]


In [10]:
# Features and target for predicting the enhanced efficiency (K-centroid section):
# the same three feature columns as before, with Enhanced_efficiency as the target.
X = data.loc[:, ['min_consumption', 'max_consumption', 'input_power']]
y = data.loc[:, 'Enhanced_efficiency']


In [17]:
def find_centroid(cluster):
  """
  Compute the centroid (arithmetic mean) of a cluster.

  Args:
    cluster: A non-empty list of data points that support `+` and division
      by an integer (e.g. NumPy arrays or plain numbers).

  Returns:
    The element-wise mean of the points in the cluster.
  """
  # Accumulate from 0, exactly like the built-in sum() does.
  running_total = 0
  for point in cluster:
    running_total = running_total + point

  # Mean = accumulated total divided by the number of points.
  return running_total / len(cluster)

## k-centroid hierarchical clustering


In [18]:
def _k_central_points(cluster, k):
  """Return, as a 2-D float array, the (at most) k points of `cluster` nearest its centroid."""
  points = np.asarray(cluster, dtype=float).reshape(len(cluster), -1)
  if len(points) <= k:
    return points
  offsets = np.linalg.norm(points - points.mean(axis=0), axis=1)
  # Stable sort keeps the original order among equally distant points.
  return points[np.argsort(offsets, kind='stable')[:k]]


def hierarchical_clustering(dataset, k, n_clusters=1):
  """
  Agglomerative clustering with a k-centroid average linkage.

  Every data point starts as its own cluster. On each step the two closest
  clusters are merged, where the distance between clusters Cu and Cv is the
  mean pairwise Euclidean distance between the min(k, |Cu|) points of Cu
  nearest Cu's centroid and the min(k, |Cv|) points of Cv nearest Cv's
  centroid.

  The previous implementation summed each point's distance to its *own*
  centroid and compared only the last value computed, so it never measured
  how far apart two clusters were; singleton clusters all scored 0 and were
  merged in index order.

  Args:
    dataset: An iterable of data points (e.g. the rows of a 2-D NumPy array).
    k: Number of most-central points per cluster used by the linkage.
      Must be at least 1.
    n_clusters: Number of clusters at which merging stops. Defaults to 1,
      which merges everything into a single cluster as before.

  Returns:
    A list of clusters; each cluster is a list of the original data points.

  Raises:
    ValueError: If `k` or `n_clusters` is smaller than 1, or if no finite
      distance exists between any pair of clusters (e.g. NaN coordinates).
  """
  if k < 1:
    raise ValueError('k must be at least 1')
  if n_clusters < 1:
    raise ValueError('n_clusters must be at least 1')

  # Initialize the clusters: one singleton per data point.
  clusters = [[x] for x in dataset]
  # Cache each cluster's k most central points; only the merged cluster's
  # entry has to be recomputed after a merge.
  central = [_k_central_points(c, k) for c in clusters]

  while len(clusters) > n_clusters:
    # Find the closest pair of clusters.
    min_distance = float('inf')
    merge_clusters = None
    for i in range(len(clusters)):
      for j in range(i + 1, len(clusters)):
        # Mean of all pairwise distances between the two central subsets.
        linkage = cdist(central[i], central[j]).mean()
        if linkage < min_distance:
          min_distance = linkage
          merge_clusters = (i, j)

    if merge_clusters is None:
      raise ValueError('no finite distance between any pair of clusters')

    # Merge the two closest clusters and move the result to the end of the list.
    merged_cluster = clusters[merge_clusters[0]] + clusters[merge_clusters[1]]
    clusters = [c for idx, c in enumerate(clusters) if idx not in merge_clusters]
    central = [p for idx, p in enumerate(central) if idx not in merge_clusters]
    clusters.append(merged_cluster)
    central.append(_k_central_points(merged_cluster, k))

  # Return the list of clusters.
  return clusters


In [19]:
# Run the hierarchical clustering on the raw feature rows with k=3,
# then print the resulting list of clusters.
clusters = hierarchical_clustering(dataset=X.values, k=3)
print(clusters)


In [14]:
# Print each cluster under a 1-based heading, one point per line,
# with a blank line after every cluster.
for number, members in enumerate(clusters, start=1):
    print('Cluster {}:'.format(number))
    for member in members:
        print(member)
    print()

Cluster 1:
[ 7500.        10500.         9339.1045945]
[220.         270.         237.12151145]
[ 8000.         14000.         10909.78969797]
[2000.         3000.         2921.56217768]
[3250.         9750.         8530.85850494]
[240.        860.        833.8015235]
[6600.         8800.         8013.21057106]
[1300.         1500.         1459.81032213]
[1500.         4500.         3061.55584743]
[ 3800.        11400.        10477.9152073]
[3000.        7500.        5888.2604151]
[ 50.         100.          83.49107754]
[ 500.         1800.          927.78879434]
[5400.        6750.        6433.4039994]
[1800.         5000.         2527.14524625]
[100. 100. 100.]
[ 4.         34.          7.29195546]
[1800.         5000.         1840.39971614]
[ 875.         2300.         1473.97247046]
[4700.        4500.        4513.6672402]
[ 50.         100.          73.53862466]
[1000.         4000.         3485.16987503]
[14.        15.        14.9269791]
[3000.         3002.5        3000.448246

## Decision Tree Regressor Model with Enhanced Efficiency


In [15]:
# Print the centroid (per-feature mean) of every cluster under a 1-based heading.
for number, members in enumerate(clusters, start=1):
    print('Centroid {}:'.format(number))
    print(find_centroid(members))
    print()

# Fit a second default decision tree on the current X / y (the Enhanced_efficiency
# target). Note that `clusters` is only printed above; it is not passed to the model.
K_centroid = DecisionTreeRegressor().fit(X, y)
K_centroid.score(X, y)  # training-set score; computed but neither displayed nor stored
print(K_centroid.feature_importances_)

Centroid 1:
[549.50982609 855.70405686 706.77567386]

[0.22715328 0.32934793 0.44349879]


## Test input

In [16]:

# One hypothetical device: min_consumption=300, max_consumption=600, input_power=400.
# Both trees were fitted on a DataFrame, so scikit-learn recorded the column
# names; passing a bare list makes predict() warn that the input has no valid
# feature names. Supplying the same column names keeps the call warning-free.
test_input = pd.DataFrame(
    [[300, 600, 400]],
    columns=['min_consumption', 'max_consumption', 'input_power'],
)
test_output = Normal.predict(test_input)
print("Normal efficiency: ", test_output)
test_output = K_centroid.predict(test_input)
print("Enhanced efficiency: ", test_output)


Normal efficiency:  [128.84168386]
Enhanced efficiency:  [91.79915006]




## Saving models

In [28]:
import pickle

# exist_ok=True tolerates an already-existing directory, while any other failure
# (permissions, a file with that name, ...) is raised here instead of being
# swallowed by a bare `except` and resurfacing later as a confusing open() error.
os.makedirs('pretrained_model', exist_ok=True)

# `with` guarantees each file is flushed and closed even if pickling fails.
with open('pretrained_model/Normal.pkl', 'wb') as model_file:
    pickle.dump(Normal, model_file)
with open('pretrained_model/K_centroid.pkl', 'wb') as model_file:
    pickle.dump(K_centroid, model_file)
print("Model saved successfully")

Model saved successfully
