In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [2]:
import pandas as pd

file_path = 'Consumer_Dataset.csv'
data = pd.read_csv(file_path)

data.head()

Unnamed: 0.1,Unnamed: 0,Gender,Age,Ever_Married,Family_Size,Profession,Graduated,Work_Experience,Energy_Consumption,Preferred_Renewable,Group
0,0,Male,22,No,4.0,Healthcare,No,1.0,Low,Hydro,D
1,1,Female,38,Yes,3.0,Engineer,Yes,,Average,Hydro,A
2,2,Female,67,Yes,1.0,Engineer,Yes,1.0,Low,Solar,B
3,3,Male,67,Yes,2.0,Lawyer,Yes,0.0,High,Solar,B
4,4,Female,40,Yes,6.0,Entertainment,Yes,,High,Solar,A


In [3]:
data=data.dropna() 

In [4]:
energy_consumption_mapping = {
    'Low': 0,
    'Average': 100,
    'High': 200
}

data['Energy_Consumption'] = data['Energy_Consumption'].map(energy_consumption_mapping)

print("Unique values after encoding:", data['Energy_Consumption'].unique())

Unique values after encoding: [  0 200 100]


In [5]:
data_unsupervised = data.drop(['Group','Unnamed: 0'], axis=1)
data_unsupervised.head()

Unnamed: 0,Gender,Age,Ever_Married,Family_Size,Profession,Graduated,Work_Experience,Energy_Consumption,Preferred_Renewable
0,Male,22,No,4.0,Healthcare,No,1.0,0,Hydro
2,Female,67,Yes,1.0,Engineer,Yes,1.0,0,Solar
3,Male,67,Yes,2.0,Lawyer,Yes,0.0,200,Solar
5,Male,56,Yes,2.0,Artist,No,0.0,100,Solar
6,Male,32,No,3.0,Healthcare,Yes,1.0,0,Solar


In [6]:
data_unsupervised = pd.get_dummies(data_unsupervised, columns=['Gender', 'Ever_Married', 'Profession', 'Graduated', 'Preferred_Renewable'])
data_unsupervised.head()

Unnamed: 0,Age,Family_Size,Work_Experience,Energy_Consumption,Gender_Female,Gender_Male,Ever_Married_No,Ever_Married_Yes,Profession_Artist,Profession_Doctor,...,Profession_Marketing,Graduated_No,Graduated_Yes,Preferred_Renewable_Biomass,Preferred_Renewable_Geothermal,Preferred_Renewable_Hydro,Preferred_Renewable_Not Interested,Preferred_Renewable_Solar,Preferred_Renewable_Tidal,Preferred_Renewable_Wind
0,22,4.0,1.0,0,False,True,True,False,False,False,...,False,True,False,False,False,True,False,False,False,False
2,67,1.0,1.0,0,True,False,False,True,False,False,...,False,False,True,False,False,False,False,True,False,False
3,67,2.0,0.0,200,False,True,False,True,False,False,...,False,False,True,False,False,False,False,True,False,False
5,56,2.0,0.0,100,False,True,False,True,True,False,...,False,True,False,False,False,False,False,True,False,False
6,32,3.0,1.0,0,False,True,True,False,False,False,...,False,False,True,False,False,False,False,True,False,False


In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
#data_unsupervised[['Age', 'Family_Size', 'Work_Experience', 'Energy_Consumption']] = scaler.fit_transform(data[['Age', 'Family_Size', 'Work_Experience', 'Energy_Consumption']])

In [8]:
k = 4
kmeans = KMeans(n_clusters=k, random_state=42)
data_unsupervised['Cluster'] = kmeans.fit_predict(data_unsupervised)

# Check the counts in each cluster
print(data_unsupervised['Cluster'].value_counts())

Cluster
3    2944
1    1662
0    1055
2    1004
Name: count, dtype: int64


In [9]:
data_unsupervised['Cluster'] = kmeans.labels_
data_unsupervised['Group']=data['Group']
data_unsupervised.head()

Unnamed: 0,Age,Family_Size,Work_Experience,Energy_Consumption,Gender_Female,Gender_Male,Ever_Married_No,Ever_Married_Yes,Profession_Artist,Profession_Doctor,...,Graduated_Yes,Preferred_Renewable_Biomass,Preferred_Renewable_Geothermal,Preferred_Renewable_Hydro,Preferred_Renewable_Not Interested,Preferred_Renewable_Solar,Preferred_Renewable_Tidal,Preferred_Renewable_Wind,Cluster,Group
0,22,4.0,1.0,0,False,True,True,False,False,False,...,False,False,False,True,False,False,False,False,3,D
2,67,1.0,1.0,0,True,False,False,True,False,False,...,True,False,False,False,False,True,False,False,0,B
3,67,2.0,0.0,200,False,True,False,True,False,False,...,True,False,False,False,False,True,False,False,2,B
5,56,2.0,0.0,100,False,True,False,True,True,False,...,False,False,False,False,False,True,False,False,1,C
6,32,3.0,1.0,0,False,True,True,False,False,False,...,True,False,False,False,False,True,False,False,3,C


In [10]:
from collections import defaultdict

counts = defaultdict(lambda: defaultdict(int))

for index, row in data_unsupervised.iterrows():
    cluster_value = row['Cluster']
    segment_value = row['Group']
    counts[cluster_value][segment_value] += 1

total_counts = {cluster: sum(counts[cluster].values()) for cluster in counts}

print("Counts and Percentages:")
for cluster in sorted(counts.keys()):
    print(f"Cluster {cluster}:")
    for segment in 'ABCD':
        count = counts[cluster][segment]
        percentage = (count / total_counts[cluster]) * 100 if total_counts[cluster] > 0 else 0
        print(f"  {segment}: {count} ({percentage:.2f}%)")


Counts and Percentages:
Cluster 0:
  A: 351 (33.27%)
  B: 322 (30.52%)
  C: 196 (18.58%)
  D: 186 (17.63%)
Cluster 1:
  A: 270 (16.25%)
  B: 499 (30.02%)
  C: 795 (47.83%)
  D: 98 (5.90%)
Cluster 2:
  A: 214 (21.31%)
  B: 315 (31.37%)
  C: 370 (36.85%)
  D: 105 (10.46%)
Cluster 3:
  A: 781 (26.53%)
  B: 436 (14.81%)
  C: 359 (12.19%)
  D: 1368 (46.47%)


In [11]:
cluster_map = {0: 'C', 1: 'A', 2: 'D', 3: 'B'}

data_unsupervised['Predicted_Group'] = data_unsupervised['Cluster'].map(cluster_map)
data_unsupervised.head()

Unnamed: 0,Age,Family_Size,Work_Experience,Energy_Consumption,Gender_Female,Gender_Male,Ever_Married_No,Ever_Married_Yes,Profession_Artist,Profession_Doctor,...,Preferred_Renewable_Biomass,Preferred_Renewable_Geothermal,Preferred_Renewable_Hydro,Preferred_Renewable_Not Interested,Preferred_Renewable_Solar,Preferred_Renewable_Tidal,Preferred_Renewable_Wind,Cluster,Group,Predicted_Group
0,22,4.0,1.0,0,False,True,True,False,False,False,...,False,False,True,False,False,False,False,3,D,B
2,67,1.0,1.0,0,True,False,False,True,False,False,...,False,False,False,False,True,False,False,0,B,C
3,67,2.0,0.0,200,False,True,False,True,False,False,...,False,False,False,False,True,False,False,2,B,D
5,56,2.0,0.0,100,False,True,False,True,True,False,...,False,False,False,False,True,False,False,1,C,A
6,32,3.0,1.0,0,False,True,True,False,False,False,...,False,False,False,False,True,False,False,3,C,B


In [12]:
error = (data_unsupervised['Group'] != data_unsupervised['Predicted_Group']).sum()

percentage_error = (error / len(data_unsupervised)) * 100

print(f"Percentage Error: {percentage_error:.2f}%")

Percentage Error: 84.89%
