Loading the Data

In [1]:
import pandas as pd

file_path = 'Consumer_Dataset.csv'
data = pd.read_csv(file_path)

data.head()

Unnamed: 0.1,Unnamed: 0,Gender,Age,Ever_Married,Family_Size,Profession,Graduated,Work_Experience,Energy_Consumption,Preferred_Renewable,Group
0,0,Male,22,No,4.0,Healthcare,No,1.0,Low,Hydro,D
1,1,Female,38,Yes,3.0,Engineer,Yes,,Average,Hydro,A
2,2,Female,67,Yes,1.0,Engineer,Yes,1.0,Low,Solar,B
3,3,Male,67,Yes,2.0,Lawyer,Yes,0.0,High,Solar,B
4,4,Female,40,Yes,6.0,Entertainment,Yes,,High,Solar,A


Pre-processing the Data

In [2]:
data=data.dropna() 

In [3]:
print("Unique values before encoding:", data['Energy_Consumption'].unique())

Unique values before encoding: ['Low' 'High' 'Average']


In [4]:
energy_consumption_mapping = {
    'Low': 0,
    'Average': 100,
    'High': 200
}

data['Energy_Consumption'] = data['Energy_Consumption'].map(energy_consumption_mapping)

print("Unique values after encoding:", data['Energy_Consumption'].unique())

Unique values after encoding: [  0 200 100]


In [5]:
from sklearn.preprocessing import OneHotEncoder
categorical_columns = data.select_dtypes(include=['object']).columns.tolist()
encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(data[categorical_columns])
one_hot_data = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))
df_encoded = pd.concat([data, one_hot_data], axis=1)
df_encoded = df_encoded.drop(categorical_columns, axis=1)
df_encoded.columns

Index(['Unnamed: 0', 'Age', 'Family_Size', 'Work_Experience',
       'Energy_Consumption', 'Gender_Female', 'Gender_Male', 'Ever_Married_No',
       'Ever_Married_Yes', 'Profession_Artist', 'Profession_Doctor',
       'Profession_Engineer', 'Profession_Entertainment',
       'Profession_Executive', 'Profession_Healthcare', 'Profession_Homemaker',
       'Profession_Lawyer', 'Profession_Marketing', 'Graduated_No',
       'Graduated_Yes', 'Preferred_Renewable_Biomass',
       'Preferred_Renewable_Geothermal', 'Preferred_Renewable_Hydro',
       'Preferred_Renewable_Not Interested', 'Preferred_Renewable_Solar',
       'Preferred_Renewable_Tidal', 'Preferred_Renewable_Wind', 'Group_A',
       'Group_B', 'Group_C', 'Group_D'],
      dtype='object')

Data Processing

In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras.optimizers import Adam

if df_encoded.isnull().values.any():
    df_encoded = df_encoded.fillna(df_encoded.mean())

# Drop the index and separate features and labels
X = df_encoded.drop(columns=['Group_A', 'Group_B', 'Group_C', 'Group_D', 'Unnamed: 0'])
y = df_encoded[['Group_A', 'Group_B', 'Group_C', 'Group_D']]

# Convert labels to numpy arrays
X=X.values
y = y.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Model along with Hyperparameter Tuning

In [7]:
from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters

  from kerastuner.tuners import RandomSearch


In [8]:
def build_model(hp):
    model = Sequential()
    model.add(InputLayer(input_shape=(X_train.shape[1],)))
    model.add(Dense(units=hp.Int('units1', min_value=32, max_value=512, step=32), activation='relu'))
    model.add(Dense(units=hp.Int('units2', min_value=32, max_value=512, step=32), activation='relu'))
    model.add(Dense(4, activation='softmax'))  
    optimizer = Adam(learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4]))
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [9]:
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',  
    max_trials=10, 
    directory='my_dir',  
    project_name='my_project' 
)

Reloading Tuner from my_dir\my_project\tuner0.json


In [10]:
tuner.search(X_train, y_train,epochs=50,validation_split=0.2)

best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]

In [11]:
final_model = tuner.hypermodel.build(best_hp)
final_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)



Epoch 1/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.2821 - loss: 2.4235 - val_accuracy: 0.2720 - val_loss: 1.4913
Epoch 2/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.3264 - loss: 1.4835 - val_accuracy: 0.3240 - val_loss: 1.4232
Epoch 3/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.3747 - loss: 1.4332 - val_accuracy: 0.4432 - val_loss: 1.2569
Epoch 4/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4005 - loss: 1.3264 - val_accuracy: 0.3752 - val_loss: 1.3432
Epoch 5/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3945 - loss: 1.3584 - val_accuracy: 0.5240 - val_loss: 1.2510
Epoch 6/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3995 - loss: 1.2979 - val_accuracy: 0.3704 - val_loss: 1.2971
Epoch 7/50
[1m157/157[0m 

<keras.src.callbacks.history.History at 0x243ffad7750>

In [12]:
loss, accuracy = final_model.evaluate(X_test, y_test)
print(f'Test accuracy: {accuracy}')

[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4037 - loss: 1.2244
Test accuracy: 0.40754958987236023


__Assigning Labels to the Test Data__

Pre-Processing the Test Data

In [13]:
file_path_test = 'Consumer_Test_Dataset.csv'
data_test = pd.read_csv(file_path_test)

In [14]:
data_test.head()

Unnamed: 0.1,Unnamed: 0,Gender,Age,Ever_Married,Family_Size,Profession,Graduated,Work_Experience,Energy_Consumption,Preferred_Renewable
0,0,Female,36,Yes,1.0,Engineer,Yes,0.0,Low,Solar
1,1,Male,37,Yes,4.0,Healthcare,Yes,8.0,Average,Solar
2,2,Female,69,Yes,1.0,,No,0.0,Low,Solar
3,3,Male,59,Yes,2.0,Executive,No,11.0,High,Solar
4,4,Female,19,No,4.0,Marketing,No,,Low,Solar


In [15]:
data_test=data_test.dropna()

In [16]:
print("Unique values before encoding:", data_test['Energy_Consumption'].unique())
data_test['Energy_Consumption'] = data_test['Energy_Consumption'].map(energy_consumption_mapping)
print("Unique values after encoding:", data_test['Energy_Consumption'].unique())

Unique values before encoding: ['Low' 'Average' 'High']
Unique values after encoding: [  0 100 200]


In [17]:
categorical_columns_test = data_test.select_dtypes(include=['object']).columns.tolist()
one_hot_encoded_test = encoder.fit_transform(data_test[categorical_columns_test])
one_hot_data_test = pd.DataFrame(one_hot_encoded_test, columns=encoder.get_feature_names_out(categorical_columns_test))
df_encoded_test = pd.concat([data_test, one_hot_data_test], axis=1)
df_encoded_test= df_encoded_test.drop(categorical_columns_test, axis=1)
df_encoded_test.columns

Index(['Unnamed: 0', 'Age', 'Family_Size', 'Work_Experience',
       'Energy_Consumption', 'Gender_Female', 'Gender_Male', 'Ever_Married_No',
       'Ever_Married_Yes', 'Profession_Artist', 'Profession_Doctor',
       'Profession_Engineer', 'Profession_Entertainment',
       'Profession_Executive', 'Profession_Healthcare', 'Profession_Homemaker',
       'Profession_Lawyer', 'Profession_Marketing', 'Graduated_No',
       'Graduated_Yes', 'Preferred_Renewable_Biomass',
       'Preferred_Renewable_Geothermal', 'Preferred_Renewable_Hydro',
       'Preferred_Renewable_Not Interested', 'Preferred_Renewable_Solar',
       'Preferred_Renewable_Tidal', 'Preferred_Renewable_Wind'],
      dtype='object')

In [18]:
if df_encoded_test.isnull().values.any():
    df_encoded_test = df_encoded_test.fillna(df_encoded_test.mean())

X_predict = df_encoded_test.drop(columns=['Unnamed: 0'])
X_predict=X_predict.values

In [19]:
class_mapping = {
    1: 'A',
    2: 'B',
    3: 'C',
    4: 'D'
}

y_pred_prob = final_model.predict(X_predict)

y_pred = np.argmax(y_pred_prob, axis=1)

y_pred_labels = [class_mapping[class_num + 1] for class_num in y_pred]

[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


Adding the Predicted Labels

In [20]:
df_encoded_test['Group']=y_pred_labels
df_encoded_test.head()

Unnamed: 0.1,Unnamed: 0,Age,Family_Size,Work_Experience,Energy_Consumption,Gender_Female,Gender_Male,Ever_Married_No,Ever_Married_Yes,Profession_Artist,...,Graduated_No,Graduated_Yes,Preferred_Renewable_Biomass,Preferred_Renewable_Geothermal,Preferred_Renewable_Hydro,Preferred_Renewable_Not Interested,Preferred_Renewable_Solar,Preferred_Renewable_Tidal,Preferred_Renewable_Wind,Group
0,0.0,36.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,B
1,1.0,37.0,4.0,8.0,100.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,D
3,3.0,59.0,2.0,11.0,200.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,A
5,5.0,47.0,5.0,0.0,200.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,A
6,6.0,61.0,3.0,5.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C


__K Means Clustering__

In [21]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [22]:
data_unsupervised = df_encoded_test.drop(['Group'], axis=1)  # Drop the 'Group' column


In [23]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
#data_unsupervised[['Age', 'Family_Size', 'Work_Experience', 'Energy_Consumption']] = scaler.fit_transform(data_unsupervised[['Age', 'Family_Size', 'Work_Experience', 'Energy_Consumption']])

In [24]:
k = 4
kmeans = KMeans(n_clusters=k, random_state=42)
data_unsupervised['Cluster'] = kmeans.fit_predict(data_unsupervised)

# Check the counts in each cluster
print(data_unsupervised['Cluster'].value_counts())

Cluster
0    947
2    656
3    479
1    459
Name: count, dtype: int64


In [25]:
data_unsupervised['Cluster'] = kmeans.labels_
data_unsupervised['Group']=df_encoded_test['Group']
data_unsupervised.head()

Unnamed: 0.1,Unnamed: 0,Age,Family_Size,Work_Experience,Energy_Consumption,Gender_Female,Gender_Male,Ever_Married_No,Ever_Married_Yes,Profession_Artist,...,Graduated_Yes,Preferred_Renewable_Biomass,Preferred_Renewable_Geothermal,Preferred_Renewable_Hydro,Preferred_Renewable_Not Interested,Preferred_Renewable_Solar,Preferred_Renewable_Tidal,Preferred_Renewable_Wind,Cluster,Group
0,0.0,36.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,B
1,1.0,37.0,4.0,8.0,100.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,D
3,3.0,59.0,2.0,11.0,200.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2,A
5,5.0,47.0,5.0,0.0,200.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,A
6,6.0,61.0,3.0,5.0,0.0,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,C


In [26]:
from collections import defaultdict

# Initialize a nested dictionary to hold counts
counts = defaultdict(lambda: defaultdict(int))

# Iterate over rows in the DataFrame
for index, row in data_unsupervised.iterrows():
    cluster_value = row['Cluster']
    segment_value = row['Group']
    counts[cluster_value][segment_value] += 1

# Compute total counts for each cluster
total_counts = {cluster: sum(counts[cluster].values()) for cluster in counts}

# Print counts and percentages for each combination
print("Counts and Percentages:")
for cluster in sorted(counts.keys()):
    print(f"Cluster {cluster}:")
    for segment in 'ABCD':
        count = counts[cluster][segment]
        percentage = (count / total_counts[cluster]) * 100 if total_counts[cluster] > 0 else 0
        print(f"  {segment}: {count} ({percentage:.2f}%)")


Counts and Percentages:
Cluster 0:
  A: 385 (40.65%)
  B: 64 (6.76%)
  C: 222 (23.44%)
  D: 276 (29.14%)
Cluster 1:
  A: 364 (79.30%)
  B: 47 (10.24%)
  C: 18 (3.92%)
  D: 30 (6.54%)
Cluster 2:
  A: 257 (39.18%)
  B: 58 (8.84%)
  C: 146 (22.26%)
  D: 195 (29.73%)
Cluster 3:
  A: 181 (37.79%)
  B: 47 (9.81%)
  C: 100 (20.88%)
  D: 151 (31.52%)


In [27]:
cluster_map = {0: 'B', 1: 'D', 2: 'A', 3: 'C'}

# Map clusters to groups
data_unsupervised['Predicted_Group'] = data_unsupervised['Cluster'].map(cluster_map)
data_unsupervised.head()

Unnamed: 0.1,Unnamed: 0,Age,Family_Size,Work_Experience,Energy_Consumption,Gender_Female,Gender_Male,Ever_Married_No,Ever_Married_Yes,Profession_Artist,...,Preferred_Renewable_Biomass,Preferred_Renewable_Geothermal,Preferred_Renewable_Hydro,Preferred_Renewable_Not Interested,Preferred_Renewable_Solar,Preferred_Renewable_Tidal,Preferred_Renewable_Wind,Cluster,Group,Predicted_Group
0,0.0,36.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,B,A
1,1.0,37.0,4.0,8.0,100.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,D,A
3,3.0,59.0,2.0,11.0,200.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2,A,A
5,5.0,47.0,5.0,0.0,200.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,A,A
6,6.0,61.0,3.0,5.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,C,A


Error in The predictions by K-Nearest Means

In [28]:
error = (data_unsupervised['Group'] != data_unsupervised['Predicted_Group']).sum()

# Calculate the percentage error
percentage_error = (error / len(data_unsupervised)) * 100

# Print the percentage error
print(f"Percentage Error: {percentage_error:.2f}%")


Percentage Error: 82.25%


It is clearly seen K-Means doesnt perform that well on the Data