In [10]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np

# Read the pre-split training and validation tables from disk
train_data, val_data = (
    pd.read_csv(csv_path)
    for csv_path in ('Sets/train_data.csv', 'Sets/val_data.csv')
)

# Split each table into its feature matrix (every column except 'label')
# and its target column; the source frames themselves are left untouched.
X_train, y_train = train_data.drop('label', axis=1), train_data['label']
X_val, y_val = val_data.drop('label', axis=1), val_data['label']

# Perform K-Means clustering with 8 clusters.
# n_init is pinned explicitly: scikit-learn 1.4 changed its default from 10 to
# 'auto' (a single run with the default k-means++ init), so leaving it unset
# makes the fitted centroids depend on the installed version even though
# random_state is fixed. 10 is the pre-1.4 default.
kmeans = KMeans(n_clusters=8, n_init=10, random_state=42)
kmeans.fit(X_train)

# Predict the clusters for the validation set (nearest fitted centroid per row)
val_clusters = kmeans.predict(X_val)

# Map clusters to labels
# Each cluster is assigned the most frequent training label among its members
# (majority vote). The cluster count is read from the fitted model rather than
# repeated as a literal, so the mapping always covers exactly the clusters the
# model can predict.
cluster_label_mapping = {}
for cluster in range(kmeans.n_clusters):
    mask = (kmeans.labels_ == cluster)
    # mode() returns every tied label in sorted order; taking the first one
    # positionally breaks ties towards the smallest label.
    most_common_label = y_train[mask].mode().iloc[0]
    cluster_label_mapping[cluster] = most_common_label

# Convert clusters to predicted labels
val_predicted_labels = [cluster_label_mapping[cluster] for cluster in val_clusters]

# Score the majority-vote predictions against the true validation labels
accuracy = accuracy_score(y_val, val_predicted_labels)
print(f'Validation set accuracy: {accuracy:.2f}')

# Optional: show which label each cluster was assigned
mapping_lines = [
    f"Cluster {cluster}: Label {label}"
    for cluster, label in cluster_label_mapping.items()
]
print("Cluster to label mapping:")
for mapping_line in mapping_lines:
    print(mapping_line)

Validation set accuracy: 0.27
Cluster to label mapping:
Cluster 0: Label 2
Cluster 1: Label 5
Cluster 2: Label 2
Cluster 3: Label 6
Cluster 4: Label 2
Cluster 5: Label 5
Cluster 6: Label 5
Cluster 7: Label 5


In [9]:
# Class balance of the training set: number of rows per label,
# most frequent label first (value_counts sorts descending by default).
label_column = train_data['label']
label_column.value_counts()

5    3553
2    3372
6    2442
0    2225
3    1706
4    1696
1     751
7     740
Name: label, dtype: int64