In [None]:
cd ..

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

from scripts import utils

In [None]:
# Path to the training CSV, relative to the current working directory.
train_csv = "dataset/train.csv"

# Load the whole table into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer=train_csv)

In [None]:
# Partition the rows by the value of their "object" column.
grouped = dataset.groupby(by="object")

# Cell output: how many distinct groups the partition produced.
grouped.ngroups

In [None]:
# Rows partitioned by "object"; each partition is clustered independently.
grouped = dataset.groupby(by="object")

# Sentence-embedding model used to turn descriptions into vectors.
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")


# Step 6: Function to clusterize each group
def clusterize_group(group, n_clusters=4):
    """Assign a KMeans cluster label to every row of ``group``.

    The ``description`` column is embedded with the notebook-level ``model``
    and the resulting vectors are clustered with KMeans (``random_state=42``).

    Args:
        group: DataFrame containing a ``description`` column (typically one
            slice produced by ``groupby``).
        n_clusters: Requested number of clusters. It is capped at the number
            of rows, because KMeans cannot fit more clusters than samples.

    Returns:
        A copy of ``group`` with an added ``cluster`` column. The frame passed
        in is left unmodified.
    """
    # Work on a copy: writing a column into a groupby slice mutates shared
    # data and can trigger SettingWithCopyWarning.
    result = group.copy()
    n_rows = len(result)

    # Nothing to embed or cluster; still return the expected column.
    if n_rows == 0:
        result["cluster"] = pd.Series(dtype="int64")
        return result

    # Encode descriptions into numerical vectors
    embeddings = model.encode(result["description"].tolist())

    # Small groups would otherwise make KMeans raise ValueError
    # (n_samples must be >= n_clusters).
    effective_clusters = min(n_clusters, n_rows)
    kmeans = KMeans(n_clusters=effective_clusters, random_state=42)
    result["cluster"] = kmeans.fit_predict(embeddings)

    return result

In [None]:
# Cluster every "object" group on its own, then flatten the result back into
# a single frame with a fresh 0..n-1 index.
df_clustered = (
    grouped
    .apply(clusterize_group)
    .reset_index(drop=True)
)

In [None]:
# Ground-truth labels taken from the "target" column
target = df_clustered["target"]

# Cluster labels assigned by clusterize_group
clusters = df_clustered["cluster"]

In [None]:
# Show every assigned cluster label as a plain Python list.
clusters.to_list()

In [None]:
from itertools import permutations

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Requires `df_clustered` and `clusters` from the cells above:
#   - df_clustered["target"] holds the actual labels (matrix rows)
#   - clusters holds the predicted cluster labels (matrix columns)

# Raw confusion matrix, before cluster ids are matched to target classes
cm = confusion_matrix(y_true=df_clustered["target"], y_pred=clusters)


# Find the optimal mapping to maximize accuracy
def find_best_mapping(conf_matrix):
    """Find the column permutation that maximizes the confusion-matrix trace.

    Every permutation of the first ``n`` column indices is tried, where ``n``
    is the number of rows. Accuracy is ``trace / total`` and the total is the
    same for every permutation, so ranking by trace is equivalent and avoids a
    0/0 division on an all-zero matrix.

    Args:
        conf_matrix: 2-D array-like of counts (rows: actual, columns:
            predicted), with at least as many columns as rows.

    Returns:
        A tuple ``perm`` where ``perm[i]`` is the column index matched to row
        ``i``. Ties are resolved in favour of the first permutation in
        lexicographic order. A permutation is always returned, including when
        no permutation yields a positive accuracy (the identity is returned).
    """
    conf_matrix = np.asarray(conf_matrix)
    n_classes = conf_matrix.shape[0]
    row_idx = np.arange(n_classes)

    best_trace = None
    best_mapping = None

    # Exhaustive search: n! candidates, fine for a handful of classes only.
    for perm in permutations(range(n_classes)):
        # Sum of the diagonal after reordering the columns by `perm`
        col_idx = np.asarray(perm, dtype=np.intp)
        trace = conf_matrix[row_idx, col_idx].sum()
        # `best_trace is None` guarantees the first permutation is accepted,
        # so the function never returns None.
        if best_trace is None or trace > best_trace:
            best_trace = trace
            best_mapping = perm

    return best_mapping


# Column permutation that puts the best-matching cluster on each diagonal cell
best_mapping = find_best_mapping(cm)

# Reorder the columns so that column j holds the cluster matched to target j
optimal_cm = cm[np.ix_(range(cm.shape[0]), best_mapping)]

# Visualize the optimal confusion matrix
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    optimal_cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    # The columns were permuted above, so each tick must show the original
    # column index it came from; labelling them 0..n-1 would name the wrong
    # cluster. NOTE(review): these are column indices of `cm`, which equal the
    # cluster ids only if the labels are exactly 0..n-1 - confirm.
    xticklabels=list(best_mapping),
    yticklabels=range(cm.shape[0]),
    ax=ax,
)
ax.set_xlabel("Predicted Cluster")
ax.set_ylabel("Actual Target")
ax.set_title("Confusion Matrix with Optimal Mapping")
plt.show()

# Print accuracy of the optimal mapping (diagonal mass over total count)
optimal_accuracy = np.trace(optimal_cm) / np.sum(optimal_cm)
print(f"Optimal Accuracy: {optimal_accuracy:.2f}")

In [None]:
# Visualization of object - description - target, using the training images
# directory and the same CSV that was loaded into `dataset`.
# NOTE(review): the third argument (10) is presumably the number of samples to
# display - confirm against scripts.utils.plot_description_image_target.
utils.plot_description_image_target("dataset/images/train", train_csv, 10)