In [35]:
import random
import math
import numpy as np

In [36]:
def readArchive(fileName):
    with open(fileName, 'r') as file:
        lines = file.readlines()

    # Elimina los caracteres de salto de línea y divide los valores por comas
    data = [line.strip().split(',') for line in lines]

    # Convierte la lista en una matriz de numpy
    return np.array(data, dtype=int)

dataTrain = readArchive('optdigits.tra')

X_train = dataTrain[:, :64]
Y_train = dataTrain[:, 64]

dataTest = readArchive('optdigits.tes')
X_test = dataTest[:, :64]
Y_test = dataTest[: , 64]

In [37]:
class KMeans:
    def __init__(self, k, max_iterations=100):
        self.k = k
        self.max_iterations = max_iterations

    def fit(self, data):
        self.centroids = self._initialize_centroids(data)
        self.clusters = [[] for _ in range(self.k)]

        for _ in range(self.max_iterations):
            self._assign_clusters(data)
            prev_centroids = self.centroids.copy()
            self._update_centroids(data)
            if self._has_converged(prev_centroids):
                break

    def predict(self, data):
        labels = []
        for point in data:
            label = self._get_label(point)
            labels.append(label)
        return labels

    def _initialize_centroids(self, data):
        centroids_indices = random.sample(range(len(data)), self.k)
        centroids = [data[i] for i in centroids_indices]
        return centroids

    def _assign_clusters(self, data):
        self.clusters = [[] for _ in range(self.k)]
        for point in data:
            centroid_index = self._closest_centroid(point)
            self.clusters[centroid_index].append(point)

    def _update_centroids(self, data):
        for i in range(self.k):
            if self.clusters[i]:
                centroid = self._calculate_centroid(self.clusters[i])
                self.centroids[i] = centroid

    def _closest_centroid(self, point):
        distances = [self._euclidean_distance(point, centroid) for centroid in self.centroids]
        return distances.index(min(distances))

    def _euclidean_distance(self, image1, image2):
        squared_distance = sum((pixel1 - pixel2) ** 2 for pixel1, pixel2 in zip(image1, image2))
        return math.sqrt(squared_distance)


    def _calculate_centroid(self, cluster):
        dimensions = len(cluster[0])
        centroid = [sum(cluster_pixels[d]) / len(cluster) for d in range(dimensions)]
        return centroid


    def _has_converged(self, prev_centroids):
        return self.centroids == prev_centroids

    def _get_label(self, point):
        return self._closest_centroid(point)


In [38]:
data = list(zip(X_test, Y_test))
kmeans = KMeans(k=3)  # Elige el número de grupos que deseas.
kmeans.fit(data)
cluster_labels = kmeans.predict(data)

TypeError: only size-1 arrays can be converted to Python scalars

In [69]:
import random
import math
import numpy as np

class KMeans:
    def __init__(self, k, max_iterations=100):
        self.k = k
        self.max_iterations = max_iterations

    def fit(self, data):
        self.centroids = self._initialize_centroids(data)
        self.clusters = [[] for _ in range(self.k)]

        for _ in range(self.max_iterations):
            self._assign_clusters(data)
            prev_centroids = self.centroids.copy()
            self._update_centroids(data)
            if self._has_converged(prev_centroids):
                break

    def predict(self, data):
        labels = []
        for point in data:
            label = self._get_label(point)
            labels.append(label)
        return labels

    def _initialize_centroids(self, data):
        indices = random.sample(range(len(data)), self.k)
        centroids = [data[i] for i in indices]
        return centroids

    def _assign_clusters(self, data):
        self.clusters = [[] for _ in range(self.k)]
        for point in data:
            centroid_index = self._closest_centroid(point)
            self.clusters[centroid_index].append(point)

    def _update_centroids(self, data):
        for i in range(self.k):
            if self.clusters[i]:
                centroid = self._calculate_centroid(self.clusters[i])
                self.centroids[i] = centroid

    def _closest_centroid(self, point):
        distances = [self._euclidean_distance(point, centroid) for centroid in self.centroids]
        return distances.index(min(distances))

    def _euclidean_distance(self, point1, point2):
        squared_distance = np.sum((point1 - point2) ** 2)
        return math.sqrt(squared_distance)

    def _calculate_centroid(self, cluster):
        centroid = np.mean(cluster, axis=0)
        return centroid

    def _has_converged(self, prev_centroids):
        return np.array_equal(self.centroids, prev_centroids)

    def _get_label(self, point):
        return self._closest_centroid(point)


# Supongamos que tienes las imágenes en un arreglo de NumPy de forma (100, 64) y las etiquetas en un arreglo de forma (100,)
images = X_train
labels = Y_train

# Crear una instancia de KMeans y ajustar los datos
kmeans = KMeans(k=10)  # Número de grupos = 10
kmeans.fit(images)
cluster_labels = kmeans.predict(images)

confussion_matrix = np.zeros((10,10))

# Imprimir las etiquetas asignadas a cada imagen
for i in range(len(images)):
    confussion_matrix[labels[i]][cluster_labels[i]] += 1
    #print(f"Imagen {i}: Etiqueta real = {labels[i]}, Etiqueta asignada = {cluster_labels[i]}")

print((np.trace(confussion_matrix))/np.sum(confussion_matrix))


0.009678263144127649


In [68]:
print((np.trace(confussion_matrix))/np.sum(confussion_matrix))

0.1909495160868428


In [70]:
import numpy as np
from sklearn.datasets import load_digits

# Load the dataset
digits = load_digits()
X = X_train

# Initialize the centroids
k = 10
centroids = X[np.random.choice(X.shape[0], k, replace=False)]

# Assign clusters
for i in range(1000):
    distances = np.sqrt(((X - centroids[:, np.newaxis])**2).sum(axis=2))
    labels = np.argmin(distances, axis=0)

    # Update centroids
    for j in range(k):
        centroids[j] = X[labels == j].mean(axis=0)

# Predict the class of new images
new_image = X_test[0, :]
distances = np.sqrt(((new_image - centroids[:, np.newaxis])**2).sum(axis=2))
label = np.argmin(distances, axis=0)
print(label)

[7]


In [71]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.metrics import homogeneity_score

# Load the dataset
digits = load_digits()
X = digits.data
y = digits.target


# Initialize the centroids
k = 10
centroids = X[np.random.choice(X.shape[0], k, replace=False)]

# Assign clusters
for i in range(100):
    distances = np.sqrt(((X - centroids[:, np.newaxis])**2).sum(axis=2))
    labels = np.argmin(distances, axis=0)

    # Update centroids
    for j in range(k):
        centroids[j] = X[labels == j].mean(axis=0)

# Calculate homogeneity score
score = homogeneity_score(y, labels)
print("Homogeneity score:", score)


Homogeneity score: 0.7041310190116059
