In [490]:
import random
import numpy as np

# Load the data set from plain-text files with np.loadtxt.
# X holds the examples (used below as a 2-D array, one row per example) and
# Y holds the labels that are later compared against the resulting clusters.
X = np.loadtxt("x.txt")
Y = np.loadtxt("y.txt")

In [491]:
def initialize_centroids(k, dimensions):
    """Draw k random centroids, uniform in [0, 1) along every coordinate.

    Parameters
    ----------
    k : int
        Number of centroids to create.
    dimensions : int
        Number of coordinates of each centroid.

    Returns
    -------
    numpy.ndarray
        Float array of shape (k, dimensions). The result is two-dimensional
        for every k, including k == 0.

    Notes
    -----
    The values come from NumPy's global random state, so call np.random.seed
    beforehand when reproducible centroids are needed. The sampling range is
    fixed to [0, 1); it is not scaled to the range of any data set.
    """
    # A single vectorized draw fills the array row by row, replacing the
    # nested Python loops that sampled one coordinate at a time.
    return np.random.random_sample((k, dimensions))

In [492]:
def get_classes(examples, centroids):
    """Assign every example to its nearest centroid (Euclidean distance).

    Parameters
    ----------
    examples : array-like, shape (n_examples, n_features)
        The points to classify.
    centroids : array-like, shape (n_centroids, n_features)
        The current cluster centers.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (n_examples,); entry e is the index of the
        centroid closest to examples[e]. On a tie the lowest index wins
        (np.argmin semantics).

    Raises
    ------
    ValueError
        If `centroids` is empty (np.argmin has nothing to choose from).
    """
    examples = np.asarray(examples, dtype=float)
    centroids = np.asarray(centroids, dtype=float)

    # Distances are computed from the `examples` argument rather than from a
    # notebook-level global, so the function works for any data set passed in.
    distances = np.empty((examples.shape[0], centroids.shape[0]))
    for c, centroid in enumerate(centroids):
        # Broadcasting handles all examples at once for this centroid.
        distances[:, c] = np.sqrt(np.sum(np.power(examples - centroid, 2), axis=1))

    # np.argmin returns platform-sized integers, so any number of centroids is
    # representable (an int8 label array would overflow past 127).
    return np.argmin(distances, axis=1)

In [493]:
def get_means(X, k, classes):
    """Compute the centroid (per-feature mean) of each of the k clusters.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_examples, n_features)
        The examples.
    k : int
        Number of clusters; labels 0 .. k-1 are evaluated.
    classes : numpy.ndarray
        Cluster label of every row of X.

    Returns
    -------
    numpy.ndarray
        Float array of shape (k, n_features); row i is the mean of the rows
        of X labelled i. A label without members gives a row of NaN, since
        NumPy's mean of an empty selection is NaN.
    """
    n_features = X.shape[1]
    means = np.zeros((k, n_features))

    for label in range(k):
        # Rows of X that currently belong to this cluster
        members = X[np.nonzero(classes == label)]
        means[label] = members.mean(axis=0)

    return means

In [494]:
# Parameters for the initialization of the algorithm
k = 2
dimensions = X.shape[1]

# Keep drawing random centroids until each of the k clusters receives at
# least one example. The exit test runs after a draw has been made, so
# `centroids`, `classes`, `unique` and `counts` are always defined afterwards
# (also for k == 1). Note that the number of attempts is not bounded.
while True:

    # Randomly initializing the centroids
    centroids = initialize_centroids(k, dimensions)

    # First assignment of every example to its nearest centroid
    classes = get_classes(X, centroids)

    # np.unique only reports labels that occur, so len(counts) is the number
    # of non-empty clusters
    unique, counts = np.unique(classes, return_counts = True)

    if len(counts) == k:
        break

# Show how many elements each class has
print("Cantidad de elementos por clase")
print(dict(zip(unique, counts)))

# Centroid of each cluster = mean of the examples assigned to it
means = get_means(X, k, classes)

Cantidad de elementos por clase
{0: 1130, 1: 194}


In [495]:
# Number of assignment/update rounds to run
iterations = 2

# Work on a copy so that `means` from the initialization stays untouched
new_centroids = means.copy()

for i in range(iterations):
    # Assignment step: label every example with its nearest current centroid
    classes = get_classes(X, new_centroids)

    # Update step: move each centroid to the mean of the examples assigned to it
    new_centroids = get_means(X, k, classes)

# Cluster sizes according to the last assignment step
unique, counts = np.unique(classes, return_counts=True)

print("Cantidad de elementos por clase")
print({label: size for label, size in zip(unique, counts)})

Cantidad de elementos por clase
{0: 1193, 1: 131}


In [496]:
# Row indices of the examples that ended up in cluster 0 and in cluster 1
cluster_zero = np.nonzero(classes == 0)
cluster_one = np.nonzero(classes == 1)

# Labels (taken from Y) of the members of each cluster
zero, one = Y[cluster_zero], Y[cluster_one]

# Distinct labels present in each cluster and how often each one occurs
unique_zero, counts_zero = np.unique(zero, return_counts=True)
unique_one, counts_one = np.unique(one, return_counts=True)

In [497]:
# np.unique only lists the labels that actually occur in a cluster, so the
# first entry of counts_zero / counts_one is not always the count of the same
# label (a cluster holding a single label would be reported in the wrong
# column). Each label is therefore counted explicitly, using the label values
# of the whole data set as the reference.
# NOTE(review): assumes the smaller label value in Y means ham and the larger
# one spam (the positional convention this report has always used) - confirm.
label_values = np.unique(Y)
ham_label = label_values[0] if len(label_values) > 0 else None
spam_label = label_values[1] if len(label_values) > 1 else None


def count_label(cluster_labels, label):
    """Return how many entries of cluster_labels equal label (0 if label is None)."""
    if label is None:
        return 0
    return int(np.count_nonzero(cluster_labels == label))


# "cluster one" reports the examples labelled 0 by k-means (array `zero`)
print("For cluster one")
counts_zeroa = count_label(zero, ham_label)
counts_zerob = count_label(zero, spam_label)
print("Ham {0} Spam {1}".format(counts_zeroa, counts_zerob))

# "cluster two" reports the examples labelled 1 by k-means (array `one`)
print("For cluster two")
counts_onea = count_label(one, ham_label)
counts_oneb = count_label(one, spam_label)
print("Ham {0} Spam {1}".format(counts_onea, counts_oneb))

For cluster one
Ham 871 Spam 322
For cluster two
Ham 131 Spam 0
