In genomics, it is common to use mRNA expression data to find clusters of similarly expressed genes to help understand their function. One very robust clustering algorithm is known as the k-means clustering algorithm. This algorithm takes as input the number of clusters to find, k, and the input data. The input data can be multidimensional, but let’s first consider gene expression data in three tissues, so each gene has three coordinates describing its normalized expression level in each tissue: x, y, and z. The k-means algorithm is defined as follows:

In [6]:
from random import randrange
from math import sqrt

class KMeans:
    """One cluster centre plus the coordinates currently assigned to it."""

    def __init__(self, mean, close_coords):
        """Store the given centre and its container of assigned coordinates.

        The ``close_coords`` object is kept by reference (not copied); it
        must support ``append`` for ``appendCloseCoords`` to work.
        """
        self.mean, self.close_coords = mean, close_coords

    # --- read access -----------------------------------------------------

    def getMean(self):
        """Return the stored centre."""
        return self.mean

    def getCloseCoords(self):
        """Return the stored container of assigned coordinates."""
        return self.close_coords

    # --- mutation --------------------------------------------------------

    def setMean(self, mean):
        """Replace the stored centre with ``mean``."""
        self.mean = mean

    def setCloseCoords(self, close_coords):
        """Replace the whole container of assigned coordinates."""
        self.close_coords = close_coords

    def appendCloseCoords(self, coordinates):
        """Add one coordinate to the end of the assigned coordinates."""
        self.close_coords.append(coordinates)

    # --- display ---------------------------------------------------------

    def __str__(self):
        """Return a two-line description: the centre, then its coordinates."""
        template = 'Mean: {0}\nClosest Coordinates: {1}'
        return template.format(self.mean, self.close_coords)


In [4]:
def L2Distance(coordinates, mean):
    """Return the Euclidean (L2) distance between two points.

    Args:
        coordinates: Sequence of numbers, one entry per dimension.
        mean: Sequence of numbers with the same number of dimensions.

    Returns:
        The square root of the sum of the squared per-dimension differences.

    Raises:
        ValueError: If the two points do not have the same number of
            dimensions.
    """
    if len(coordinates) != len(mean):
        raise ValueError(
            "dimension mismatch: {0} vs {1}".format(len(coordinates), len(mean))
        )
    # The whole difference must be squared; without the parentheses only the
    # mean's component is squared, and the sum can even go negative.
    # ``sqrt`` is the name imported by the file's ``from math import sqrt``.
    return sqrt(sum((c - m) ** 2 for c, m in zip(coordinates, mean)))

In [None]:
def _nearest_mean_index(coordinates, means):
    """Return the index of the entry of ``means`` closest to ``coordinates``."""

    def squared_distance(mean):
        # Squared Euclidean distance ranks candidates in the same order as
        # the Euclidean distance itself, so the square root is not needed.
        return sum((c - m) ** 2 for c, m in zip(coordinates, mean))

    # ``min`` returns the first minimum, so ties go to the lowest index.
    return min(range(len(means)), key=lambda i: squared_distance(means[i]))


def _centroid(points):
    """Return the per-dimension average of a non-empty list of points."""
    count = len(points)
    return [sum(axis) / count for axis in zip(*points)]


def kmeans(mean_list, coordinates_list):
    """Run k-means until the means stop changing.

    Each pass assigns every coordinate to its nearest mean (Euclidean
    distance, ties to the earliest mean) and then moves each mean to the
    average of the coordinates assigned to it. The loop ends when a pass
    produces exactly the same means as the previous one.

    Args:
        mean_list: List of cluster objects exposing ``getMean``, ``setMean``,
            ``getCloseCoords``, ``setCloseCoords`` and ``appendCloseCoords``.
            The objects are updated in place.
        coordinates_list: List of points, each a sequence of numbers with the
            same number of dimensions as the means.

    Returns:
        The same ``mean_list``; on return each object holds its converged
        mean and the coordinates assigned to it in the final pass.
    """
    if not mean_list:
        # No clusters to assign to; nothing to do.
        return mean_list

    while True:
        current_means = [mean.getMean() for mean in mean_list]

        # Start every pass from empty clusters so that neither the previous
        # pass nor caller-supplied contents leak into this assignment.
        for mean in mean_list:
            mean.setCloseCoords([])

        for coordinates in coordinates_list:
            nearest = _nearest_mean_index(coordinates, current_means)
            mean_list[nearest].appendCloseCoords(coordinates)

        new_means = []
        for mean, current in zip(mean_list, current_means):
            members = mean.getCloseCoords()
            # A cluster that attracted no points keeps its previous mean
            # instead of being moved to the origin.
            new_means.append(_centroid(members) if members else current)

        # Converged: leave the final assignment on the objects and stop.
        if new_means == current_means:
            break

        for mean, new_mean in zip(mean_list, new_means):
            mean.setMean(new_mean)

    return mean_list

In [None]:
def part1():
    """Cluster a file of 3-D points with k-means for k = 2..6.

    Prompts for a file name, reads one point per line (the first three
    whitespace-separated fields are parsed as floats; blank lines are
    skipped), and for each k picks k distinct input points at random as the
    starting means and runs ``kmeans``. The converged means are printed to
    standard output, and every point is written to ``k<k>.dat`` as
    tab-separated coordinates followed by its 1-based cluster index.

    Raises:
        OSError: If the input file cannot be opened or an output file cannot
            be written.
        IndexError: If a non-blank line has fewer than three fields.
        ValueError: If one of the first three fields is not a number.
    """
    infilename = input("Enter a file name for K-Means Clustering: ")

    # Each coordinate is a list of three floats: x, y and z.
    coordinates_list = []
    with open(infilename, "r") as clusters:
        for line in clusters:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines, e.g. a trailing newline
            coordinates_list.append(
                [float(fields[0]), float(fields[1]), float(fields[2])]
            )

    num_points = len(coordinates_list)

    for k in range(2, 7):
        if k > num_points:
            # k distinct starting points cannot be drawn; the selection loop
            # below would otherwise never terminate.
            print('Not enough data points to form', k, 'clusters')
            break

        # Randomly select k distinct input points as the starting means.
        index_list = []
        mean_list = []
        for _ in range(k):
            rand = randrange(0, num_points)
            while rand in index_list:
                rand = randrange(0, num_points)
            index_list.append(rand)
            mean_list.append(KMeans(coordinates_list[rand], []))

        converged_mean_list = kmeans(mean_list, coordinates_list)

        print('K =', k)
        print('Converged Mean')

        # The context manager guarantees the output is flushed and closed.
        with open("k" + str(k) + ".dat", "w") as outfile:
            for cluster_index, mean in enumerate(converged_mean_list, start=1):
                print(mean.getMean())
                for coordinates in mean.getCloseCoords():
                    columns = [str(coordinate) for coordinate in coordinates]
                    columns.append(str(cluster_index))
                    print('\t'.join(columns), file=outfile)

            print('\n', file=outfile)

        print()