In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from numpy.random import uniform
from sklearn.datasets import make_blobs
from sklearn.datasets import make_moons as moon
import seaborn as sns
import random
import json

## Data generating functions

In [None]:
# Generating data with one true cluster
def generate_data_uniform(n):
    """Draw ``n`` points whose last coordinate is the sum of the others.

    The first ``d - 1`` coordinates are i.i.d. Uniform(-1, 1), where ``d`` is
    read from the module-level variable of that name.

    Returns:
        Array of shape ``(n, d)``.
    """
    features = np.random.uniform(low=-1, high=1, size=[n, d-1])
    target = features.sum(axis=1, keepdims=True)
    return np.hstack((features, target))
def generate_data_uniform_sq(n):
    """Draw ``n`` points whose last coordinate is the sum of squares of the others.

    The first ``d - 1`` coordinates are i.i.d. Uniform(-1, 1), where ``d`` is
    read from the module-level variable of that name.

    Returns:
        Array of shape ``(n, d)``.
    """
    features = np.random.uniform(low=-1, high=1, size=[n, d-1])
    target = np.square(features).sum(axis=1)
    return np.concatenate((features, target[:, None]), axis=1)
def generate_data_uniform_cube(n):
    """Draw ``n`` points whose last coordinate is the sum of cubes of the others.

    The first ``d - 1`` coordinates are i.i.d. Uniform(-1, 1), where ``d`` is
    read from the module-level variable of that name.

    Returns:
        Array of shape ``(n, d)``.
    """
    features = np.random.uniform(low=-1, high=1, size=[n, d-1])
    target = np.power(features, 3).sum(axis=1, keepdims=True)
    return np.hstack((features, target))
def generate_data_uniform_prod(n):
    """Draw ``n`` points whose last coordinate is the product of the others.

    The first ``d - 1`` coordinates are i.i.d. Uniform(-1, 1), where ``d`` is
    read from the module-level variable of that name.

    Returns:
        Array of shape ``(n, d)``.
    """
    features = np.random.uniform(low=-1, high=1, size=[n, d-1])
    target = features.prod(axis=1, keepdims=True)
    return np.hstack((features, target))


def generate_data_normal(n):
    """Draw ``n`` points whose last coordinate is the sum of the others.

    The first ``d - 1`` coordinates are i.i.d. standard normal, where ``d`` is
    read from the module-level variable of that name.

    Returns:
        Array of shape ``(n, d)``.
    """
    features = np.random.normal(loc=0.0, scale=1.0, size=[n, d-1])
    target = features.sum(axis=1)
    return np.concatenate((features, target.reshape(-1, 1)), axis=1)
def generate_data_normal_sq(n):
    """Draw ``n`` points whose last coordinate is the sum of squares of the others.

    The first ``d - 1`` coordinates are i.i.d. standard normal, where ``d`` is
    read from the module-level variable of that name.

    Returns:
        Array of shape ``(n, d)``.
    """
    features = np.random.normal(loc=0.0, scale=1.0, size=[n, d-1])
    target = np.square(features).sum(axis=1, keepdims=True)
    return np.hstack((features, target))
def generate_data_normal_cube(n):
    """Draw ``n`` points whose last coordinate is the sum of cubes of the others.

    The first ``d - 1`` coordinates are i.i.d. standard normal, where ``d`` is
    read from the module-level variable of that name.

    Returns:
        Array of shape ``(n, d)``.
    """
    features = np.random.normal(loc=0.0, scale=1.0, size=[n, d-1])
    target = np.power(features, 3).sum(axis=1)
    return np.concatenate((features, target[:, None]), axis=1)
def generate_data_normal_prod(n):
    """Draw ``n`` points whose last coordinate is the product of the others.

    The first ``d - 1`` coordinates are i.i.d. standard normal, where ``d`` is
    read from the module-level variable of that name.

    Returns:
        Array of shape ``(n, d)``.
    """
    features = np.random.normal(loc=0.0, scale=1.0, size=[n, d-1])
    target = features.prod(axis=1, keepdims=True)
    return np.hstack((features, target))


def generate_data_gama(n):
    """Draw ``n`` points whose last coordinate is the sum of the others.

    The first ``d - 1`` coordinates are i.i.d. Gamma(shape=1, scale=1), where
    ``d`` is read from the module-level variable of that name.

    Returns:
        Array of shape ``(n, d)``.
    """
    features = np.random.gamma(1,1, size=(n,d-1))
    target = features.sum(axis=1, keepdims=True)
    return np.hstack((features, target))
def generate_data_gama_sq(n):
    """Draw ``n`` points whose last coordinate is the sum of squares of the others.

    The first ``d - 1`` coordinates are i.i.d. Gamma(shape=1, scale=1), where
    ``d`` is read from the module-level variable of that name.

    Returns:
        Array of shape ``(n, d)``.
    """
    features = np.random.gamma(1,1, size=(n,d-1))
    target = np.square(features).sum(axis=1)
    return np.concatenate((features, target.reshape(-1, 1)), axis=1)
def generate_data_gama_cube(n):
    """Draw ``n`` points whose last coordinate is the sum of cubes of the others.

    The first ``d - 1`` coordinates are i.i.d. Gamma(shape=1, scale=1), where
    ``d`` is read from the module-level variable of that name.

    Returns:
        Array of shape ``(n, d)``.
    """
    features = np.random.gamma(1,1, size=(n,d-1))
    target = np.power(features, 3).sum(axis=1, keepdims=True)
    return np.hstack((features, target))
def generate_data_gama_prod(n):
    """Draw ``n`` points whose last coordinate is the product of the others.

    The first ``d - 1`` coordinates are i.i.d. Gamma(shape=1, scale=1), where
    ``d`` is read from the module-level variable of that name.

    Returns:
        Array of shape ``(n, d)``.
    """
    features = np.random.gamma(1,1, size=(n,d-1))
    target = features.prod(axis=1)
    return np.concatenate((features, target[:, None]), axis=1)
# def generate_data_example(n):
#     data = []
#     for _ in range(n):
#         xs = []
#         for dimension in range(d - 1):
#             x = np.random.uniform(low=-1, high=1)
#             xs.append(x)
#         y = sum(xs)
#         datum = xs + [y]
#         data.append(datum)
#     return data



# Generating data with two true clusters

def generate_data_unif_2clusters(n):
    """Generate ``n`` points forming two uniform clusters.

    Each of the first ``d - 1`` coordinates (``d`` is the module-level
    variable) is Uniform(0, 5) for the first ``n // 2`` rows and
    Uniform(6, 12) for the remaining rows. The last coordinate is the sum of
    squares of the others.

    Returns:
        Array of shape ``(n, d)``.
    """
    mu = 0
    increment = 6
    # Integer split so the two cluster sizes always add up to n. The previous
    # int(round(n/2, 1)) arithmetic floored both halves and lost a row for odd n.
    n_first = n // 2
    n_second = n - n_first

    columns = []
    for _ in range(d-1):
        first = np.random.uniform(low=mu, high=mu+5, size=n_first)
        second = np.random.uniform(low=mu+increment, high=mu+2*increment, size=n_second)
        columns.append(np.concatenate((first, second)))

    # reshape keeps a well-formed (n, 0) feature block even when d == 1
    features = np.reshape(columns, (d-1, n)).T
    target = np.square(features).sum(axis=1)
    return np.column_stack((features, target))

def generate_data_normal_2clusters(n):
    """Generate ``n`` points forming two Gaussian clusters.

    Each of the first ``d - 1`` coordinates (``d`` is the module-level
    variable) is Normal(0, 1) for the first ``n // 2`` rows and Normal(6, 1)
    for the remaining rows. The last coordinate is the sum of squares of the
    others.

    Returns:
        Array of shape ``(n, d)``.
    """
    mu = 0
    increment = 6
    # Integer split so the two cluster sizes always add up to n. The previous
    # int(round(n/2, 1)) arithmetic floored both halves and lost a row for odd n.
    n_first = n // 2
    n_second = n - n_first

    columns = []
    for _ in range(d-1):
        first = np.random.normal(loc=mu, scale=1.0, size=n_first)
        second = np.random.normal(loc=mu+increment, scale=1.0, size=n_second)
        columns.append(np.concatenate((first, second)))

    # reshape keeps a well-formed (n, 0) feature block even when d == 1
    features = np.reshape(columns, (d-1, n)).T
    target = np.square(features).sum(axis=1)
    return np.column_stack((features, target))

def generate_data_conic(n):
    """Generate ``n`` points in two mirrored gamma-distributed groups.

    The first group is the negation of Gamma(1, 1) samples in ``d``
    dimensions (``d`` is the module-level variable). The second group is the
    first one negated again with its columns in reverse order. The two groups
    are stacked and truncated to ``n`` rows.

    Returns:
        Array of shape ``(n, d)``.
    """
    group_size = round(n/2)+1
    negative_group = -np.random.gamma(1,1,(group_size,d))
    positive_group = -negative_group[:, ::-1]
    stacked = np.vstack((negative_group, positive_group))
    return stacked[:n]

def generate_data_rectangle(n):
    """Generate ``n`` points in two gamma-distributed groups.

    The first group is the negation of Gamma(1, 1) samples in ``d``
    dimensions (``d`` is the module-level variable). The second group is
    ``-5`` minus the first group with its columns in reverse order. The two
    groups are stacked and truncated to ``n`` rows.

    Returns:
        Array of shape ``(n, d)``.
    """
    group_size = round(n/2)+1
    first_group = -np.random.gamma(1,1,(group_size,d))
    second_group = -5-first_group[:, ::-1]
    stacked = np.vstack((first_group, second_group))
    return stacked[:n]


def generate_data_unif_normal_clusters(n):
    """Generate ``n`` points: one uniform cluster and one Gaussian cluster.

    Each of the ``d`` coordinates (``d`` is the module-level variable) is
    Uniform(0, 5) for the first ``n // 2`` rows and Normal(6, 1) for the
    remaining rows.

    Returns:
        Array of shape ``(n, d)``.
    """
    mu = 0
    increment = 6
    # Integer split so the two cluster sizes always add up to n. The previous
    # int(round(n/2, 1)) arithmetic floored both halves and lost a row for odd n.
    n_first = n // 2
    n_second = n - n_first

    columns = []
    for _ in range(d):
        uniform_part = np.random.uniform(low=mu, high=mu+5, size=n_first)
        normal_part = np.random.normal(loc=mu+increment, scale=1.0, size=n_second)
        columns.append(np.concatenate((uniform_part, normal_part)))
    return np.transpose(columns)

# Generating data with 3 true clusters

def generate_data_unif_3clusters(n):
    """Generate ``n`` points forming three uniform clusters.

    Every one of the ``d`` coordinates (``d`` is the module-level variable)
    is built from three consecutive blocks drawn from Uniform(0, 6),
    Uniform(6, 12) and Uniform(12, 18). The last block is over-sampled and
    the column is then cut to ``n`` entries.

    Returns:
        Array of shape ``(n, d)``.
    """
    mu = 0
    increment = 6
    small_block = int(round(n/3,1))
    large_block = int(n-round(n/3,1))+3
    block_sizes = (small_block, small_block, large_block)

    def draw_column():
        # Blocks are sampled in order, each shifted by one more `increment`.
        blocks = [
            np.random.uniform(low=mu+step*increment, high=mu+(step+1)*increment, size = size)
            for step, size in enumerate(block_sizes)
        ]
        return np.concatenate(blocks)[:n]

    return np.transpose([draw_column() for _ in range(d)])

def generate_data_normal_3clusters(n):
    """Generate ``n`` points forming three Gaussian clusters.

    Every one of the ``d`` coordinates (``d`` is the module-level variable)
    is built from three consecutive blocks drawn from Normal(0, 1),
    Normal(6, 1) and Normal(12, 1). The last block is over-sampled and the
    column is then cut to ``n`` entries.

    Returns:
        Array of shape ``(n, d)``.
    """
    mu = 0
    increment = 6
    small_block = int(round(n/3,1))
    large_block = int(n-round(n/3,1))+3
    block_sizes = (small_block, small_block, large_block)

    def draw_column():
        # Blocks are sampled in order, each centred one more `increment` away.
        blocks = [
            np.random.normal(loc=mu+step*increment, scale = 1, size = size)
            for step, size in enumerate(block_sizes)
        ]
        return np.concatenate(blocks)[:n]

    return np.transpose([draw_column() for _ in range(d)])

def generate_data_rectangle_norm(n):
    """Generate ``n`` points in three groups: gamma, Gaussian, shifted gamma.

    Group one is the negation of Gamma(1, 1) samples, group two is
    Normal(-5, 1), and group three is ``-10`` minus group one with its
    columns reversed. All groups have ``d`` columns (``d`` is the
    module-level variable); they are stacked and truncated to ``n`` rows.

    Returns:
        Array of shape ``(n, d)``.
    """
    group_size = round(n/3)+1
    gamma_group = -np.random.gamma(1,1, size = (group_size,d))
    normal_group = np.random.normal(loc = -5, scale = 1 , size = (group_size,d))
    shifted_group = -10-gamma_group[:, ::-1]
    stacked = np.vstack((gamma_group, normal_group, shifted_group))
    return stacked[:n]

def generate_data_conic_norm(n):
    """Generate ``n`` points in three groups: gamma, Gaussian, shifted gamma.

    Group one is the negation of Gamma(1, 1) samples, group two is
    Normal(2.5, 1), and group three is ``6`` minus group one with its columns
    reversed. All groups have ``d`` columns (``d`` is the module-level
    variable); they are stacked and truncated to ``n`` rows.

    Returns:
        Array of shape ``(n, d)``.
    """
    group_size = round(n/3)+1
    gamma_group = -np.random.gamma(1,1, size =(group_size,d))
    normal_group = np.random.normal(loc = 2.5, scale = 1 , size = (group_size,d))
    shifted_group = 6-gamma_group[:, ::-1]
    stacked = np.vstack((gamma_group, normal_group, shifted_group))
    return stacked[:n]

# Special data

def generate_data_makeblobs(n, d = 2, n_clusters = 3):
    '''
    Generates clustered data with sklearn's make_blobs.

    n          -- number of samples
    d          -- number of features (2 by default)
    n_clusters -- number of blob centres
    The random state is fixed to 42; only the samples are returned, the
    blob labels are discarded.
    '''
    samples = make_blobs(n_samples=n, centers = n_clusters, n_features = d, random_state=42)[0]
    return samples
def generate_data_moon(n, d = 2):
    '''
    Generates 2d moon-shaped data with noise 0.092.

    Only d == 2 is supported; for any other d the function returns None.
    '''
    if d != 2:
        return None
    points, _ = moon(n, noise=0.092)
    return points


# Data generating methods
# Registry mapping a distribution key to the function that samples from it.
# Insertion order matters: `data_keys` below lists the keys in this order.
generate_data_methods = dict(
    # Data with one true cluster
    uniform=generate_data_uniform,
    uniform_sq=generate_data_uniform_sq,
    uniform_cube=generate_data_uniform_cube,
    uniform_prod=generate_data_uniform_prod,
    normal=generate_data_normal,
    normal_sq=generate_data_normal_sq,
    normal_cube=generate_data_normal_cube,
    normal_prod=generate_data_normal_prod,
    gamma=generate_data_gama,
    gamma_sq=generate_data_gama_sq,
    gamma_cube=generate_data_gama_cube,
    gamma_prod=generate_data_gama_prod,
    # Data with two true clusters
    uniform_2clust=generate_data_unif_2clusters,
    normal_2clust=generate_data_normal_2clusters,
    conic=generate_data_conic,
    rectangle=generate_data_rectangle,
    unif_normal=generate_data_unif_normal_clusters,
    # Data with three true clusters
    unif_3clust=generate_data_unif_3clusters,
    normal_3clust=generate_data_normal_3clusters,
    conic_norm=generate_data_conic_norm,
    rectangle_norm=generate_data_rectangle_norm,
    # Special Data
    makeblobs=generate_data_makeblobs,
    moon=generate_data_moon,
)
data_keys = [*generate_data_methods]

## Online Balanced kmeans implementation

In [None]:
class OnlineBalancedKmeans:
    """Online k-means with a penalty that balances cluster sizes.

    Points are assigned one at a time to the cluster minimising
    ``distance - weight``, where the weight favours clusters holding fewer
    points than average. After training, the ``infer_*`` methods estimate the
    last coordinate of a point from its first ``d - 1`` coordinates.

    Attributes:
        k: number of clusters.
        d: dimensionality of the data (the last coordinate is the inferred one).
        cluster_means: ``(k, d)`` array of current cluster centres.
        cluster_counts: length-``k`` array of points per cluster (starts at 1).
        alpha: learning rate applied to the mean update.
        beta: strength of the balancing penalty.
        cluster_indices: cluster index chosen for every point seen so far.
        infer: callable ``point -> estimate`` selected by ``inference_method``,
            or ``None`` when the method name is not recognised.
    """

    # Number of nearest cluster means combined by the cluster-size rules.
    N_NEAREST = 5

    def __init__(self, k, d, alpha, beta, inference_method='merge_2', data_distribution = 'uniform'):
        """
        Args:
            k: number of clusters.
            d: number of features.
            alpha: learning rate for the cluster means.
            beta: balancing strength used by ``compute_weights``.
            inference_method: either a method name or a dict with a
                ``'method'`` key plus keyword arguments for that method.
                The dict is copied and never modified.
            data_distribution: key into ``generate_data_methods``; ``k``
                samples from that generator become the initial means.
        Raises:
            KeyError: if a dict is passed without a ``'method'`` key.
        """
        self.k = k
        self.d = d
        # Initialising clusters based on the data type.
        # NOTE(review): most generators take their dimensionality from the
        # module-level `d`, which must agree with the `d` given here.
        self.cluster_means = generate_data_methods[data_distribution](self.k)

        self.cluster_counts = np.ones(k)
        self.alpha = alpha
        self.beta = beta
        self.cluster_indices = []

        # A bare name (such as the default) means "use the method's defaults".
        if isinstance(inference_method, str):
            inference_method = {'method': inference_method}
        # Work on a copy so the caller's dict (possibly shared or a default
        # argument) keeps its 'method' entry.
        params = dict(inference_method)
        method = params.pop('method')

        handlers = {
            'euclidean_distance': self.infer_euclid_dist,
            'norm_weights': self.infer_norm_weights,
            'cluster_size': self.infer_cluster_size,
            'weights': self.infer_weights,
            'merge': self.infer_merge_norm_cl_size,
            'merge_2': self.infer_merge_norm_ecl_dis,
            'cls_sz_exp': self.infer_cluster_size_exp,
        }
        handler = handlers.get(method)
        # Unknown method names leave `infer` unset, as before.
        self.infer = None
        if handler is not None:
            self.infer = lambda point: handler(point, **params)

    def compute_weights(self):
        '''
        return weights that penalize clusters with
        many points during assignments
        '''
        counts = self.cluster_counts
        # Standardised deviation from the mean count; the small constant
        # avoids division by zero when all counts are equal.
        return self.beta * (np.mean(counts) - counts) / (np.std(counts) + 1e-9)

    def _assignment_distances(self, point):
        """Sum of absolute coordinate differences between `point` and every mean."""
        return np.sum(np.abs(point - self.cluster_means), axis = 1)

    def _known_distances(self, point):
        """Euclidean distance to every mean over the first d-1 (known) coordinates."""
        n_known = self.d - 1
        point = np.asarray(point)
        diffs = point[:n_known] - self.cluster_means[:, :n_known]
        return np.sqrt(np.sum(diffs**2, axis = 1))

    def _nearest_clusters(self, point):
        """Indices of the (at most N_NEAREST) clusters closest to `point`, nearest first."""
        distances = self._known_distances(point)
        n_nearest = min(self.N_NEAREST, len(distances))
        return np.argsort(distances, kind='stable')[:n_nearest]

    def assign_and_update(self, point):
        """
        Does an online k-means update on a single data point.
        Args:
            point - a 1 x d array
        Returns:
            An integer in [0, k-1] indicating the assigned cluster.
        Updates cluster_means and cluster_counts in place.
        For initialization, random cluster means are needed.
        """
        cluster_distances = self._assignment_distances(point)
        cluster_weights = self.compute_weights()
        cluster_index = np.argmin(cluster_distances - cluster_weights)
        self.cluster_counts[cluster_index] += 1
        # Exponential moving average of the points assigned to this cluster.
        self.cluster_means[cluster_index] = self.alpha * np.array(point) + (1 - self.alpha) * self.cluster_means[cluster_index]
        self.cluster_indices.append(cluster_index)
        return cluster_index

    def final_clusters(self, data):
        """Label every point in `data` with the current means and weights.

        Returns:
            A list with one cluster index per point. The model is not updated.
        """
        cluster_weights = self.compute_weights()
        labels = []
        for x in data:
            dists = self._assignment_distances(x)
            labels.append(np.argmin(dists - cluster_weights))
        return labels

    def view_final_clusters(self, data):# for 2d
        """Train on `data`, then scatter-plot the final labels and cluster centres.

        Each centre is annotated with the number of points labelled with it.
        """
        for data_point in data:
            self.assign_and_update(data_point)
        labels = self.final_clusters(data)
        sns.scatterplot(x=[X[0] for X in data],
                      y=[X[1] for X in data],
                      hue=labels,
                      palette="deep",
                      legend=None
                      )
        class_centers = self.cluster_means
        # bincount keeps one entry per cluster (0 for empty ones), so counts
        # stay aligned with the centres they annotate.
        class_counts = np.bincount(np.asarray(labels, dtype=int), minlength=len(class_centers))
        X = [x for x, _ in class_centers]
        Y = [y for _, y in class_centers]
        plt.plot(X, Y, 'r+', markersize=10, )
        for (xi, yi, counts) in zip(X, Y, class_counts):
            plt.text(xi, yi, counts, va='bottom', ha='center')
        plt.title('Final iteration')
        plt.show()

    def infer_old(self, point):
        # Warning: Only works for d=2. TODO: Generalize to arbitrary dimensions.
        # Find closest mean in x coordinate.
        distances = (point[0] - self.cluster_means[:, 0])**2
        cluster_index = np.argmin(distances)
        # Return y coordinate of closest mean.
        return self.cluster_means[cluster_index, 1]

    def infer_euclid_dist(self, point):
        """
        point = (d,) array ie (1 by d)

        Returns the last coordinate of the mean closest to `point` in the
        first d-1 coordinates.
        """
        cluster_index = np.argmin(self._known_distances(point))
        return self.cluster_means[cluster_index, self.d-1]

    def infer_norm_weights(self, point, beta = 7):
        """Softmax-weighted average of the means' last coordinate.

        Weights are proportional to exp(-beta * distance) over the known
        coordinates.
        """
        distances = self._known_distances(point)
        # Shifting by the minimum leaves the normalised weights unchanged but
        # prevents every weight underflowing to 0 (0/0) for distant points.
        weights = np.exp(-beta * (distances - np.min(distances)))
        normalized_weights = weights / np.sum(weights)
        return np.sum(normalized_weights * self.cluster_means[:, self.d-1])

    def infer_cluster_size(self, point):
        """Count-weighted average of the last coordinate of the nearest means."""
        nearest = self._nearest_clusters(point)
        counts = self.cluster_counts[nearest]
        total_counts = np.sum(counts)
        return (counts / total_counts) @ self.cluster_means[nearest, self.d-1]

    def infer_weights(self, point, alpha=0.4, beta=7):
        """Blend of the global count-weighted mean and the softmax estimate.

        `alpha` is the share given to the overall mean of the last coordinate.
        """
        overall_mean = self.cluster_counts @ self.cluster_means[:, self.d-1] / np.sum(self.cluster_counts)
        return alpha*overall_mean + (1.0-alpha)*self.infer_norm_weights(point, beta)

    def infer_merge_norm_cl_size(self, point, alpha=0.8, beta = 7):
        """Blend of `infer_norm_weights` (share alpha) and `infer_cluster_size`."""
        infered_val = alpha * self.infer_norm_weights(point, beta) + (1 - alpha) * self.infer_cluster_size(point)
        return infered_val

    def infer_merge_norm_ecl_dis(self, point, alpha=0.9, beta = 7):
        """Blend of `infer_norm_weights` (share alpha) and `infer_euclid_dist`."""
        infered_val = alpha * self.infer_norm_weights(point, beta) + (1 - alpha) * self.infer_euclid_dist(point)
        return infered_val

    def infer_cluster_size_exp(self, point, beta=8):
        """Combine the nearest means' last coordinate with exp(-beta * count share) weights.

        NOTE(review): these weights are not normalised to sum to one; confirm
        that is intended.
        """
        nearest = self._nearest_clusters(point)
        counts = self.cluster_counts[nearest]
        total_counts = np.sum(counts)
        return np.exp(-beta * counts / total_counts) @ self.cluster_means[nearest, self.d-1]

# Optimised Inference Methods
# Inference method specifications: a method name plus its tuned keyword arguments.
inference_methods = [
    {'method': 'euclidean_distance'},
    {'method': 'norm_weights', 'beta': 7 },
    {'method': 'cluster_size'},
    {'method': 'weights', 'alpha': 0.4, 'beta':7 },
    {'method': 'merge', 'alpha': 0.8, 'beta': 7},
    {'method': 'merge_2', 'alpha': 0.9, 'beta': 7},
    {'method': 'cls_sz_exp', 'beta': 8},
]

# Method names in the same order as `inference_methods`.
inf_meths = [spec['method'] for spec in inference_methods]

# Pass a copy so the shared spec in `inference_methods` can never be altered
# by the constructor.
# NOTE: the data generators read the module-level `d`, so it must be defined
# before this cell runs.
kmeans = OnlineBalancedKmeans(k = 4, d = 2, alpha = 0.5, beta = 0.5, inference_method=inference_methods[1].copy())

# Bound inference methods, in the same order as `inference_methods`.
inference_functions = [
                kmeans.infer_euclid_dist,
                kmeans.infer_norm_weights,
                kmeans.infer_cluster_size,
                kmeans.infer_weights,
                kmeans.infer_merge_norm_cl_size,
                kmeans.infer_merge_norm_ecl_dis,
                kmeans.infer_cluster_size_exp
            ]
# sns.set_theme(style="darkgrid")

KeyError: 'method'

## Plot Functions

### Plotting the errors for generated data

In [None]:
def plot_errors(
    error_evol_data, ## dim = (len(inf_meths) , any natural num.)
    data_key = '',
    fname='.png',
    title = 'Errors for each inference methods',
    x = list(range(0,10001, 1000)),
    save = False
):
    """Plot one error curve per inference method listed in `inf_meths`.

    Row i of `error_evol_data` is drawn against `x` only when its first
    value does not exceed an internal threshold of 2e20. When `save` is
    True the figure is written to '<data_key><fname>'.
    """
    fig, ax1 = plt.subplots( 1,1, figsize = (10, 7) )
    thres = 2*10**20
    shown_methods = []
    for row, method_name in enumerate(inf_meths):
        # Written as "not <=" so NaN first values are skipped too.
        if not (error_evol_data[row,0] <= thres):
            continue
        shown_methods.append(method_name)
        ax1.plot(x, error_evol_data[row])
        ax1.set_xlabel('Number of datapoint assigned')
        ax1.set_ylabel('Inference errors')
        ax1.legend(shown_methods)
    plt.suptitle('{}{}'.format(title, data_key))

    if save == True:
        plt.savefig('{}{}'.format(data_key, fname))


# Plotting the errors of all the data separately
# for data, data_key in zip(data_iter, data_dist_keys):
#     plot_errors(data = data.T, title='Errors for each inference methods:', data_key = data_key)

### Plots of clusters after every 'n_assigned' data points

In [None]:

def plot_obk(
    data_key, k=20, d=2, alpha=0.3, beta=0.07, n_train=100,
    inference_method={'method': 'weights', 'alpha': 0.0},
    n_assigned = 200,
    save = False
):
    """Train online balanced k-means on generated 2-d data and plot its progress.

    A snapshot is drawn every `n_assigned` points and after the last point,
    followed by a final plot in which each centre is annotated with its
    cluster size.

    Args:
        data_key: key into `generate_data_methods`.
        k, d, alpha, beta: forwarded to `OnlineBalancedKmeans`.
        n_train: number of points to generate.
        inference_method: inference spec for the model (copied, never modified).
        n_assigned: number of points between two snapshots.
        save: when True, save the final plot as 'Final iteration_<data_key>.png'.
    """
    X_train = generate_data_methods[data_key](n_train)
    # Copy the spec so the shared default dict is never altered, and start
    # the means from the same distribution as the data (as the other
    # plotting and evaluation code in this notebook does).
    kmeans = OnlineBalancedKmeans(k, d, alpha, beta, inference_method=dict(inference_method), data_distribution=data_key)
    n_points = len(X_train)
    for index, data_point in enumerate(X_train):
        kmeans.assign_and_update(data_point)
        if index % n_assigned == 0 or index+1 == n_points:
            # Include the point that has just been assigned.
            n_seen = index + 1
            class_centers = kmeans.cluster_means
            sns.scatterplot(x=[X[0] for X in X_train[:n_seen,:]],
                            y=[X[1] for X in X_train[:n_seen,:]],
                            hue=kmeans.cluster_indices[:n_seen],
                            palette="deep",
                            legend=None
                            )
            plt.plot([x for x, _ in class_centers],
                    [y for _, y in class_centers],
                    'r+',
                    markersize=10,
                    )
            plt.title('iteration {}'.format(index))
            plt.show()

    labels = kmeans.cluster_indices
    sns.scatterplot(x=[X[0] for X in X_train],
                    y=[X[1] for X in X_train],
                    hue=labels,
                    palette="deep",
                    legend=None
                    )
    class_centers = kmeans.cluster_means
    # One count per cluster (0 for empty ones) so counts line up with centres.
    class_counts = np.bincount(np.asarray(labels, dtype=int), minlength=len(class_centers))
    X = [x for x, _ in class_centers]
    Y = [y for _, y in class_centers]
    plt.plot(X, Y, 'r+', markersize=10, )
    for (xi, yi, counts) in zip(X, Y, class_counts):
        plt.text(xi, yi, counts, va='bottom', ha='center')
    plt.title('Final iteration of {} data'.format(data_key))
    if save == True: plt.savefig('Final iteration_{}.png'.format(data_key))
    plt.show()

# plot_obk(data_key='uniform')

### Plotting the final iteration

In [None]:
def plot_final_assignment(
    data_key,
    inference_method={'method': 'weights', 'alpha': 0.0},
    k = 20,
    d = 2,
    alpha = 0.3,
    beta = 0.01,
    n_train = 100,
    save = True,
    fname = 'normal_final_ite.png'
):
    """Train on generated 2-d data and plot the final cluster assignment.

    Each cluster centre is annotated with the number of points labelled
    with it.

    Args:
        data_key: key into `generate_data_methods`; also used to initialise
            the cluster means.
        inference_method: inference spec for the model (a copy is passed on).
        k, d, alpha, beta: forwarded to `OnlineBalancedKmeans`.
        n_train: number of points to generate.
        save: whether to save the figure (True keeps the earlier behaviour).
        fname: file name used when saving.
    """
    # Create k-means clustering object
    kmeans = OnlineBalancedKmeans(k, d, alpha, beta, inference_method=inference_method.copy(), data_distribution=data_key)
    # Create a dataset of 2D distributions
    X_train = generate_data_methods[data_key](n_train)

    for data_point in X_train:
        kmeans.assign_and_update(data_point)
    labels = kmeans.final_clusters(X_train)
    sns.scatterplot(x=[X[0] for X in X_train],
                    y=[X[1] for X in X_train],
                    hue=labels,
                    palette="deep",
                    legend=None
                    )
    class_centers = kmeans.cluster_means
    # One count per cluster (0 for empty ones) so counts line up with centres.
    class_counts = np.bincount(np.asarray(labels, dtype=int), minlength=len(class_centers))
    X = [x for x, _ in class_centers]
    Y = [y for _, y in class_centers]
    plt.plot(X, Y, 'r+', markersize=10, )
    for (xi, yi, counts) in zip(X, Y, class_counts):
        plt.text(xi, yi, counts, va='bottom', ha='center')
    plt.title('Final iteration after clustering')
    if save:
        plt.savefig(fname)
    plt.show()

# Examples
#plot_final_assignment(data_key = 'gamma')

### A 3d view of the online balanced k-means

In [None]:
# Visualise the obk in 3d
def plot_obk_3d(k=20, data_dist_keys=None):
    """Train on 3-d generated data and show the final clusters in a 3-d scatter.

    Args:
        k: number of clusters.
        data_dist_keys: keys into `generate_data_methods` to plot; every
            registered key when None.

    Generators that do not return 3-column data are skipped with a message.
    """
    global d
    n_dims = 3
    keys = list(generate_data_methods) if data_dist_keys is None else list(data_dist_keys)

    # The generators read the module-level `d`, so a local variable would not
    # affect them. Set the global to 3 for the duration of this call and
    # restore it afterwards.
    d_was_defined = 'd' in globals()
    previous_d = globals().get('d')
    d = n_dims
    try:
        for data_dist_key in keys:
            X_train = generate_data_methods[data_dist_key](1000)
            if X_train is None or np.ndim(X_train) != 2 or np.shape(X_train)[1] != n_dims:
                print('Skipping {}: it does not produce {}-d data'.format(data_dist_key, n_dims))
                continue
            kmeans = OnlineBalancedKmeans(k, n_dims, alpha = 0.2, beta= 0.01, inference_method={'method': 'weights', 'alpha': 0.4, 'beta':7 }, data_distribution = data_dist_key)
            for data_point in X_train:
                kmeans.assign_and_update(data_point)
            labels = kmeans.final_clusters(X_train)

            sns.set_style ("darkgrid")
            plt.figure (figsize = (5, 4))
            axes_3d = plt.axes (projection='3d')
            axes_3d.scatter3D (X_train[:, 0], X_train[:, 1], X_train[:, 2], c= labels, alpha = 1)
            axes_3d.set_xlabel ('x')
            axes_3d.set_ylabel ('y')
            axes_3d.set_zlabel ('z')
            plt.title('{}'.format(data_dist_key))
            plt.show ()
    finally:
        if d_was_defined:
            d = previous_d
        else:
            del d


plot_obk_3d()

NameError: name 'data_dist_keys' is not defined

## Calculating Errors and Losses

#### Error Computing and the loss function

In [None]:
# Evaluation script.
def compute_error(true_value, predicted_value):
    """Return the sum of squared differences between truth and prediction."""
    residual = true_value - predicted_value
    return np.sum(np.square(residual))

def kmeans_loss(kmeans, training_data):
    """Sum, over all training points, of the squared distance to the nearest centre.

    Only the first ``kmeans.d - 1`` coordinates of the points and of
    ``kmeans.cluster_means`` enter the distance.

    Returns:
        The accumulated loss (0 for empty training data).
    """
    n_coords = kmeans.d-1
    centres = kmeans.cluster_means[:, :n_coords]
    loss = 0
    for point in training_data:
        squared_distances = np.sum((point[:n_coords] - centres)**2, axis = 1)
        # Only the closest centre contributes to the loss.
        loss += np.min(squared_distances, axis = 0)
    return loss

### Computing the errors

In [None]:
def compute_losses_errors(
    k = 20,
    d = 3,
    alpha = 0.2,
    beta = 0.01,
    n_train = 10000,
    n_test = 1000,
    n_repeats = 1
    ):
    """Print the average inference error per data generator and inference method.

    For every spec in `inference_methods` and every generator in
    `generate_data_methods`, a fresh model is trained on `n_train` points and
    evaluated on `n_test` points, `n_repeats` times.

    NOTE(review): most generators take their dimensionality from the
    module-level `d`, not from the `d` argument; the two must agree.

    Returns:
        The per-generator lists of squared errors for the LAST inference
        method only (an empty list when there are no inference methods).
    """
    errors = []
    for inference_method in inference_methods:
        errors = []
        for data_key, generate_data in generate_data_methods.items():
            distribution_errors = []
            for repeat in range(n_repeats):
                data = generate_data(n_train + n_test)
                training_data = data[:n_train]
                test_data = data[n_train:]
                assert len(training_data) == n_train
                assert len(test_data) == n_test
                # Copy the spec so the shared dict in `inference_methods`
                # cannot be altered by the constructor.
                kmeans = OnlineBalancedKmeans(k, d, alpha, beta, inference_method=inference_method.copy(), data_distribution=data_key)
                for datum in training_data:
                    kmeans.assign_and_update(datum)

                for datum in test_data:
                    # The last coordinate is the value to be inferred.
                    true_value = datum[-1]
                    predicted_value = kmeans.infer(datum)
                    error = compute_error(true_value, predicted_value)
                    distribution_errors.append(error)

            errors.append(distribution_errors)

            print('[{}, {}] \t Average error: {}'.format(generate_data.__name__, inference_method, np.mean(distribution_errors)))
        print('[{}] \t Average error: {}'.format(inference_method, np.mean(errors)))
    return errors

### Computing the errors (updated code)

In [None]:
# Experiment configuration used by the evaluation cell below.
k = 20 # number of pseudo-clusters
d = 2 # number of features (also read as a module-level global by the generate_data_* functions)
alpha_ = 0.2 # Learning rate associated with the means
beta_ = 0.01 # Balances the size of the clusters
n_train = 1000 # number of generated points used for training
n_test = 100 # number of generated points held out for inference
n_repeats = 10 # number of passes of the inference loop over the test data

In [None]:
# Evaluate every inference method on every registered data distribution.
# `data_dist_keys` was never defined in the notebook (NameError); use all keys.
data_dist_keys = list(data_keys)

inference_errors = {}
for inference_method in inference_methods:

    distribution_error_mean = {}
    loss_vals = {}
    for data_dist_key in data_dist_keys:
        data = generate_data_methods[data_dist_key](n_train + n_test)
        training_data = data[:n_train]
        test_data = data[n_train:]
        assert len(training_data) == n_train
        assert len(test_data) == n_test
        # Copy the spec so the shared dict in `inference_methods` cannot be
        # altered by the constructor.
        kmeans = OnlineBalancedKmeans(k, d, alpha = alpha_, beta = beta_, inference_method=inference_method.copy(), data_distribution=data_dist_key)

        for datum in training_data:
            kmeans.assign_and_update(datum)
        cost = kmeans_loss(kmeans, training_data)
        loss_vals[data_dist_key]= round(cost, 4)

        distribution_errors = []
        # NOTE(review): the model and the test data are fixed here, so every
        # repeat yields the same errors; re-generating the data per repeat is
        # probably what was intended. Confirm.
        for repeat in range(n_repeats):
            for datum in test_data:
                # The last coordinate is the value to be inferred.
                true_value = datum[-1]
                predicted_value = kmeans.infer(datum)
                error = compute_error(true_value, predicted_value)
                distribution_errors.append(round(error, 4))
        distribution_error_mean[data_dist_key] = np.mean(distribution_errors)

    # Inference errors keyed by the JSON form of the method spec; each value
    # lists the mean error per distribution, in `data_dist_keys` order.
    inference_method_ = json.dumps(inference_method)
    inference_errors[inference_method_] = list(distribution_error_mean.values())
    print('[{}] \t Average error: {}'.format(inference_method, round(np.mean(list(distribution_error_mean.values())), 4) ))
final_error_data = inference_errors

NameError: name 'data_dist_keys' is not defined

### Visualising the various generated data

In [None]:
    # Viewing the first 12 plots
def graph_gen_data_a(k=20, n_samples=100):
    """Cluster freshly generated samples of the first 12 distributions and plot them.

    For each of the first 12 entries of ``data_dist_keys`` a new
    ``OnlineBalancedKmeans`` model is fed ``n_samples`` generated points one at
    a time; the points (coloured by final cluster label), the cluster means
    (red ``+``) and the per-label point counts are drawn on one panel of a
    4x3 grid.

    Parameters
    ----------
    k : int, default 20
        Number of clusters handed to ``OnlineBalancedKmeans``.
    n_samples : int, default 100
        Number of points generated for each distribution.

    Returns
    -------
    None
        The figure is drawn as a side effect and written to
        ``'<data_dist_key>_final_iter.png'`` after every panel.
    """
    # The scatter plot and the (x, y) unpacking of the cluster means below
    # only work for two features.
    # NOTE(review): the data generators read the module-level `d`, not this
    # local one - confirm the global is also 2 when this function is called.
    d = 2

    # Panels are filled column by column: (0,0), (1,0), (2,0), (3,0), (0,1), ...
    rows = [0, 1, 2, 3] * 3
    cols = [0] * 4 + [1] * 4 + [2] * 4

    fig, ax = plt.subplots(4, 3, figsize=(15, 25))

    # zip() stops at the shortest input, so at most 12 distributions are drawn.
    for r, c, data_dist_key in zip(rows, cols, data_dist_keys):
        panel = ax[r, c]

        kmeans = OnlineBalancedKmeans(k, d, alpha=0.2, beta=0.01, inference_method={'method': 'weights', 'alpha': 0.0}, data_distribution=data_dist_key)

        # Stream the generated points through the model one by one.
        X_train = generate_data_methods[data_dist_key](n_samples)
        for data_point in X_train:
            kmeans.assign_and_update(data_point)
        labels = kmeans.final_clusters(X_train)

        sns.scatterplot(x=[point[0] for point in X_train],
                        y=[point[1] for point in X_train],
                        hue=labels,
                        palette="deep",
                        legend=None,
                        ax=panel
                        )

        class_centers = kmeans.cluster_means
        # Counts are per *observed* label, in sorted label order.
        # NOTE(review): if some cluster received no point, these counts no
        # longer line up one-to-one with `class_centers` - confirm.
        class_counts = np.unique(labels, return_counts=True)[1]
        center_x = [x for x, _ in class_centers]
        center_y = [y for _, y in class_centers]
        panel.plot(center_x, center_y, 'r+', markersize=10)

        # Annotate every cluster mean with its point count.
        for xi, yi, counts in zip(center_x, center_y, class_counts):
            panel.text(xi, yi, counts, va='bottom', ha='center')
        panel.set_title('Final iteration {}'.format(data_dist_key))

        # Saves the whole grid as filled so far, under the current key's name.
        plt.savefig('{}_final_iter.png'.format(data_dist_key))

    fig.tight_layout()

# graph_gen_data_a()

### Performance of the Hyperparameters

#### Hyperparameter k

In [None]:
# Cluster counts to sweep over: 10, followed by 100, 200, ..., 1000.
# The grid is the 11-point linspace over [0, 1000] with its leading 0 swapped for 10.
kss = [10] + [int(value) for value in np.linspace(0, 1000, 11).tolist()[1:]]

In [None]:
# Sweep over the number of pseudo-clusters k (type b).
# For every k and every data distribution a fresh model is trained online;
# the k-means loss and the summed test error of every inference method are
# recorded before training and then after every `n_ind` assignments.
d = 2 # number of features
n_train = 10000
n_test = 1000
n_repeats = 1
alpha = 0.2 # learning rate associated with the means
beta = 0.01 # balances the size of the clusters
n_ind = 1000 # checkpoint interval, in number of assignments

kdict_errors = {}  # k -> one list per distribution of per-checkpoint error vectors
kdict_loss = {}    # k -> one list per distribution of per-checkpoint losses
for k in kss:
    cost = {}
    error = {}
    for data_dist_key in data_dist_keys:
        data = generate_data_methods[data_dist_key](n_train + n_test)
        training_data = data[:n_train]
        test_data = data[n_train:]
        assert len(training_data) == n_train
        assert len(test_data) == n_test

        kmeans = OnlineBalancedKmeans( k, d, alpha = alpha, beta = beta, inference_method={'method': 'weights', 'alpha': 0.4, 'beta':7 } , data_distribution=data_dist_key)
        # Every inference method is evaluated on the same model state.
        inference_functions = [
            kmeans.infer_euclid_dist,
            kmeans.infer_norm_weights,
            kmeans.infer_cluster_size,
            kmeans.infer_weights,
            kmeans.infer_merge_norm_cl_size,
            kmeans.infer_merge_norm_ecl_dis,
            kmeans.infer_cluster_size_exp
        ]

        # Baseline loss and errors, before any training point was assigned.
        cost[data_dist_key] = [ kmeans_loss(kmeans, training_data) ]
        # Float accumulator with one slot per inference method.
        total_error = np.zeros(len(inference_functions))
        for test_datum in test_data:
            true_value = test_datum[-1]  # the target is the last column
            predicted_values = [infer(test_datum) for infer in inference_functions]
            total_error += np.array([compute_error(true_value, pred_value) for pred_value in predicted_values])
        error[data_dist_key] = [total_error]

        # Online training; n_seen counts the assignments made so far, so the
        # checkpoints land after exactly n_ind, 2*n_ind, ... assignments.
        for n_seen, datum in enumerate(training_data, start=1):
            kmeans.assign_and_update(datum)
            if n_seen % n_ind == 0:
                cost[data_dist_key].append( kmeans_loss(kmeans, training_data) )
                total_error = np.zeros(len(inference_functions))
                for test_datum in test_data:
                    true_value = test_datum[-1]
                    predicted_values = [infer(test_datum) for infer in inference_functions]
                    total_error += np.array([compute_error(true_value, pred_value) for pred_value in predicted_values])
                error[data_dist_key].append( total_error )
    kdict_errors[k] = list(error.values())
    kdict_loss[k] = list(cost.values())

NameError: name 'data_dist_keys' is not defined

#### Hyperparameter self.alpha

In [None]:
    # alpha vrs losses vrs errors
# Sweep over the learning rate alpha.
# For every alpha and every data distribution a fresh model is trained online;
# the k-means loss and the summed test error of every inference method are
# recorded before training and then after every `n_ind` assignments.
d = 2 # number of features
k = 50
n_train = 10000
n_test = 1000
n_repeats = 1
beta = 0.01 # balances the size of the clusters
alphas = np.linspace(0,1, 11 )
alpha_dict_loss = {}
alpha_dict_errors = {}
n_ind = 1000 # checkpoint interval, in number of assignments

for alpha in alphas:
    cost = {}
    error = {}
    for data_dist_key in data_dist_keys:
        data = generate_data_methods[data_dist_key](n_train + n_test)
        training_data = data[:n_train]
        test_data = data[n_train:]
        assert len(training_data) == n_train
        assert len(test_data) == n_test

        kmeans = OnlineBalancedKmeans( k, d, alpha = alpha, beta = beta, inference_method={'method': 'weights', 'alpha': 0.4, 'beta':7 } , data_distribution=data_dist_key)
        # Every inference method is evaluated on the same model state.
        inference_functions = [
            kmeans.infer_euclid_dist,
            kmeans.infer_norm_weights,
            kmeans.infer_cluster_size,
            kmeans.infer_weights,
            kmeans.infer_merge_norm_cl_size,
            kmeans.infer_merge_norm_ecl_dis,
            kmeans.infer_cluster_size_exp
        ]

        # Baseline loss and errors, before any training point was assigned.
        cost[data_dist_key] = [ kmeans_loss(kmeans, training_data) ]
        # Float accumulator with one slot per inference method.
        total_error = np.zeros(len(inference_functions))
        for test_datum in test_data:
            true_value = test_datum[-1]  # the target is the last column
            predicted_values = [infer(test_datum) for infer in inference_functions]
            total_error += np.array([compute_error(true_value, pred_value) for pred_value in predicted_values])
        error[data_dist_key] = [total_error]

        # Online training; n_seen counts the assignments made so far, so the
        # checkpoints land after exactly n_ind, 2*n_ind, ... assignments.
        for n_seen, datum in enumerate(training_data, start=1):
            kmeans.assign_and_update(datum)
            if n_seen % n_ind == 0:
                cost[data_dist_key].append( kmeans_loss(kmeans, training_data) )
                total_error = np.zeros(len(inference_functions))
                for test_datum in test_data:
                    true_value = test_datum[-1]
                    predicted_values = [infer(test_datum) for infer in inference_functions]
                    total_error += np.array([compute_error(true_value, pred_value) for pred_value in predicted_values])
                error[data_dist_key].append( total_error )
    # The results are wrapped in a one-element list on purpose: the cell that
    # averages `list_ae` reduces over this extra axis.
    alpha_dict_errors[alpha] = [list(error.values())]
    alpha_dict_loss[alpha] = [list(cost.values())]

NameError: name 'data_dist_keys' is not defined

#### Hyperparameter self.beta

In [None]:
# beta vrs losses vrs errors
# For every beta and every data distribution a fresh model is trained online;
# the k-means loss and the summed test error of every inference method are
# recorded before training and then after every `n_ind` assignments.
k =50
d = 2 # number of features
n_train = 10000
n_test = 1000
n_repeats = 1
alpha = 0.2 # learning rate associated with the means
n_ind = 1000 # checkpoint interval, in number of assignments

betas = np.linspace(-0.5,1.5, 15)
beta_dict_loss = {}
beta_dict_errors = {}

for beta in betas:
    cost = {}
    error = {}
    for data_dist_key in data_dist_keys:
        data = generate_data_methods[data_dist_key](n_train + n_test)
        training_data = data[:n_train]
        test_data = data[n_train:]
        assert len(training_data) == n_train
        assert len(test_data) == n_test

        kmeans = OnlineBalancedKmeans( k, d, alpha = alpha, beta = beta, inference_method={'method': 'weights', 'alpha': 0.4, 'beta':7 } , data_distribution=data_dist_key)
        # Every inference method is evaluated on the same model state.
        inference_functions = [
            kmeans.infer_euclid_dist,
            kmeans.infer_norm_weights,
            kmeans.infer_cluster_size,
            kmeans.infer_weights,
            kmeans.infer_merge_norm_cl_size,
            kmeans.infer_merge_norm_ecl_dis,
            kmeans.infer_cluster_size_exp
        ]

        # Baseline loss and errors, before any training point was assigned.
        cost[data_dist_key] = [ kmeans_loss(kmeans, training_data) ]
        # Float accumulator with one slot per inference method.
        total_error = np.zeros(len(inference_functions))
        for test_datum in test_data:
            true_value = test_datum[-1]  # the target is the last column
            predicted_values = [infer(test_datum) for infer in inference_functions]
            total_error += np.array([compute_error(true_value, pred_value) for pred_value in predicted_values])
        # Errors are stored as plain lists in this sweep.
        error[data_dist_key] = [total_error.tolist()]

        # Online training; n_seen counts the assignments made so far, so the
        # checkpoints land after exactly n_ind, 2*n_ind, ... assignments.
        for n_seen, datum in enumerate(training_data, start=1):
            kmeans.assign_and_update(datum)
            if n_seen % n_ind == 0:
                cost[data_dist_key].append( kmeans_loss(kmeans, training_data) )
                total_error = np.zeros(len(inference_functions))
                for test_datum in test_data:
                    true_value = test_datum[-1]
                    predicted_values = [infer(test_datum) for infer in inference_functions]
                    total_error += np.array([compute_error(true_value, pred_value) for pred_value in predicted_values])
                error[data_dist_key].append( total_error.tolist() )
    beta_dict_errors[beta] = list(error.values())
    beta_dict_loss[beta] = list(cost.values())

NameError: name 'data_dist_keys' is not defined

## Working on the data

In [None]:
# Turn each sweep dictionary into an array of its values (one entry per
# hyperparameter value, in insertion order) and keep all six together in `lists`.
list_ae, list_al, list_be, list_bl, list_ke, list_kl = lists = [
    np.array(list(results.values()))
    for results in (
        alpha_dict_errors, alpha_dict_loss,
        beta_dict_errors, beta_dict_loss,
        kdict_errors, kdict_loss,
    )
]

In [None]:
# alpha
# Average `list_ae` over its three leading axes in turn, then transpose.
infer_err_a = np.mean(np.mean(np.mean(list_ae, axis = 0), axis = 0), axis = 0).T
# Average over axis 1 twice, keep the entry at alpha index 6 and scale by 1/1000.
# NOTE(review): 1000 presumably is n_test (sum -> mean per test point) - confirm.
infer_err_a_b = np.mean(np.mean(list_ae, axis = 1), axis = 1)[6].T/1000

In [None]:
# beta
# Average `list_be` over its two leading axes in turn, then transpose.
infer_err_b = np.mean(np.mean(list_be, axis = 0), axis = 0).T
# Average over axis 1 and keep the entry at beta index 4.
infer_err_b_b = np.mean(list_be, axis = 1)[4].T

In [None]:
# ks
# These aggregates must be computed from the k-sweep errors (`list_ke`);
# reading the beta-sweep array here just duplicated the beta results.
infer_err_k = list_ke.mean(axis = 0).mean(axis = 0).T # Avg over the ks and the data
infer_err_k_b = list_ke.mean(axis = 1)[2].T # Avg over the data, for the k at index 2

In [None]:
## Organising the parameters
# Hyperparameter grids and the shared x-axis (0, 1000, ..., 10000).
alphas = np.linspace(0, 1, 11)
betas = np.linspace(-0.5, 1.5, 15)
# NOTE(review): `ks` is not used below - `list_hyps` takes `kss` instead.
ks = np.arange(10, 1001, 100)
x = np.arange(0, 10001, 1000)

# Error arrays per hyperparameter, in the order alpha, beta, k.
dla_er, dlb_er = (np.array([*source.values()]) for source in (dict_la_er, dict_lb_er))
# dlk_er = np.array(list(dict_lk_er.values()))
# NOTE(review): `dlk_er` is read on the next line although its assignment above
# is commented out, so it has to exist already - confirm where it comes from.
dict_hyp_ers = [dla_er, dlb_er, dlk_er]
list_hyps = [alphas, betas, kss]

# One-shot iterator pairing each grid with its error array.
zip_h_d = zip(list_hyps, dict_hyp_ers)

### Sketching the graphs of alphas and their corresponding performance

In [None]:
## Sketching alphas and their corresponding performance
# Left panel (ax1): loss curves; right panel (ax2): inference-error curves.
fig, (ax1,ax2) = plt.subplots(1, 2, figsize= (12, 5))

hyp_names = [r'$\alpha$',r'$\beta$','number of cluster']

# --- Right panel: inference errors, one curve per alpha that passes the filter.
hyps, dict_hyp_er, hyp_name = list_hyps[0], dict_hyp_ers[0], hyp_names[0]
labels = []
for hyp_index, hyp in enumerate(hyps):
    # Skip alphas whose final error is 10**8 or more.
    # NOTE(review): the filter indexes with `inf_index` (not defined in this
    # cell) while the plotted curve uses index 0 - confirm which is intended.
    if dict_hyp_er[inf_index,hyp_index,:][-1] < 10**8:
        y = dict_hyp_er[0,hyp_index,:].tolist()
        labels.append(round(hyp,2))
        ax2.plot(x, y)
if labels:
    ax2.legend(labels,loc = 4, framealpha = 0.2)
ax2.set_xlabel('Number of assignments')
ax2.set_ylabel('Inference errors')
ax2.set_title("Errors of the inference methods over {}'s".format(hyp_name))

# --- Left panel: losses, one curve per alpha whose loss at index 6 is >= 100.
hyps, dict_hyp_loss, hyp_name = list_hyps[0], dict_hyp_losses[0], hyp_names[0]
labels = []
for hyp_index, hyp in enumerate(hyps):
    if dict_hyp_loss[hyp_index,:][6] >= 100:
        labels.append(round(hyp,2))
        y = dict_hyp_loss[hyp_index,:]
        ax1.plot(x, y)
if labels:
    ax1.legend(labels,loc = 1, framealpha = 0.2)
ax1.set_xlabel('Number of assignments')
ax1.set_ylabel('Losses')
# This panel shows losses, so it is titled accordingly.
ax1.set_title("Losses of the inference methods over {}'s".format(hyp_name))

# NOTE(review): hyp_name is the LaTeX string '$\alpha$', so the file name
# contains '$' and a backslash - confirm this is acceptable on the target OS.
plt.savefig('{} loss_err.png'.format(hyp_name))

### Sketching the graphs of number of clusters and their corresponding performance

In [None]:
## Sketching number of clusters and their corresponding performance
# Left panel (ax1): loss curves; right panel (ax2): inference-error curves.
fig, (ax1,ax2) = plt.subplots(1, 2, figsize= (12, 5))

hyp_names = ['alpha','beta','number of cluster']

# --- Left panel: losses, read from `dlk_ls`.
hyps, dict_hyp_loss, hyp_name = list_hyps[2], dict_hyp_losses[2], hyp_names[2]
labels = []
for hyp_index, hyp in enumerate(hyps):
    # Only draw cluster counts whose first recorded loss is below 1000.
    if not (dlk_ls[hyp_index,0] < 1*10**3):
        continue
    labels.append(round(hyp,2))
    y = dlk_ls[hyp_index,:]
    ax1.plot(x, y)
if labels:
    ax1.legend(labels,loc = 1, framealpha = 0.2)
ax1.set_xlabel('Number of assignments')
ax1.set_ylabel('Losses')
ax1.set_title('Losses of the inference methods over {}s'.format(hyp_name))

# --- Right panel: inference errors, read from `dlk_er`.
hyps, dict_hyp_er, hyp_name = list_hyps[2], dict_hyp_ers[2], hyp_names[2]
labels = []
for hyp_index, hyp in enumerate(hyps):
    # Only draw cluster counts whose last recorded error is below 300.
    if not (dlk_er[hyp_index,-1] < 3*10**2):
        continue
    labels.append(round(hyp,2))
    y = dlk_er[hyp_index, :].tolist()
    ax2.plot(x, y)
if labels:
    ax2.legend(labels,loc = 2, framealpha = 0.2)
ax2.set_xlabel('Number of assignments')
ax2.set_ylabel('Inference errors')
ax2.set_title('Errors of the inference methods over {}s'.format(hyp_name))

plt.savefig('{} loss_err.png'.format(hyp_name))

### Sketching the graphs of Betas and their corresponding performance


In [None]:
## Sketching Beta's Errors and Losses
# Left panel (ax1): loss curves; right panel (ax2): inference-error curves.
fig, (ax1,ax2) = plt.subplots(1, 2, figsize= (12, 5))

hyp_names = [r'$\alpha$',r'$\beta$','number of cluster']

# --- Left panel: losses.
hyps, dict_hyp_loss, hyp_name = list_hyps[1], dict_hyp_losses[1], hyp_names[1]
labels = []
for hyp_index, hyp in enumerate(hyps):
    # Only draw betas whose loss at index 6 is below 600.
    if not (dict_hyp_loss[hyp_index,:][6] < 600):
        continue
    labels.append(round(hyp,2))
    y = dict_hyp_loss[hyp_index,:]
    ax1.plot(x, y)
if labels:
    ax1.legend(labels,loc = 1, framealpha = 0.2)
ax1.set_xlabel('Number of assignments')
ax1.set_ylabel('Losses')
ax1.set_title('Losses of the inference methods over {}s'.format(hyp_name))

# --- Right panel: inference errors.
hyps, dict_hyp_er, hyp_name = list_hyps[1], dict_hyp_ers[1], hyp_names[1]
labels = []
for hyp_index, hyp in enumerate(hyps):
    # Only draw betas whose final error is below 6*10**7.
    # NOTE(review): the filter indexes with `inf_index` (not defined in this
    # cell) while the plotted curve uses index 0 - confirm which is intended.
    if not (dict_hyp_er[inf_index,hyp_index,:][-1] < 6*10**7):
        continue
    labels.append(round(hyp,2))
    y = dict_hyp_er[0,hyp_index,:].tolist()
    ax2.plot(x, y)
if labels:
    ax2.legend(labels,loc = 2, framealpha = 0.1)
ax2.set_xlabel('Number of assignments')
ax2.set_ylabel('Inference errors')
ax2.set_title('Errors of the inference methods over {}s'.format(hyp_name))

plt.savefig('{} loss_err.png'.format(hyp_name))