In [1]:
# libraries
import uuid
import copy
import math
import numpy as np
from scipy.stats import norm

# base of the per-order snapshot capacity: snapshot() keeps at most
# alpha**l + 1 timestamps for each snapshot order
alpha = 2 

# exponent paired with alpha in the capacity alpha**l + 1;
# a larger value keeps more snapshots per order
l = 2

# number of dimensions of every data point
d = 2 

# maximum number of clusters: init_with_kmeans() stops creating clusters
# once len(clusters) reaches q, and query() rejects k > q
q = 10 

# current stream time; starts at 1 and is advanced by one per processed point
time = 1 

# the stream loops run only while time < maxtime
maxtime = 100  

# live clusters, clusters[id] = cluster object
clusters = {}

# user defined number used by least_timestamp(): clusters with fewer than 2*m
# points use their mean timestamp, larger ones use the m/(2n) quantile of an
# assumed normal distribution of their timestamps
m = 2

# value returned as the standard deviation of a cluster holding exactly one
# data point (find_max_boundry() then multiplies it by T)
fixed_boundary = 5

# user defined threshold: in clustream() the cluster returned by
# least_timestamp() is deleted when its stamp is below delta,
# otherwise the two nearest clusters are merged
delta = 2

# user defined number, multiplied with the standard deviation of a cluster
# to get the boundary of the cluster
T = 5

# snapshot timestamps, ss[i] is the list of stored times of order i
ss = []

# saved clusters, ss_clusters[t] = deep copy of the clusters dict taken at time t
ss_clusters = {}

# randomly create one data point with d coordinates
def online_data():
    """Return one random data point as a float32 array of length d.

    A single row of d integers is drawn with ``np.random.randint(10, 70)``
    (so every coordinate lies in 10..69), converted to float32, and that
    row is returned.
    """
    sample = np.random.randint(10, 70, (1, d))
    return np.float32(sample)[0]

# cluster class
class cluster:
    """Summary of one micro-cluster, stored as additive feature sums.

    For each of the d dimensions the cluster keeps the sum (CF1x) and the
    sum of squares (CF2x) of the absorbed points, plus the sum (CF1t) and
    sum of squares (CF2t) of their timestamps and the point count n,
    i.e. 2d+3 values. Because everything is a plain sum, clusters can be
    added to and subtracted from each other.

    Reads the module-level names ``d``, ``clusters`` and ``fixed_boundary``.
    """

    # has unique id and 2d+3 elements
    def __init__(self):
        """Create an empty cluster with a fresh unique id."""
        self.id = uuid.uuid1()
        self.CF2x = [0] * d  # per-dimension sum of squared coordinates
        self.CF1x = [0] * d  # per-dimension sum of coordinates
        self.CF2t = 0        # sum of squared timestamps
        self.CF1t = 0        # sum of timestamps
        self.n = 0           # number of absorbed points

    # to add a data point in cluster, uses additivity property
    def add_point(self, val, time):
        """Absorb the point ``val`` (d coordinates) that arrived at ``time``."""
        self.n += 1
        for i in range(d):
            self.CF2x[i] += (val[i]) ** 2
            self.CF1x[i] += (val[i])
        self.CF2t += time ** 2
        self.CF1t += time

    # delete a cluster, basically remove it from the maintained clusters
    def delete(self):
        """Remove this cluster from the module-level ``clusters`` dict.

        Raises KeyError when the cluster is not registered there.
        """
        del clusters[self.id]

    # merge to a cluster, all its data is added to another one, additivity
    def mergeto(self, c):
        """Add all of this cluster's sums into cluster ``c`` (``c`` is modified)."""
        for i in range(d):
            c.CF2x[i] += self.CF2x[i]
            c.CF1x[i] += self.CF1x[i]
        c.CF1t += self.CF1t
        c.CF2t += self.CF2t
        c.n += self.n

    # remove from a cluster, the inverse of mergeto
    def removeto(self, c):
        """Subtract all of this cluster's sums from cluster ``c`` (``c`` is modified)."""
        for i in range(d):
            c.CF2x[i] -= self.CF2x[i]
            c.CF1x[i] -= self.CF1x[i]
        c.CF1t -= self.CF1t
        c.CF2t -= self.CF2t
        c.n -= self.n

    # to find the centroid of the cluster with n data points
    def centroid(self):
        """Return the per-dimension mean as a list, or 0 when the cluster is empty."""
        if self.n == 0:
            return 0
        return [self.CF1x[i] / self.n for i in range(d)]

    # to find the timestamps centroid with n timestamps
    def centroid_time(self):
        """Return the mean timestamp, or 0 when the cluster is empty."""
        if self.n == 0:
            return 0
        return self.CF1t / self.n

    # to find mean, just call centroid
    def mean(self):
        """Alias of :meth:`centroid`."""
        return self.centroid()

    # to find mean_time, just call centroid time
    def mean_time(self):
        """Alias of :meth:`centroid_time`."""
        return self.centroid_time()

    # standard deviation of the data points of the cluster
    def standard_deviation(self):
        """Return the RMS deviation of the points from the centroid.

        Returns 0 for an empty cluster and ``fixed_boundary`` for a cluster
        with a single point (whose deviation would otherwise be 0).
        """
        if self.n == 0:
            return 0
        if self.n == 1:
            return fixed_boundary
        centroid = self.centroid()
        total = 0
        for i in range(d):
            total += (self.CF2x[i] - 2.0 * centroid[i] * self.CF1x[i]
                      + self.n * (centroid[i] ** 2))
        # rounding can push a (near) zero variance slightly below zero,
        # which math.sqrt would reject
        return math.sqrt(max(total / self.n, 0.0))

    # standard deviation of the timestamps of the cluster
    def standard_deviation_time(self):
        """Return the standard deviation of the timestamps (0 when empty)."""
        if self.n == 0:
            return 0
        centroid = self.centroid_time()
        total = self.CF2t - 2.0 * centroid * self.CF1t + self.n * (centroid ** 2)
        # same guard against a tiny negative variance caused by rounding
        return math.sqrt(max(total / self.n, 0.0))
    
# squared difference of two values
def dist(x, y):
    """Return the square of the difference between ``x`` and ``y``."""
    return pow(x - y, 2)

# root-mean-square difference of 2 vectors of d dimensions
def dist_v(x, y):
    """Return the root of the mean squared per-dimension difference.

    The squared differences of the first d components are summed, divided
    by d, and the square root of that mean is returned (i.e. the Euclidean
    distance scaled by 1/sqrt(d)).
    """
    squared_total = 0
    for axis in range(d):
        squared_total += dist(x[axis], y[axis])
    return math.sqrt(squared_total / d)

# finds the nearest cluster object, in case of multiple, outputs last
def find_nearest_cluster(x):
    """Return the cluster whose centroid is closest to point ``x``.

    Distances come from ``dist_v``. When several clusters are equally
    close, the one visited last in ``clusters`` wins. An empty string is
    returned when there are no clusters.
    """
    nearest = ""
    best = math.inf

    for candidate in clusters.values():
        gap = dist_v(candidate.centroid(), x)
        # "<=" so that a later cluster at the same distance replaces an earlier one
        if gap <= best:
            best = gap
            nearest = candidate

    return nearest

# finds the cluster object with the least relevance timestamp
def least_timestamp():
    """Return ``[cluster, stamp]`` for the cluster with the smallest stamp.

    A cluster with fewer than ``2*m`` points uses its mean timestamp as
    its stamp. A larger cluster uses
    ``norm.ppf(m / (2n)) * standard_deviation_time() + mean_time()``.
    Ties go to the cluster visited last. With no clusters the result is
    ``["", math.inf]``.
    """
    oldest = ""
    lowest = math.inf

    for candidate in clusters.values():
        if candidate.n < 2 * m:
            stamp = candidate.mean_time()
        else:
            quantile = norm.ppf(m / (2.0 * candidate.n))
            stamp = quantile * candidate.standard_deviation_time() + candidate.mean_time()

        if stamp < lowest:
            lowest = stamp
            oldest = candidate
        elif stamp == lowest:
            # equal stamp: the later cluster wins, the recorded stamp is kept
            oldest = candidate

    return [oldest, lowest]

# finds the max boundary of a cluster
def find_max_boundry(id):
    """Return ``T`` times the standard deviation of the cluster ``clusters[id]``."""
    spread = clusters[id].standard_deviation()
    return T * spread

# merges the two clusters whose centroids are nearest
def merge_nearest():
    """Merge the closest pair of clusters in ``clusters``.

    All ordered pairs of distinct clusters are compared with ``dist_v`` on
    their centroids; on equal distances the pair visited last wins. The
    cluster with fewer points is merged into the other one and removed
    (when the counts are equal, the first cluster of the chosen pair is
    the one merged away). Does nothing when there are fewer than two
    clusters.
    """
    # nothing to merge; without this guard the code below would fail on
    # the placeholder values
    if len(clusters) < 2:
        return

    # centroids do not change during the search, so compute each one once
    centers = {key: member.centroid() for key, member in clusters.items()}

    first = second = None
    best = math.inf
    for key_a in clusters:
        for key_b in clusters:
            if key_a == key_b:
                continue
            gap = dist_v(centers[key_a], centers[key_b])
            if gap <= best:
                best = gap
                first = clusters[key_a]
                second = clusters[key_b]

    # no comparable pair was found (e.g. every distance was NaN)
    if first is None:
        return

    # merges into the cluster with more points
    if first.n > second.n:
        second.mergeto(first)
        second.delete()
    else:
        first.mergeto(second)
        first.delete()
        
# saves all the clusters at one instance of time, pyramidal time frame
def snapshot():
    """Store a deep copy of ``clusters`` for the current ``time``.

    The snapshot is filed under the largest order ``i`` for which ``time``
    is divisible by ``2**i``: the time is appended to ``ss[i]`` and the
    copy is stored in ``ss_clusters[time]``. Each order keeps at most
    ``alpha**l + 1`` timestamps; when that is exceeded the oldest one of
    that order is dropped together with its stored clusters.

    Raises ValueError (from ``math.log2``) when ``time`` is not positive.
    """
    # log2 is the accurate way to get the base-2 logarithm
    order = math.floor(math.log2(time))
    for i in range(order, -1, -1):
        if time % pow(2, i) == 0:
            # make sure every order up to i exists, even when the stream
            # does not start at time 1 and lower orders were never created
            while len(ss) <= i:
                ss.append([])
            ss[i].append(time)
            ss_clusters[time] = copy.deepcopy(clusters)
            if len(ss[i]) > pow(alpha, l) + 1:
                expired = ss[i].pop(0)
                del ss_clusters[expired]
            break

# finds the stored snapshot time which is just less than or equal to the given time
def just_smallest(temp_time):
    """Return the largest key of ``ss_clusters`` that is <= ``temp_time``.

    The search starts from 0, so 0 is returned when no stored time
    qualifies.
    """
    candidates = [stamp for stamp in ss_clusters if stamp <= temp_time]
    return max([0, *candidates])

# finds the stored snapshot time which is just greater than or equal to the given time
def just_biggest(temp_time):
    """Return the smallest key of ``ss_clusters`` that is >= ``temp_time``.

    The search starts from infinity, so ``math.inf`` is returned when no
    stored time qualifies.
    """
    candidates = [stamp for stamp in ss_clusters if stamp >= temp_time]
    return min([math.inf, *candidates])

# finds a single centroid from given centroids and their frequencies in d dimensions
def weighted_centroid(weighted_arr):
    """Combine ``[centroid, count]`` pairs into one weighted centroid.

    Returns ``[mean, total]`` where ``total`` is the sum of the counts and
    ``mean`` holds, per dimension, the count-weighted average of the
    centroids. Raises ZeroDivisionError when the counts sum to zero
    (including an empty ``weighted_arr``) and d is at least 1.
    """
    total_weight = 0
    for entry in weighted_arr:
        total_weight += entry[1]

    center = []
    for axis in range(d):
        weighted_sum = 0
        for entry in weighted_arr:
            weighted_sum += entry[0][axis] * entry[1]
        center.append(weighted_sum / total_weight)

    return [center, total_weight]

# shows the result of the program
def show_result(weighted_list,cluster_list):
    """Print the clusters found in the range and the final k clusters.

    Both arguments are lists of ``[centroid, count]`` pairs. The centroid
    coordinates are rounded to two decimals in place (the caller's lists
    are modified) before everything is printed.
    """
    for listing in (weighted_list, cluster_list):
        for entry in listing:
            center = entry[0]
            for pos, value in enumerate(center):
                center[pos] = round(value, 2)

    print("Clusters in the range:")
    for number, entry in enumerate(weighted_list, start=1):
        print("Cluster "+str(number),"  Cluster Center(d dimension):  ", str(entry[0]),"  Data points: ", str(entry[1]))
    print("\nFinal K clusters in range:")
    for number, entry in enumerate(cluster_list, start=1):
        print("Final Cluster "+str(number),"  Cluster Center(d dimension):  ", str(entry[0]),"  Data points: ", str(entry[1]))
        
# finds k clusters among the clusters in the range query
def weighted_kmeans(k,weighted_list):
    """Group ``[centroid, count]`` pairs into k clusters and print them.

    The first k entries of ``weighted_list`` are copied as the starting
    centres. Each round assigns every entry to its nearest centre by
    ``dist_v`` (on equal distances the higher-indexed centre wins) and then
    replaces each centre by the weighted centroid of its members. At most
    100 rounds are run; the loop stops early once the assignment no longer
    changes, because further rounds would reproduce the same centres. The
    result is passed to ``show_result``.

    Raises IndexError when ``weighted_list`` has fewer than k entries.
    """
    max_rounds = 100
    centers = [copy.deepcopy(weighted_list[i]) for i in range(k)]
    previous_assignment = None

    for _ in range(max_rounds):
        # index of the nearest centre for every entry
        assignment = []
        for entry in weighted_list:
            best = math.inf
            chosen = 0
            for j in range(k):
                gap = dist_v(entry[0], centers[j][0])
                if gap <= best:
                    best = gap
                    chosen = j
            assignment.append(chosen)

        # same grouping as last round -> same centres, nothing left to do
        if assignment == previous_assignment:
            break
        previous_assignment = assignment

        members = [[] for _ in range(k)]
        for entry, chosen in zip(weighted_list, assignment):
            members[chosen].append(entry)

        for j in range(k):
            if members[j]:
                centers[j] = weighted_centroid(members[j])
            else:
                # a centre that attracted nothing keeps its position with
                # weight 0; averaging an empty group would divide by zero
                centers[j] = [centers[j][0], 0]

    show_result(weighted_list, centers)

# query, k = desired clusters, t1 = timeframe 1, t2 = timeframe 2
def query(k,t1,t2):
    """Print k clusters describing the data that arrived between t1 and t2.

    ``t1`` is moved down to the nearest stored snapshot time and ``t2`` up
    to the nearest stored snapshot time. For every cluster id present in
    both snapshots, the earlier summary is subtracted from the later one;
    the remaining non-empty summaries are handed to ``weighted_kmeans``.
    A message is printed and nothing else happens when the arguments are
    invalid, no suitable snapshot exists, or fewer than k non-empty
    clusters remain.

    The stored snapshots are never modified: the subtraction is done on a
    deep copy of the later snapshot.
    """
    if(t2 < t1):
        print("t2 must br greater than t1!")
        return
    if(k > q):
        print("clusters must be less than q")
        return

    t1 = just_smallest(t1)

    if(t1==0):
        print("t1 too low!")
        return

    t2 = just_biggest(t2)

    if(t2==math.inf):
        print("t2 too high!")
        return

    # use the subtractive property of clusters on a private copy, so that an
    # early return, an error, or t1 and t2 resolving to the same snapshot
    # can never leave the stored snapshot altered
    earlier = ss_clusters[t1]
    window = copy.deepcopy(ss_clusters[t2])
    for cid, summary in window.items():
        if cid in earlier:
            earlier[cid].removeto(summary)

    # clusters that received no points inside the range are skipped
    weighted_list = [
        [summary.centroid(), summary.n]
        for summary in window.values()
        if summary.n != 0
    ]

    if(len(weighted_list) < k):
        print("Less than K clusters! Select more wider range")
        return

    weighted_kmeans(k,weighted_list)
        
# simply, making a new cluster for each data point till q
def init_with_kmeans():  
    """Seed ``clusters`` with one single-point cluster per incoming point.

    Points are drawn with ``online_data`` until either q clusters exist or
    ``time`` reaches ``maxtime``. After every point a snapshot is taken and
    the global ``time`` is advanced by one.
    """
    global time

    while True:
        if time >= maxtime or len(clusters) >= q:
            break

        point = online_data()
        seed = cluster()
        seed.add_point(point, time)
        clusters[seed.id] = seed

        snapshot()
        time += 1

# main online algorithm 
def clustream():
    """Process the stream point by point until ``time`` reaches ``maxtime``.

    Each point from ``online_data`` is added to its nearest cluster when it
    lies within that cluster's boundary (``find_max_boundry``). Otherwise a
    new cluster is created for it and, to make room, either the cluster
    reported by ``least_timestamp`` is deleted (when its stamp is below
    ``delta``) or the two nearest clusters are merged. A snapshot is taken
    and the global ``time`` is advanced after every point.
    """
    global time

    while(time < maxtime):

        point = online_data()
        nearest = find_nearest_cluster(point)
        boundary = find_max_boundry(nearest.id)
        gap = dist_v(point, nearest.centroid())

        if(gap <= boundary):
            # the point belongs to the nearest cluster
            nearest.add_point(point, time)
        else:
            # the point starts a cluster of its own
            fresh = cluster()
            clusters[fresh.id] = fresh
            fresh.add_point(point, time)

            # make room: drop the least recent cluster if it is old enough,
            # otherwise merge the two closest clusters
            stale, stamp = least_timestamp()
            if(stamp < delta):
                stale.delete()
            else:
                merge_nearest()

        # save the clusters
        snapshot()
        time += 1
        
        
# create the initial clusters: one single-point cluster per arriving point
# until q clusters exist (or maxtime is reached)
init_with_kmeans()

# process the remaining stream until time reaches maxtime
clustream()
    
# k,t1,t2: ask for 3 clusters describing the data between time 50 and time 99
query(3,50,99)


Clusters in the range:
Cluster 1   Cluster Center(d dimension):   [67.0, 40.0]   Data points:  2
Cluster 2   Cluster Center(d dimension):   [46.67, 53.67]   Data points:  3
Cluster 3   Cluster Center(d dimension):   [43.0, 35.5]   Data points:  4
Cluster 4   Cluster Center(d dimension):   [15.0, 57.25]   Data points:  4
Cluster 5   Cluster Center(d dimension):   [56.9, 19.1]   Data points:  10
Cluster 6   Cluster Center(d dimension):   [31.67, 59.44]   Data points:  9
Cluster 7   Cluster Center(d dimension):   [24.75, 16.5]   Data points:  4
Cluster 8   Cluster Center(d dimension):   [20.86, 36.71]   Data points:  7
Cluster 9   Cluster Center(d dimension):   [63.8, 58.3]   Data points:  10
Cluster 10   Cluster Center(d dimension):   [49.0, 69.0]   Data points:  1

Final K clusters in range:
Final Cluster 1   Cluster Center(d dimension):   [64.33, 55.25]   Data points:  12
Final Cluster 2   Cluster Center(d dimension):   [31.41, 58.47]   Data points:  17
Final Cluster 3   Cluster Center