In [17]:
class KMeans():
    def __init__(self, K, X=None, N=0):
        self.K = K
        if X == None:
            if N == 0:
                raise Exception("If no data is provided, a parameter N (number of points) is needed")
            else:
                self.N = N
                self.X = self._init_board_gauss(N, K)
        else:
            self.X = X
            self.N = len(X)
        self.mu = None
        self.clusters = None
        self.method = None
 
    def _init_board_gauss(self, N, k):
        n = float(N)/k
        X = []
        for i in range(k):
            c = (random.uniform(-1,1), random.uniform(-1,1))
            s = random.uniform(0.05,0.15)
            x = []
            while len(x) < n:
                a,b = np.array([np.random.normal(c[0],s),np.random.normal(c[1],s)])
                # Continue drawing points from the distribution in the range [-1,1]
                if abs(a) and abs(b)<1:
                    x.append([a,b])
            X.extend(x)
        X = np.array(X)[:N]
        return X
 
    def plot_board(self):
        X = self.X
        fig = plt.figure(figsize=(5,5))
        plt.xlim(-1,1)
        plt.ylim(-1,1)
        if self.mu and self.clusters:
            mu = self.mu
            clus = self.clusters
            K = self.K
            for m, clu in clus.items():
                cs = cm.spectral(1.*m/self.K)
                plt.plot(mu[m][0], mu[m][1], 'o', marker='*', \
                         markersize=12, color=cs)
                plt.plot(zip(*clus[m])[0], zip(*clus[m])[1], '.', \
                         markersize=8, color=cs, alpha=0.5)
        else:
            plt.plot(zip(*X)[0], zip(*X)[1], '.', alpha=0.5)
        if self.method == '++':
            tit = 'K-means++'
        else:
            tit = 'K-means with random initialization'
        pars = 'N=%s, K=%s' % (str(self.N), str(self.K))
        plt.title('\n'.join([pars, tit]), fontsize=16)
        plt.savefig('kpp_N%s_K%s.png' % (str(self.N), str(self.K)), \
                    bbox_inches='tight', dpi=200)
 
    def _cluster_points(self):
        mu = self.mu
        clusters  = {}
        for x in self.X:
            bestmukey = min([(i[0], np.linalg.norm(x-mu[i[0]])) \
                             for i in enumerate(mu)], key=lambda t:t[1])[0]
            try:
                clusters[bestmukey].append(x)
            except KeyError:
                clusters[bestmukey] = [x]
        self.clusters = clusters
 
    def _reevaluate_centers(self):
        clusters = self.clusters
        newmu = []
        keys = sorted(self.clusters.keys())
        for k in keys:
            newmu.append(np.mean(clusters[k], axis = 0))
        self.mu = newmu
 
    def _has_converged(self):
        K = len(self.oldmu)
        return(set([tuple(a) for a in self.mu]) == \
               set([tuple(a) for a in self.oldmu])\
               and len(set([tuple(a) for a in self.mu])) == K)
 
    def find_centers(self, method='random'):
        self.method = method
        X = self.X
        K = self.K
        self.oldmu = random.sample(X, K)
        if method != '++':
            # Initialize to K random centers
            self.mu = random.sample(X, K)
        while not self._has_converged():
            self.oldmu = self.mu
            # Assign all points in X to clusters
            self._cluster_points()
            # Reevaluate centers
            self._reevaluate_centers()

In [19]:
kmeans = KMeans(10, N=200)
kmeans.find_centers()
#kmeans.plot_board()

# Algoritm

In [92]:
# Initiate environment

%matplotlib inline

import pandas as pd
#import DataFrame as df

import random
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

%matplotlib inline

In [93]:
# Constants

# Number of clusters
number_of_clusteroids = 10

# Columns to compare
columns_to_count_distance=[
    "region_client",
    "region_sc",
    "product_name",
    "product",
    "tarif_name",
    "price_item_name",
    "price_set_name",
    "month_length",
    "is_first",
    "Cnt",
]

# Clusteroid ID field
clusteroid_id_field = "id_org"
cluster_field = "Cluster"
cluster_distance_to_clusteroid_field = "Cluster_distance"


In [94]:
# Read the Data

# Todo, slice years

dateparser = lambda x: pd.datetime.strptime(x, '%Y-%m-%d')

df_data = pd.read_csv("uc_kep_obs/pays.txt",
                 delimiter="\t", 
                 encoding="utf-8",
                 nrows=1000, 
                 parse_dates=[11,12], 
                 infer_datetime_format = True,
                 date_parser=dateparser
                )
# df
df_data = df_data[df_data['BDate'].dt.year == 2016]

In [95]:
# Add colunm for Cluster number

# it is not working
#cols_dtype={cluster_field : str,
#            cluster_distance_to_clusteroid_field : np.int}

#df_clusters_info = pd.DataFrame(dtype=cols_dtype)


df_data[cluster_field] = "00000000-0000-0000-0000-000000000000"
#D19CA103-3DFE-411C-B52A-32C12E6FDB4F"
df_data[cluster_distance_to_clusteroid_field] = np.nan

In [96]:
%matplotlib inline

In [97]:
# N random clusteroids

clusteroids = df_data.sample(number_of_clusteroids)[clusteroid_id_field]

clusteroids

994    346D703E-A03B-4BFC-ACB8-3A11026971BE
456    A0138F32-549B-4C82-904A-528A00493E68
963    ED62CB38-0B91-454E-A412-69C8349DFEE3
585    EDEE78E4-F9C6-4542-906F-EBD0CEAED3B5
539    135A38D7-CBC0-4E2E-954B-763C25417DB5
532    29C25271-355D-42FD-978C-E277430A0D5D
311    BEB451EE-4FF4-4B06-837E-C765771D1A43
909    1C465DD7-4E28-46C0-B59D-49F08AE56298
96     403FE9D6-C88C-4115-832F-7157E63353A8
910    4F2D604A-5F3C-465A-B277-1261EF2201CC
Name: id_org, dtype: object

In [98]:
df_data.describe()

Unnamed: 0,region_client,region_sc,month_length,cost,tip,BaseCost,is_first,Cnt,amount,Cluster_distance
count,317.0,317.0,238.0,317.0,317.0,317.0,317.0,317.0,317.0,0.0
mean,5.955836,18.359621,12.02521,2012.435552,4.252366,2181.926763,0.921136,1.006309,1903.044195,
std,14.168447,29.653004,1.57492,1587.080918,3.518401,1686.842274,0.269953,0.079304,1700.838811,
min,1.0,1.0,1.0,60.0,1.0,60.0,0.0,1.0,34.2857,
25%,1.0,1.0,12.0,1000.0,2.0,1000.0,1.0,1.0,871.4286,
50%,1.0,1.0,12.0,1086.5,3.0,1300.0,1.0,1.0,1000.0,
75%,1.0,23.0,12.0,3400.0,4.0,3600.0,1.0,1.0,3120.0,
max,99.0,99.0,15.0,10000.0,11.0,10000.0,1.0,2.0,14000.0,


In [99]:
log_value =[]

for point in df_data.index: # Iterete through data to clusterise
    for clusteroid in clusteroids.index: # iterate throught clusters
        log_value.append(clusteroid)
#log_value

In [100]:
# Assign to clusters

# Todo: Add converging, isChanged
log_value =[]

for point in df_data.index: # Iterete through data to clusterise
    min_distance = len(columns_to_count_distance)
    min_clusteroid = np.nan
    for clusteroid in clusteroids.index: # iterate throught clusters
        row1 = df_data.loc[point][columns_to_count_distance]
        row2 = df_data.loc[clusteroid][columns_to_count_distance]
        unique_columns = row1 == row2
        lenght = len(columns_to_count_distance)
        unique_columns_sum = unique_columns.sum()
        distance = lenght - unique_columns_sum
        if distance < min_distance:
            min_distance = distance
            min_clusteroid = df_data.loc[clusteroid][clusteroid_id_field]
            log_value.append("new min distance {} clusteroid {}".format(min_distance,min_clusteroid))
    df_data.set_value(point,cluster_field,min_clusteroid)
    df_data.set_value(point,cluster_distance_to_clusteroid_field,min_distance)

log_value

['new min distance 4 clusteroid 346D703E-A03B-4BFC-ACB8-3A11026971BE',
 'new min distance 0 clusteroid ED62CB38-0B91-454E-A412-69C8349DFEE3',
 'new min distance 3 clusteroid 346D703E-A03B-4BFC-ACB8-3A11026971BE',
 'new min distance 4 clusteroid 346D703E-A03B-4BFC-ACB8-3A11026971BE',
 'new min distance 2 clusteroid ED62CB38-0B91-454E-A412-69C8349DFEE3',
 'new min distance 0 clusteroid BEB451EE-4FF4-4B06-837E-C765771D1A43',
 'new min distance 4 clusteroid 346D703E-A03B-4BFC-ACB8-3A11026971BE',
 'new min distance 1 clusteroid A0138F32-549B-4C82-904A-528A00493E68',
 'new min distance 5 clusteroid 346D703E-A03B-4BFC-ACB8-3A11026971BE',
 'new min distance 2 clusteroid ED62CB38-0B91-454E-A412-69C8349DFEE3',
 'new min distance 3 clusteroid 346D703E-A03B-4BFC-ACB8-3A11026971BE',
 'new min distance 2 clusteroid A0138F32-549B-4C82-904A-528A00493E68',
 'new min distance 0 clusteroid 403FE9D6-C88C-4115-832F-7157E63353A8',
 'new min distance 3 clusteroid 346D703E-A03B-4BFC-ACB8-3A11026971BE',
 'new 

In [77]:
df_data

Unnamed: 0,id_org,region_client,id_number,vcuID,region_sc,product_name,product,tarif_name,price_item_name,price_set_name,...,BDate,PayDate,cost,tip,BaseCost,is_first,Cnt,amount,Cluster,Cluster_distance
1,EB298F52-2EF5-47CA-B214-BF3021FBDE84,1,740BAECE-9FE6-45FF-8378-1890847685F3,C8F7BB9B-EE44-46A8-83C4-B88DE169F8CD,1,Сертум,Услуги УЦ,Сертум.Классик,Право использования программ для ЭВМ для управ...,Прайс Billy,...,2016-10-18,2016-10-21,4000.00,4,4000.0000,1,1.0,4000.0000,82EC9C02-9EC6-445F-8BFC-C0C018610B4A,2.0
3,9EE3648C-9E40-4D18-BF9E-DB908E4257DD,1,70C6BC1B-4257-4746-A1EF-41E4C9C22DB0,CC1EC32C-CE02-46AD-A1C0-9BACA220C8C5,23,КЭП,КЭП,ГИС ЖКХ (КС2) на носителе,Право использования программ для ЭВМ для управ...,Прайс Billy,...,2016-11-08,2016-11-10,3120.00,4,3120.0000,1,1.0,3120.0000,7C930F72-9B3D-417E-902C-CF5ACAE407DC,3.0
8,24252267-0935-4BE5-B07D-E9E8FD65271B,1,A1C52902-53BD-477B-BCD3-322AC8F75694,C8F7BB9B-EE44-46A8-83C4-B88DE169F8CD,1,Сертум,Услуги УЦ,ЭП 2.0,"Абонентское обслуживание по тарифному плану ""Э...",Прайс Billy,...,2016-07-12,2016-07-13,1300.00,3,1300.0000,1,1.0,1300.0000,82EC9C02-9EC6-445F-8BFC-C0C018610B4A,1.0
10,85E1E31A-832A-4712-94D9-E7C919C217C7,1,22383A84-CAA2-44F6-B425-79A61F6FC0CF,420E16BF-3EBE-491A-997E-5B078E6CB3CC,23,КЭП для ЕГАИС,КЭП,КЭП.ЕГАИС на 12 мес.,"Абонентское обслуживание по тарифному плану ""К...",Прайс Billy,...,2016-06-02,2016-06-03,720.00,1,900.0000,1,1.0,720.0000,A0138F32-549B-4C82-904A-528A00493E68,1.0
11,658738CC-E20B-4E3B-BF6A-D699AB65C77D,1,F6B6DF8B-7CEA-4ADB-BF31-7405A2A80FD3,F8AD9021-B069-44F3-81A7-A05B93AED572,23,Сертум,Услуги УЦ,Сертум.Классик,Право использования программ для ЭВМ для управ...,Прайс Billy,...,2016-06-09,2016-06-10,4400.00,2,4400.0000,1,1.0,4400.0000,658738CC-E20B-4E3B-BF6A-D699AB65C77D,1.0
12,AE652D29-8A07-4091-AE3C-FEE40E83977D,1,ADF6469C-D3A0-4F7B-A13F-77FB2A353B02,C8F7BB9B-EE44-46A8-83C4-B88DE169F8CD,1,КЭП для ЕГАИС,КЭП,КЭП.ЕГАИС на 12 мес.,"Абонентское обслуживание по тарифному плану ""К...",Прайс Billy,...,2016-06-30,2016-07-01,900.00,1,900.0000,1,1.0,900.0000,A0138F32-549B-4C82-904A-528A00493E68,2.0
13,BF70332F-A547-426E-8025-4EAA7ED5B6F0,1,64D096CF-CD72-4689-B7FF-19DA41FD76E6,C8F7BB9B-EE44-46A8-83C4-B88DE169F8CD,1,КЭП для ЕГАИС,КЭП,КЭП.ЕГАИС на 12 мес.,"Абонентское обслуживание по тарифному плану ""К...",Прайс Billy,...,2016-05-23,2016-05-24,900.00,1,900.0000,1,1.0,900.0000,A0138F32-549B-4C82-904A-528A00493E68,2.0
15,C264CDAC-19F9-4E85-8BBA-257C4CDD6CB7,1,99366EC9-9D91-4011-9C36-E06499FA1639,C8F7BB9B-EE44-46A8-83C4-B88DE169F8CD,1,Сертум,Услуги УЦ,Сертум.Классик,"Абонентское обслуживание по тарифному плану ""С...",Прайс Billy,...,2016-07-13,2016-07-13,1000.00,3,1000.0000,1,1.0,1000.0000,82EC9C02-9EC6-445F-8BFC-C0C018610B4A,2.0
16,737F107E-E6AF-4BD4-B1B8-5D14BABFCCD9,1,4E0D6CD0-B03B-4DE0-9D1B-296D5232D994,C8F7BB9B-EE44-46A8-83C4-B88DE169F8CD,1,Сертум,Услуги УЦ,ЭП 2.0,"Абонентское обслуживание по тарифному плану ""Э...",Прайс Billy,...,2016-11-14,2016-12-05,1300.00,3,1300.0000,1,1.0,1040.0000,82EC9C02-9EC6-445F-8BFC-C0C018610B4A,1.0
18,84A2098A-E52A-4899-859F-AA003FFC63DE,1,9502C2B9-9020-4482-85A0-8FA8ECAA0D9E,98274BF4-5ACE-4C17-86A2-2B31AE03C07B,1,КЭП,КЭП,ГИС ЖКХ (КС2) на носителе,Право использования программ для ЭВМ для управ...,Прайс Billy,...,2016-11-26,2016-11-30,1944.00,2,2777.1429,1,1.0,1944.0000,7C930F72-9B3D-417E-902C-CF5ACAE407DC,2.0


In [78]:
clusteroids

22     7C930F72-9B3D-417E-902C-CF5ACAE407DC
556    82EC9C02-9EC6-445F-8BFC-C0C018610B4A
449    D599C610-7452-41E7-9850-74018755CB19
451    658738CC-E20B-4E3B-BF6A-D699AB65C77D
21     3EDBC49E-13F9-4417-854B-5487CB61E9DE
629    4BCA2E0F-D926-422A-AD70-BE080E95375D
456    A0138F32-549B-4C82-904A-528A00493E68
181    7C930F72-9B3D-417E-902C-CF5ACAE407DC
562    135A38D7-CBC0-4E2E-954B-763C25417DB5
474    2D0A6718-C626-4D52-9A2F-A20890C2A78A
Name: id_org, dtype: object

In [79]:
clusteroids.index #.loc[56]

Int64Index([22, 556, 449, 451, 21, 629, 456, 181, 562, 474], dtype='int64')

In [80]:
# Recalculate clusteriods

# Get old clusteroid, for each element in the cluster calculate sum of distances to all the other points in cluster. 
# Keep Min -> save as new clusteroid

log_value =[]

old_clusteroids = clusteroids
print "Old Clusters {}".format(old_clusteroids)
print "Old Clusters {}".format(old_clusteroids)

clusteroids = np.nan

for clusteroid in old_clusteroids: # Iterete through to clusteroids
    print ("Cluster {}".format(clusteroid))
    min_distance = len(columns_to_count_distance)
    min_clusteroid = np.nan
    for element_i in df_data[df_data[cluster_field] == clusteroid].index: # iterate throught cluster
        print ("--Element_i {}".format(element_i))
        sum_distance = len(columns_to_count_distance)
        sum_clusteroid = np.nan
        for element_j in df_data[df_data[cluster_field] == clusteroid].index: # iterate throught cluster
            print ("----Element_j {}".format(element_j))
            row1 = df_data.loc[element_i][columns_to_count_distance]
            row2 = df_data.loc[element_j][columns_to_count_distance]
            unique_columns = row1 == row2
            lenght = len(columns_to_count_distance)
            unique_columns_sum = unique_columns.sum()
            distance = lenght - unique_columns_sum
            sum_distance = sum_distance + distance
        if sum_distance < min_distance:
            min_distance = sum_distance
            min_clusteroid = element_i
            log_value.append("new clusteroid {} min summ distance {}".format(element_i,sum_distance))
            print ("----New clusteroid {} min summ distance {}".format(element_i,sum_distance))
    #clusteroids.append(min_clusteroid)
    if min_clusteroid != np.nan:
        #df_data = df_data[df_data['BDate'].dt.year == 2016]
        clusteroids.append(df_data[df_data[clusteroid_id_field] == min_clusteroid]["id_org"].head)
        print ("Clusteroid updated {} by {}".format(clusteroid,min_clusteroid))

log_value

AttributeError: 'float' object has no attribute 'append'

In [49]:
#df_data.sort(cluster_field, ascending=True)
#df.sort('Bytes', ascending=False)
df_data[cluster_field].unique()

array([u'C42CD9EB-5C5E-472B-9ECF-0EA5F4EBD438',
       u'9EE3648C-9E40-4D18-BF9E-DB908E4257DD',
       u'24252267-0935-4BE5-B07D-E9E8FD65271B',
       u'BE529213-28C8-498A-9C46-6CE834B136BB',
       u'912D156E-FAB5-4DEE-A39C-43437D5AC45D',
       u'D2B8D42C-26E4-44A4-A73A-23F0C217845D',
       u'2D0A6718-C626-4D52-9A2F-A20890C2A78A',
       u'53B6D2C2-C13B-4D18-8C97-BDBD2BBC9898',
       u'9EF648A2-363B-4D85-AF81-3EC1423EF8C6',
       u'135A38D7-CBC0-4E2E-954B-763C25417DB5'], dtype=object)

In [51]:
clusteroids

844    BE529213-28C8-498A-9C46-6CE834B136BB
576    53B6D2C2-C13B-4D18-8C97-BDBD2BBC9898
808    2D0A6718-C626-4D52-9A2F-A20890C2A78A
679    9EE3648C-9E40-4D18-BF9E-DB908E4257DD
260    C42CD9EB-5C5E-472B-9ECF-0EA5F4EBD438
423    912D156E-FAB5-4DEE-A39C-43437D5AC45D
856    D2B8D42C-26E4-44A4-A73A-23F0C217845D
946    9EF648A2-363B-4D85-AF81-3EC1423EF8C6
562    135A38D7-CBC0-4E2E-954B-763C25417DB5
76     24252267-0935-4BE5-B07D-E9E8FD65271B
Name: id_org, dtype: object

In [91]:
clusteroid = "DE7EBBBD-A930-4189-A9DD-E04BA04466FF"
# df_data[df_data[cluster_field] == clusteroid].index
min_clusteroid = "9EF648A2-363B-4D85-AF81-3EC1423EF8C6"
df_data[df_data[clusteroid_id_field] == min_clusteroid]["id_org"].loc[]

SyntaxError: invalid syntax (<ipython-input-91-ba1b2f6e143a>, line 4)

In [81]:
df_data[][columns_to_count_distance]

region_client                                                      1
region_sc                                                          1
product_name                                                     КЭП
product                                                          КЭП
tarif_name                                               КЭП.Классик
price_item_name    Абонентское обслуживание по тарифному плану "К...
price_set_name                                           Прайс Billy
month_length                                                      12
is_first                                                           1
Cnt                                                                1
Name: 994, dtype: object