In [41]:
import numpy as np
import pandas as pd
from math import cos, sin, acos
import matplotlib.pyplot as plt

In [53]:
# Read the day-1 CSV and split each comma-separated string in column '0'
# into two float columns. `path`, `data`, `location`, `x`, `y` and `new_data`
# all stay at notebook scope because later cells read them.
path = './data/everyday_bikeloc/day1.csv'
data = pd.read_csv(path)
location = data['0']

# First comma-separated field -> x, second field -> y.
x = location.map(lambda pair: float(pair.split(',')[0]))
y = location.map(lambda pair: float(pair.split(',')[1]))

# Column order is (y, x), matching what the original concat + rename produced.
new_data = pd.DataFrame({'y': y, 'x': x})
new_data

Unnamed: 0,y,x
0,31.389,121.348
1,31.279,121.508
2,31.254,121.383
3,31.320,121.484
4,31.292,121.407
...,...,...
79057,31.119,121.372
79058,31.201,121.422
79059,31.193,121.473
79060,31.307,121.391


In [51]:
# Load the data
def loaddATEsET(filename,index):
    """Read a CSV file and return its coordinates as an (n, 2) matrix.

    The file must contain a column named '0' whose entries are strings of
    the form "<first>,<second>". The returned matrix has one row per CSV
    row, with the second field in column 0 and the first field in column 1
    (the same (y, x) order the original code produced).

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    index :
        Unused. Kept only so existing callers that pass it keep working.

    Returns
    -------
    numpy.matrix
        Float matrix of shape (n, 2) with columns (y, x).
    """
    data = pd.read_csv(filename)
    # Parse the file that was just read. The previous version parsed a
    # notebook-level `location` variable instead, so `filename` was ignored
    # and the function failed on a fresh kernel.
    fields = data['0'].str.split(',', expand=True)
    # Field 1 first, then field 0, to keep the (y, x) column order.
    dataArr = fields[[1, 0]].astype(float).values
    # np.asmatrix is the non-removed spelling of np.mat (same result).
    return np.asmatrix(dataArr)

In [44]:
# Spherical distance between two points given in degrees of longitude/latitude
def distSLC(vacA, vacB):
    """Great-circle distance between two points via the spherical law of cosines.

    Both arguments are indexed as ``v[0, 0]`` and ``v[0, 1]``, i.e. they are
    expected to be 1 x 2 row matrices (or 2-D arrays). Element ``[0, 1]`` is
    used as the latitude and element ``[0, 0]`` as the longitude, both in
    degrees.

    NOTE(review): the loading code in this notebook labels its columns
    ('y', 'x'); confirm that the matrices passed in really are ordered
    (longitude, latitude), otherwise the two roles are swapped here.

    Returns
    -------
    float
        Central angle multiplied by 6371.0 (presumably the Earth's mean
        radius in kilometres).
    """
    latA = vacA[0, 1]*np.pi/180
    latB = vacB[0, 1]*np.pi/180
    deltaLon = np.pi*(vacB[0, 0]-vacA[0, 0])/180
    a = sin(latA) * sin(latB)
    b = cos(latA) * cos(latB) * cos(deltaLon)
    cosAngle = a + b
    # Rounding can push the cosine marginally outside [-1, 1] for identical
    # or antipodal points, which would make acos raise ValueError. Clamp with
    # explicit comparisons so that NaN inputs still propagate as NaN.
    if cosAngle > 1.0:
        cosAngle = 1.0
    elif cosAngle < -1.0:
        cosAngle = -1.0
    return acos(cosAngle)*6371.0

In [45]:
# Randomly initialise k centroids inside the bounding box of the data
def randCent(dataSet, k):
    """Draw ``k`` random centroids uniformly within the per-column data range.

    Parameters
    ----------
    dataSet : numpy.matrix or 2-D array
        Data points, one per row.
    k : int
        Number of centroids to generate.

    Returns
    -------
    numpy.matrix
        Matrix of shape (k, n) where n is the number of columns of
        ``dataSet``; column j lies between min and max of ``dataSet[:, j]``.
    """
    n = np.shape(dataSet)[1]
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        minJ = np.min(column)
        # np.max reduces in C and returns a scalar; the builtin max walked the
        # column row by row in Python and produced a 1x1 matrix.
        rangeJ = float(np.max(column) - minJ)
        centroids[:, j] = minJ + rangeJ*np.random.rand(k, 1)
    return centroids

In [46]:
# K-means clustering
def kMeans(dataSet, k, distMeas=distSLC, createCent=randCent):
    """Cluster the rows of ``dataSet`` into ``k`` groups with Lloyd's iteration.

    Parameters
    ----------
    dataSet : numpy.matrix
        Data points, one per row.
    k : int
        Number of clusters.
    distMeas : callable
        ``distMeas(centroidRow, pointRow)`` returning a scalar distance.
    createCent : callable
        ``createCent(dataSet, k)`` returning the initial (k, n) centroids.

    Returns
    -------
    centroids : numpy.matrix
        Final centroid of every cluster, shape (k, n).
    clusterAssment : numpy.matrix
        Shape (m, 2); column 0 is the index of the assigned cluster and
        column 1 the squared distance to that cluster's centroid.
    """
    m = np.shape(dataSet)[0]
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible cluster id. With the former initial value of 0,
    # a first pass that put every point into cluster 0 was mistaken for
    # "nothing changed" and the loop stopped before converging.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assignment step: attach every point to its nearest centroid.
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist**2
        # Update step: move each centroid to the mean of its members.
        for cent in range(k):
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
            # An empty cluster keeps its previous centroid.
            if len(ptsInClust) != 0:
                centroids[cent, :] = np.mean(ptsInClust, axis=0)
    return centroids, clusterAssment

In [47]:
# SSE (sum of squared errors) of a k-means assignment
def calcSSE(clusterAssment):
    """Add up the last column of ``clusterAssment``.

    Parameters
    ----------
    clusterAssment : numpy.matrix or 2-D array
        Assignment table whose last column holds the per-point squared
        distance to its centroid.

    Returns
    -------
    The column total. For a ``numpy.matrix`` input this is a 1x1 matrix
    (as it was with the builtin ``sum``); for a plain 2-D array it is a
    scalar.
    """
    # np.sum reduces in C; the builtin sum added the rows one at a time in
    # Python, which is needlessly slow for large assignment tables.
    return np.sum(clusterAssment[:, -1], axis=0)

In [48]:
def plotKMeans(dataSet, clusterAssment, centroids, k):
    """Scatter-plot every cluster with its own marker, plus the centroids.

    Column 0 of ``dataSet`` / ``centroids`` is drawn on the horizontal axis
    and column 1 on the vertical axis. Cluster membership is read from
    column 0 of ``clusterAssment``. The figure is shown with ``plt.show()``;
    nothing is returned.
    """
    markerCycle = ['s', 'o', '^', '8', 'p', 'd', 'v', 'h', '>', '<']
    frame = [0.1, 0.1, 0.8, 0.8]
    figure = plt.figure()

    # Tick-less background axes; a background image could be drawn on it.
    figure.add_axes(frame, label='ax0', xticks=[], yticks=[])
    # Transparent foreground axes that carries the actual scatter plots.
    overlay = figure.add_axes(frame, label='ax1', frameon=False)

    labels = clusterAssment[:, 0].A
    for clusterId in range(k):
        memberRows = np.nonzero(labels == clusterId)[0]
        members = dataSet[memberRows, :]
        # Markers repeat once there are more clusters than marker styles.
        overlay.scatter(
            members[:, 0].flatten().A[0],
            members[:, 1].flatten().A[0],
            marker=markerCycle[clusterId % len(markerCycle)],
            s=90,
        )

    overlay.scatter(
        centroids[:, 0].flatten().A[0],
        centroids[:, 1].flatten().A[0],
        marker='+',
        s=300,
    )
    plt.show()

In [54]:
# Load the points, report the SSE for k = 2 .. 41, then cluster once more
# with k = 6 and plot that result. Each kMeans call starts from fresh random
# centroids, so the printed values vary between runs.
dataSet = loaddATEsET(path, 0)

for i in range(40):
    k = i + 2
    centroids, clusterAssment = kMeans(dataSet, k)
    sseValue = calcSSE(clusterAssment)
    print("当k值为：%d时，对应的SSE值为：%f" % (k, sseValue))

centroids, clusterAssment = kMeans(dataSet, 6)
plotKMeans(dataSet, clusterAssment, centroids, 6)

当k值为：2时，对应的SSE值为：2106897.108432
当k值为：3时，对应的SSE值为：1469045.424411
当k值为：4时，对应的SSE值为：1161234.508861


KeyboardInterrupt: 