# 聚类算法

根据数据特征把相似的数据聚集到一起。

## K均值（k-means）
sklearn 的 `KMeans` 默认使用 k-means++ 方式选取初始聚类中心（`init='k-means++'`），这是对标准 k-means 随机初始化的一种改进。

In [3]:
import pandas as pd

# Load the raw transaction records from the CSV file in the working directory.
df = pd.read_csv("trans9697.csv")

# Bare last expression so the notebook renders the frame as a table.
df

Unnamed: 0,交易编号,账户编号,交易日期,交易类型,金额,账户余额
0,732811,2504,1996-01-01 19:42:29,信用卡取款,900.0,38124.4
1,800209,2729,1996-01-01 05:21:13,取款,1920.0,34202.7
2,803553,2738,1996-01-01 13:13:33,取款,6500.0,25685.2
3,1042686,3566,1996-01-01 15:15:43,取款,1000.0,25919.7
4,1042689,3566,1996-01-01 02:27:26,取款,500.0,25419.7
...,...,...,...,...,...,...
481183,520031,1775,1997-12-31 09:49:15,取款,14.6,23531.3
481184,517312,1767,1997-12-31 12:02:46,取款,14.6,49026.8
481185,518251,1770,1997-12-31 16:24:24,取款,14.6,70109.9
481186,519067,1772,1997-12-31 00:11:30,取款,30.0,104802.8


In [13]:
# Build one feature row per account (grouped by 账户编号):
#   交易类型 -> number of transactions (row count of the group)
#   金额     -> total transaction amount
#   账户余额 -> highest balance seen for the account
#
# The aggregations are given as pandas method names instead of the Python
# builtins len/sum/max. 'size' counts rows exactly like len() did (missing
# values included) but runs pandas' own grouped implementation rather than a
# per-group Python call, and the string forms of sum/max avoid the
# FutureWarning that newer pandas versions emit when builtin callables are
# passed to agg.
# NOTE: the column keeps the name 交易类型 even though it now holds a count,
# because later cells read it under that name.
df2 = df.groupby(['账户编号']).agg(
    {'交易类型': 'size', '金额': 'sum', '账户余额': 'max'})
df2


Unnamed: 0_level_0,交易类型,金额,账户余额
账户编号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,138,202059.5,20525.5
2,169,1125788.3,69302.0
3,25,89529.9,30073.7
4,117,229254.1,30623.5
5,18,44340.7,29220.1
...,...,...,...
11333,177,1915318.6,130693.8
11349,181,2214876.1,113678.7
11359,192,1413812.4,68376.4
11362,220,858293.2,56055.6


In [14]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

# Columns used as clustering features. Selecting them explicitly keeps this
# cell re-runnable: a later cell adds a '分组结果' label column to df2, and
# without the selection a re-run would feed those labels back in as a feature.
feature_cols = ['交易类型', '金额', '账户余额']

# Rescale each feature to MinMaxScaler's default [0, 1] range so that the
# features contribute on a comparable scale to the distances k-means uses.
ms = MinMaxScaler()
df3 = ms.fit_transform(df2[feature_cols])

# random_state pins the centroid initialisation so cluster labels and centers
# are reproducible across runs; n_init is spelled out because its default
# differs between scikit-learn versions.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
model.fit(df3)

KMeans(n_clusters=4)

In [11]:
# Coordinates of the four cluster centers, one row per cluster.
# The model was fitted on the MinMax-scaled matrix df3, so these values are in
# scaled feature space rather than in the original units.
model.cluster_centers_

array([[0.48302553, 0.08844105, 0.18538361],
       [0.52454577, 0.25218124, 0.39192452],
       [0.13054579, 0.04723222, 0.22070139],
       [0.60528888, 0.52850102, 0.64617273]])

In [15]:
# Assign every account to its nearest cluster center, then store the label
# alongside the aggregated features.
cluster_labels = model.predict(df3)
df2['分组结果'] = cluster_labels

# Display the frame with the new label column.
df2

Unnamed: 0_level_0,交易类型,金额,账户余额,分组结果
账户编号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,138,202059.5,20525.5,0
2,169,1125788.3,69302.0,1
3,25,89529.9,30073.7,2
4,117,229254.1,30623.5,0
5,18,44340.7,29220.1,2
...,...,...,...,...
11333,177,1915318.6,130693.8,3
11349,181,2214876.1,113678.7,3
11359,192,1413812.4,68376.4,1
11362,220,858293.2,56055.6,1


In [18]:
import matplotlib.pyplot as plt
import seaborn as sns
import mpl_toolkits.mplot3d as p3d

# 使用魔法指令，指定本次绘图时，将图形画到matplotlib的专门绘图窗口中
%matplotlib auto

#
plt.rcParams['font.sans-serif'] = ['Simhei']
plt.rcParams['axes.unicode_minus'] = False
# 绘制三维图形
fig = plt.figure(figsize=(10,10))
ax = p3d.Axes3D(fig)

# 指定三个坐标轴的数据内容
x = df2['交易类型']
y = df2['金额']
z = df2['账户余额']

#指定三个坐标的数据内容
ax.set_xlabel("交易次数")
ax.set_ylabel("总交易额")
ax.set_zlabel("账户余额")

# 根据“分组结果“创建一个列表，根据每条数据的分组编号设定其显示颜色
colors= ['blue','red','green','yellow']
c = [colors[i] for i in df2['分组结果']]

ax.scatter(x,y,z,color=c)

plt.show()


Using matplotlib backend: MacOSX
