In [72]:
import pandas as pd  
import numpy as np 
from collections import Counter

# 加载数据

In [73]:
# Read the semicolon-delimited bank CSV into a DataFrame.
data = pd.read_csv(
    'bank.csv',
    sep=';',
)

# Column names kept for reference; this list is not read by the cells shown
# here. NOTE(review): presumably the categorical columns of the data set --
# confirm against the CSV header.
columns = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'poutcome',
    'response',
]

# 统计job marital education变量

In [74]:
# NOTE: this cell rebinds `data` with extra columns, so re-running it without
# first reloading `data` fails (DataFrame.join refuses overlapping column
# names when no suffix is given).

## job: one-hot encode the job category and attach the indicator columns.
# dtype=int keeps the indicators as 0/1 integers. pandas >= 2.0 would
# otherwise return booleans, and bool + bool is a logical OR, not a sum.
job_indicators = pd.get_dummies(data['job'], prefix='job', dtype=int)
data = data.join(job_indicators)

# Collapse the individual job indicators into two coarse occupation flags.
data['whitecollar'] = (
    data['job_management']
    + data['job_entrepreneur']
    + data['job_self-employed']
)
data['bluecollar'] = (
    data['job_blue-collar']
    + data['job_services']
    + data['job_technician']
    + data['job_housemaid']
)

## marital: one indicator column per marital status (no prefix, so the new
## columns are named after the category values themselves).
marital_indicators = pd.get_dummies(data['marital'], dtype=int)
data = data.join(marital_indicators)

## education: one indicator column per education level (no prefix).
education_indicators = pd.get_dummies(data['education'], dtype=int)
data = data.join(education_indicators)


# 选择用于聚类分析的数据

In [78]:
# Keep only the rows whose `previous` value is 0.
bank_selected = data[data['previous'] == 0]

# Features fed to the clustering step: age plus the 0/1 indicator columns.
clustering_features = [
    'age',
    'whitecollar', 'bluecollar',
    'divorced', 'married',
    'primary', 'secondary', 'tertiary',
]

# Select by label so that a misspelled or missing column raises KeyError
# instead of silently becoming an all-NaN column.
data_for_clustering = bank_selected[clustering_features]

# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its
# replacement and returns the same 2-D NumPy array.
# NOTE(review): age is on a much larger scale than the 0/1 indicators, so a
# distance-based clustering will be driven mostly by age -- consider
# standardising the features before clustering.
data_for_clustering_matrix = data_for_clustering.to_numpy()
data_for_clustering_matrix

array([[30,  0,  0, ...,  1,  0,  0],
       [30,  1,  0, ...,  0,  0,  1],
       [59,  0,  1, ...,  0,  1,  0],
       ...,
       [33,  0,  1, ...,  0,  1,  0],
       [57,  1,  0, ...,  0,  0,  1],
       [57,  0,  1, ...,  0,  1,  0]], dtype=int64)

In [79]:
# Build a frame from the selected rows that holds the `response` column next
# to the same feature columns used for clustering.
bankfull = pd.DataFrame(
    bank_selected,
    columns=[
        'response',
        'age',
        'whitecollar',
        'bluecollar',
        'divorced',
        'married',
        'primary',
        'secondary',
        'tertiary',
    ],
)

# K-Means算法聚类

In [80]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

### 选取不同的K值进行聚类查看效果

In [81]:

## Try every K from 2 through 20: fit K-Means on the feature matrix and
## print the mean silhouette score of the resulting labelling for each K.
for k in range(2, 21):
    print("k: " + str(k))
    # The fixed random_state makes the centroid initialisation repeatable.
    y_pred = KMeans(
        n_clusters=k,
        random_state=9999,
    ).fit_predict(data_for_clustering_matrix)
    silhouette_avg = silhouette_score(
        data_for_clustering_matrix,
        y_pred,
    )
    print(silhouette_avg)

k: 2
0.601534349147598
k: 3
0.5299715327937059
k: 4
0.4713176700211876
k: 5
0.47726254626141335
k: 6
0.4664339177454313
k: 7
0.44494542752512234
k: 8
0.42324257583348124
k: 9
0.40344867023527364
k: 10
0.37844375945020015
k: 11
0.3789253996697508
k: 12
0.35099118238440274
k: 13
0.34618101513667154
k: 14
0.31782823152947676
k: 15
0.30344649329597756
k: 16
0.30838392737406234
k: 17
0.29564932476744477
k: 18
0.2829007728090323
k: 19
0.2731624736581588
k: 20
0.2843265661311533


### 选取k=2进行聚类

In [83]:
# Fit the final two-cluster model (same seed as the K scan) and record each
# row's cluster label next to its response and features.
y_predict = KMeans(n_clusters=2, random_state=9999).fit_predict(data_for_clustering_matrix)
bankfull['cluster'] = y_predict

# Preview only the first rows instead of rendering the whole frame.
bankfull.head(10)

Unnamed: 0,response,age,whitecollar,bluecollar,divorced,married,primary,secondary,tertiary,cluster
0,no,30,0,0,0,1,1,0,0,0
3,no,30,1,0,0,1,0,0,1,0
4,no,59,0,1,0,1,0,1,0,1
7,no,39,0,1,0,1,0,1,0,0
8,no,41,1,0,0,1,0,0,1,0
10,no,39,0,1,0,1,0,1,0,0
11,no,43,0,0,0,1,0,1,0,0
12,no,36,0,1,0,1,0,0,1,0
13,yes,20,0,0,0,0,0,1,0,0
15,no,40,1,0,0,1,0,0,1,0


In [84]:
# Group the labelled rows by their assigned cluster and show per-cluster
# summary statistics (count, mean, std, quartiles, ...) for each column.
segments = bankfull.groupby(by='cluster')
segments.describe()

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,bluecollar,bluecollar,...,tertiary,tertiary,whitecollar,whitecollar,whitecollar,whitecollar,whitecollar,whitecollar,whitecollar,whitecollar
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,2303.0,34.293964,5.064226,19.0,31.0,34.0,38.0,43.0,2303.0,0.531481,...,1.0,1.0,2303.0,0.298741,0.457805,0.0,0.0,0.0,1.0,1.0
1,1402.0,52.236805,6.489732,44.0,47.0,51.0,56.0,87.0,1402.0,0.46077,...,0.0,1.0,1402.0,0.28174,0.450008,0.0,0.0,0.0,1.0,1.0
